

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
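
/*
 * ztrsm_kernel_LT_2x2_sse.S: complex TRSM micro-kernel for 32-bit x86
 * using SSE packed single-precision arithmetic with 2x2 register
 * blocking (two rows of A and two columns of B per pass). Exactly one
 * of LN, LT, RN, RT (and optionally CONJ) is defined at build time;
 * the #ifdef blocks below select the corresponding left/right,
 * non-transposed/transposed triangular-solve variant.
 */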
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
#define STACK_A 24 + STACK + ARGS(%esi)
#define STACK_B 28 + STACK + ARGS(%esi)
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)

#define POSINV 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 48(%esp)
#define KK 52(%esp)
#define KKK 56(%esp)
#define AORIG 60(%esp)
#define BORIG 64(%esp)
#define BUFFER 128(%esp)

#define B %edi
#define LDC %ebp
#define AA %edx
#define BB %ecx
#define CO1 %esi

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#endif

#if defined(PENTIUM4) || defined(PENTIUMM)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 168
#define PREFETCHW prefetcht0
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 168
#define PREFETCHW prefetcht0
#endif

#if defined(OPTERON) || !defined(HAVE_SSE2)
#define movsd movlps
#endif

#ifdef HAVE_SSE2
#define xorps pxor
#endif
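
/*
 * KERNEL1..KERNEL8 each perform one k-iteration of the 2x2 complex
 * rank-1 update; the main loop unrolls eight of them per pass.
 * xmm0/xmm1 hold packed A (two complex elements: a0r a0i a1r a1i),
 * while xmm2/xmm3 hold B scalars that the packing loop broadcast
 * 4-wide into BUFFER. The accumulators are xmm4 (+= A * b0r) and
 * xmm5 (+= A * b0i) for column 0, xmm6/xmm7 likewise for column 1;
 * real and imaginary parts are combined only after the k-loop.
 */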
#define KERNEL1(address) \
        mulps %xmm0, %xmm2; \
        PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
        addps %xmm2, %xmm4; \
        movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm2, %xmm6; \
        movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm0, %xmm7; \
        movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm3, %xmm6; \
        movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm0, %xmm7; \
        movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm2, %xmm6; \
        movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm0, %xmm7; \
        movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm3, %xmm6; \
        movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm0, %xmm7; \
        movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm2, %xmm6; \
        movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm1, %xmm7; \
        movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm3, %xmm6; \
        movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm1, %xmm7; \
        movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm2, %xmm6; \
        movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm1, %xmm7; \
        movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm3, %xmm6; \
        movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm1, %xmm7; \
        movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
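
/*
 * Prologue: save the callee-saved registers, carve out the local
 * frame plus BUFFER, align the stack down to STACK_ALIGN (4 KiB) and
 * add back STACK_OFFSET so the frame sits at a fixed offset within
 * the page; STACK_TOUCHING then pre-touches it. The incoming
 * arguments are still addressed through the caller's stack pointer
 * saved in %esi.
 */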
PROLOGUE

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

PROFCODE

        movl %esp, %esi # save old stack
        subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
        andl $-STACK_ALIGN, %esp # align stack
        addl $STACK_OFFSET, %esp

        STACK_TOUCHING

        movl STACK_M, %ebx
        movl STACK_N, %eax
        movl STACK_K, %ecx
        movl STACK_A, %edx
        movl %ebx, M
        movl %eax, N
        movl %ecx, K
        movl %edx, A
        movl %esi, OLD_STACK
        movl STACK_B, %edi
        movl STACK_C, %ebx
        movss STACK_OFFT, %xmm4
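
/*
 * Build the POSINV sign mask: pcmpeqb/pslld $31 leave 0x80000000 in
 * every lane of xmm7. Depending on CONJ, the sign bit is stored to
 * the even (real) or odd (imaginary) lanes of POSINV; xorps against
 * this mask later supplies the minus sign of the complex multiply
 * and the conjugation.
 */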
        xorps %xmm7, %xmm7
        pcmpeqb %xmm7, %xmm7
        pslld $31, %xmm7
        xorps %xmm2, %xmm2

#ifndef CONJ
        movss %xmm7, 0 + POSINV
        movss %xmm2, 4 + POSINV
        movss %xmm7, 8 + POSINV
        movss %xmm2, 12 + POSINV
#else
        movss %xmm2, 0 + POSINV
        movss %xmm7, 4 + POSINV
        movss %xmm2, 8 + POSINV
        movss %xmm7, 12 + POSINV
#endif

        EMMS

        movl %ebx, C
        movl STACK_LDC, LDC
        movss %xmm4, OFFSET
        movss %xmm4, KK
        sall $ZBASE_SHIFT, LDC

#ifdef LN
        movl M, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, C
        imull K, %eax
        addl %eax, A
#endif

#ifdef RT
        movl N, %eax
        sall $ZBASE_SHIFT, %eax
        imull K, %eax
        addl %eax, B
        movl N, %eax
        imull LDC, %eax
        addl %eax, C
#endif

#ifdef RN
        negl KK
#endif

#ifdef RT
        movl N, %eax
        subl OFFSET, %eax
        movl %eax, KK
#endif

        movl N, %eax
        movl %eax, J
        sarl $1, J
        jle .L100
        ALIGN_4
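
/*
 * .L01: outer loop over pairs of columns (J = N >> 1). Each pass
 * first repacks the current panel of B into BUFFER, then sweeps the
 * M dimension two rows at a time (.L10) with a one-row tail (.L40).
 */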
.L01:
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif

        leal BUFFER, %ecx

#ifdef RT
        movl K, %eax
        sall $1 + ZBASE_SHIFT, %eax
        subl %eax, B
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl B, BORIG
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, B
        leal (BB, %eax, 4), BB
#endif

#if defined(LT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $1, %eax
        jle .L03
        ALIGN_4
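
/*
 * .L02: pack two k-steps of B per iteration. Each 4-float movaps
 * loads two complex B entries; pshufd broadcasts every scalar (real
 * and imaginary part separately) into a full 4-lane vector stored
 * to BUFFER, so the kernel can consume B with plain movaps/mulps
 * and no shuffles. .L04 handles an odd k.
 */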
.L02:
        movaps 0 * SIZE(B), %xmm3
        movaps 4 * SIZE(B), %xmm7

        pshufd $0x00, %xmm3, %xmm0
        pshufd $0x55, %xmm3, %xmm1
        pshufd $0xaa, %xmm3, %xmm2
        pshufd $0xff, %xmm3, %xmm3

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm2, 8 * SIZE(BB)
        movaps %xmm3, 12 * SIZE(BB)

        pshufd $0x00, %xmm7, %xmm4
        pshufd $0x55, %xmm7, %xmm5
        pshufd $0xaa, %xmm7, %xmm6
        pshufd $0xff, %xmm7, %xmm7

        movaps %xmm4, 16 * SIZE(BB)
        movaps %xmm5, 20 * SIZE(BB)
        movaps %xmm6, 24 * SIZE(BB)
        movaps %xmm7, 28 * SIZE(BB)

        addl $ 8 * SIZE, B
        addl $32 * SIZE, BB
        decl %eax
        jne .L02
        ALIGN_4

.L03:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $1, %eax
        BRANCH
        jle .L05
        ALIGN_4

.L04:
        movaps 0 * SIZE(B), %xmm3

        pshufd $0x00, %xmm3, %xmm0
        pshufd $0x55, %xmm3, %xmm1
        pshufd $0xaa, %xmm3, %xmm2
        pshufd $0xff, %xmm3, %xmm3

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm2, 8 * SIZE(BB)
        movaps %xmm3, 12 * SIZE(BB)

        addl $ 4 * SIZE, B
        ALIGN_4

.L05:
#if defined(LT) || defined(RN)
        movl A, %eax
        movl %eax, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif

        leal (, LDC, 2), %eax

#ifdef RT
        subl %eax, C
#endif
        movl C, CO1
#ifndef RT
        addl %eax, C
#endif

        movl M, %ebx
        sarl $1, %ebx
        jle .L30
        ALIGN_4

.L10:
#ifdef LN
        movl K, %eax
        sall $1 + ZBASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl AORIG, %eax
        movl %eax, AA
        movl KK, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, AA
#endif

        leal BUFFER, BB # boffset1 = boffset

#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $3 + ZBASE_SHIFT, %eax
        addl %eax, BB
#endif

        movaps 0 * SIZE(AA), %xmm0
        xorps %xmm4, %xmm4
        movaps 16 * SIZE(AA), %xmm1
        xorps %xmm5, %xmm5
        movaps 0 * SIZE(BB), %xmm2
        xorps %xmm6, %xmm6
        movaps 16 * SIZE(BB), %xmm3
        xorps %xmm7, %xmm7

        PREFETCHW 3 * SIZE(CO1)
        PREFETCHW 3 * SIZE(CO1, LDC)

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L15
        ALIGN_4
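
/*
 * .L11: main GEMM loop, eight k-iterations per pass through the
 * KERNEL1..8 macros; .L13 mops up the k & 7 remainder one iteration
 * at a time.
 */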
.L11:
        KERNEL1(0 * 16)
        KERNEL2(0 * 16)
        KERNEL3(0 * 16)
        KERNEL4(0 * 16)
        KERNEL5(0 * 16)
        KERNEL6(0 * 16)
        KERNEL7(0 * 16)
        KERNEL8(0 * 16)

        addl $ 32 * SIZE, AA
        addl $128 * SIZE, BB
        decl %eax
        jne .L11
        ALIGN_4

.L15:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax # k & 7 remainder
        BRANCH
        je .L14
        ALIGN_4

.L13:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 12 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 16 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movaps 4 * SIZE(AA), %xmm0

        addl $ 4 * SIZE, AA
        addl $16 * SIZE, BB
        decl %eax
        jg .L13
        ALIGN_4
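
/*
 * .L14: fold the partial products into complex results. shufps $0xb1
 * swaps the real/imaginary lanes of the b-imaginary products, xorps
 * with POSINV flips the sign of the proper lane (which lane depends
 * on CONJ and on the solve side), and addps leaves xmm4/xmm6 holding
 * (ar*br - ai*bi, ar*bi + ai*br) for columns 0 and 1. The update is
 * then subtracted from the stored right-hand side.
 */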
.L14:
        movaps POSINV, %xmm0

        shufps $0xb1, %xmm5, %xmm5
        shufps $0xb1, %xmm7, %xmm7

#if defined(LN) || defined(LT)
#ifndef CONJ
        xorps %xmm0, %xmm5
        xorps %xmm0, %xmm7
#else
        xorps %xmm0, %xmm4
        xorps %xmm0, %xmm6
#endif
#else
        xorps %xmm0, %xmm5
        xorps %xmm0, %xmm7
#endif

        addps %xmm5, %xmm4
        addps %xmm7, %xmm6

#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $2, %eax
#endif
        movl AORIG, AA
        movl BORIG, B
        leal BUFFER, BB
        sall $ZBASE_SHIFT, %eax
        leal (AA, %eax, 2), AA
        leal (B, %eax, 2), B
        leal (BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
        movaps %xmm4, %xmm5
        unpcklpd %xmm6, %xmm4
        unpckhpd %xmm6, %xmm5

        movaps 0 * SIZE(B), %xmm2
        movaps 4 * SIZE(B), %xmm3

        subps %xmm4, %xmm2
        subps %xmm5, %xmm3
#else
        movaps 0 * SIZE(AA), %xmm1
        movaps 4 * SIZE(AA), %xmm5
        subps %xmm4, %xmm1
        subps %xmm6, %xmm5
#endif
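
/*
 * Triangular solve with the 2x2 diagonal block, one variant per
 * LN/LT/RN/RT. pshufd $0x44/$0x11 broadcast a complex entry of the
 * triangular factor in natural and swapped lane order, pshufd
 * $0xa0/$0xf5 duplicate the real/imaginary parts of the partial
 * solution, and the xorps/mulps/addps (or subps) sequence performs
 * the complex multiply-accumulate. No division appears: the diagonal
 * entries are assumed to be stored pre-inverted by the TRSM pack
 * routines (the usual GotoBLAS convention, not stated in this file),
 * so dividing by the diagonal becomes a complex multiplication.
 * Results are written back to the packed buffer (B or AA) and to C.
 */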
#ifdef LN
        movaps 4 * SIZE(AA), %xmm5

        pshufd $0xee, %xmm5, %xmm6
        pshufd $0xbb, %xmm5, %xmm7

        pshufd $0xa0, %xmm3, %xmm4
        pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
        xorps %xmm0, %xmm3
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm3
        addps %xmm4, %xmm3

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm3, %xmm4
        pshufd $0xf5, %xmm3, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm1
        subps %xmm4, %xmm2
        subps %xmm1, %xmm2

        movaps 0 * SIZE(AA), %xmm5

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm2
        addps %xmm4, %xmm2
#endif

#ifdef LT
        movaps 0 * SIZE(AA), %xmm5

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm2
        addps %xmm4, %xmm2

        pshufd $0xee, %xmm5, %xmm6
        pshufd $0xbb, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm1
        subps %xmm4, %xmm3
        subps %xmm1, %xmm3

        movaps 4 * SIZE(AA), %xmm5

        pshufd $0xee, %xmm5, %xmm6
        pshufd $0xbb, %xmm5, %xmm7

        pshufd $0xa0, %xmm3, %xmm4
        pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
        xorps %xmm0, %xmm3
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm3
        addps %xmm4, %xmm3
#endif

#ifdef RN
        movaps 0 * SIZE(B), %xmm4

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm1
        addps %xmm3, %xmm1

        pshufd $0xee, %xmm4, %xmm6
        pshufd $0xbb, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm2
        subps %xmm3, %xmm5
        subps %xmm2, %xmm5

        movaps 4 * SIZE(B), %xmm4

        pshufd $0xee, %xmm4, %xmm6
        pshufd $0xbb, %xmm4, %xmm7

        pshufd $0xa0, %xmm5, %xmm3
        pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
        xorps %xmm0, %xmm5
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm5
        addps %xmm3, %xmm5
#endif

#ifdef RT
        movaps 4 * SIZE(B), %xmm4

        pshufd $0xee, %xmm4, %xmm6
        pshufd $0xbb, %xmm4, %xmm7

        pshufd $0xa0, %xmm5, %xmm3
        pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
        xorps %xmm0, %xmm5
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm5
        addps %xmm3, %xmm5

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm5, %xmm3
        pshufd $0xf5, %xmm5, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm2
        subps %xmm3, %xmm1
        subps %xmm2, %xmm1

        movaps 0 * SIZE(B), %xmm4

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm1
        addps %xmm3, %xmm1
#endif

#ifdef LN
        subl $4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        movaps %xmm2, 0 * SIZE(B)
        movaps %xmm3, 4 * SIZE(B)

        pshufd $0x00, %xmm2, %xmm0
        pshufd $0x55, %xmm2, %xmm1
        pshufd $0xaa, %xmm2, %xmm4
        pshufd $0xff, %xmm2, %xmm5
        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm4, 8 * SIZE(BB)
        movaps %xmm5, 12 * SIZE(BB)

        pshufd $0x00, %xmm3, %xmm0
        pshufd $0x55, %xmm3, %xmm1
        pshufd $0xaa, %xmm3, %xmm4
        pshufd $0xff, %xmm3, %xmm5
        movaps %xmm0, 16 * SIZE(BB)
        movaps %xmm1, 20 * SIZE(BB)
        movaps %xmm4, 24 * SIZE(BB)
        movaps %xmm5, 28 * SIZE(BB)

        movlps %xmm2, 0 * SIZE(CO1)
        movlps %xmm3, 2 * SIZE(CO1)
        movhps %xmm2, 0 * SIZE(CO1, LDC)
        movhps %xmm3, 2 * SIZE(CO1, LDC)
#else
        movaps %xmm1, 0 * SIZE(AA)
        movaps %xmm5, 4 * SIZE(AA)

        movlps %xmm1, 0 * SIZE(CO1)
        movhps %xmm1, 2 * SIZE(CO1)
        movlps %xmm5, 0 * SIZE(CO1, LDC)
        movhps %xmm5, 2 * SIZE(CO1, LDC)
#endif

#ifndef LN
        addl $4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, AA
#ifdef LT
        addl $8 * SIZE, B
#endif
#endif

#ifdef LN
        subl $2, KK
        movl BORIG, B
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        movl BORIG, B
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, AORIG
#endif

        decl %ebx
        jg .L10
        ALIGN_4
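
/*
 * .L30/.L40: tail for odd M. Same flow as .L10 with a single row, so
 * A is loaded with movsd (two floats, one complex element). On builds
 * where movsd is redefined to movlps, the load merges into the low
 * half of the register only; the xorps under "#ifdef movsd" clears
 * the destination first so the upper lanes hold zeros rather than
 * stale data.
 */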
.L30:
        movl M, %ebx
        andl $1, %ebx
        jle .L99
        ALIGN_4

.L40:
#ifdef LN
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl AORIG, %eax
        movl %eax, AA
        movl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
#endif

        leal BUFFER, BB # boffset1 = boffset

#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $3 + ZBASE_SHIFT, %eax
        addl %eax, BB
#endif

        xorps %xmm4, %xmm4
        xorps %xmm5, %xmm5
        xorps %xmm6, %xmm6
        xorps %xmm7, %xmm7

#ifdef movsd
        xorps %xmm0, %xmm0
#endif
        movsd 0 * SIZE(AA), %xmm0
#ifdef movsd
        xorps %xmm1, %xmm1
#endif
        movsd 8 * SIZE(AA), %xmm1
        movaps 0 * SIZE(BB), %xmm2
        movaps 16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L42
        ALIGN_4

.L41:
        mulps %xmm0, %xmm2
        prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA)
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 12 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 32 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movsd 2 * SIZE(AA), %xmm0
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 20 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm5
        movaps 24 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        mulps 28 * SIZE(BB), %xmm0
        addps %xmm3, %xmm6
        movaps 48 * SIZE(BB), %xmm3
        addps %xmm0, %xmm7
        movsd 4 * SIZE(AA), %xmm0
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 36 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 40 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 44 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 64 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movsd 6 * SIZE(AA), %xmm0
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 52 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm5
        movaps 56 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        mulps 60 * SIZE(BB), %xmm0
        addps %xmm3, %xmm6
        movaps 80 * SIZE(BB), %xmm3
        addps %xmm0, %xmm7
        movsd 16 * SIZE(AA), %xmm0
        mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
        prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
        addps %xmm2, %xmm4
        movaps 68 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm5
        movaps 72 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        mulps 76 * SIZE(BB), %xmm1
        addps %xmm2, %xmm6
        movaps 96 * SIZE(BB), %xmm2
        addps %xmm1, %xmm7
        movsd 10 * SIZE(AA), %xmm1
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 84 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm5
        movaps 88 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        mulps 92 * SIZE(BB), %xmm1
        addps %xmm3, %xmm6
        movaps 112 * SIZE(BB), %xmm3
        addps %xmm1, %xmm7
        movsd 12 * SIZE(AA), %xmm1
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm4
        movaps 100 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm5
        movaps 104 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        mulps 108 * SIZE(BB), %xmm1
        addps %xmm2, %xmm6
        movaps 128 * SIZE(BB), %xmm2
        addps %xmm1, %xmm7
        movsd 14 * SIZE(AA), %xmm1
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 116 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm5
        movaps 120 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        mulps 124 * SIZE(BB), %xmm1
        addps %xmm3, %xmm6
        movaps 144 * SIZE(BB), %xmm3
        addps %xmm1, %xmm7
        movsd 24 * SIZE(AA), %xmm1

        addl $ 16 * SIZE, AA
        addl $128 * SIZE, BB
        decl %eax
        jne .L41
        ALIGN_4

.L42:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax # k & 7 remainder
        BRANCH
        je .L44
        ALIGN_4

.L43:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 12 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 16 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movsd 2 * SIZE(AA), %xmm0

        addl $ 2 * SIZE, AA
        addl $16 * SIZE, BB
        decl %eax
        jg .L43
        ALIGN_4

.L44:
        movaps POSINV, %xmm0

        shufps $0xb1, %xmm5, %xmm5
        shufps $0xb1, %xmm7, %xmm7

#if defined(LN) || defined(LT)
#ifndef CONJ
        xorps %xmm0, %xmm5
        xorps %xmm0, %xmm7
#else
        xorps %xmm0, %xmm4
        xorps %xmm0, %xmm6
#endif
#else
        xorps %xmm0, %xmm5
        xorps %xmm0, %xmm7
#endif

        addps %xmm5, %xmm4
        addps %xmm7, %xmm6

#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $2, %eax
#endif
        movl AORIG, AA
        movl BORIG, B
        leal BUFFER, BB
        sall $ZBASE_SHIFT, %eax
        leal (AA, %eax, 1), AA
        leal (B, %eax, 2), B
        leal (BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
        unpcklpd %xmm6, %xmm4

        movaps 0 * SIZE(B), %xmm2

        subps %xmm4, %xmm2
#else
#ifdef movsd
        xorps %xmm1, %xmm1
#endif
        movsd 0 * SIZE(AA), %xmm1
#ifdef movsd
        xorps %xmm5, %xmm5
#endif
        movsd 2 * SIZE(AA), %xmm5
        subps %xmm4, %xmm1
        subps %xmm6, %xmm5
#endif

#if defined(LN) || defined(LT)
        movaps 0 * SIZE(AA), %xmm5

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm2
        addps %xmm4, %xmm2
#endif

#ifdef RN
        movaps 0 * SIZE(B), %xmm4

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm1
        addps %xmm3, %xmm1

        pshufd $0xee, %xmm4, %xmm6
        pshufd $0xbb, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm2
        subps %xmm3, %xmm5
        subps %xmm2, %xmm5

        movaps 4 * SIZE(B), %xmm4

        pshufd $0xee, %xmm4, %xmm6
        pshufd $0xbb, %xmm4, %xmm7

        pshufd $0xa0, %xmm5, %xmm3
        pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
        xorps %xmm0, %xmm5
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm5
        addps %xmm3, %xmm5
#endif

#ifdef RT
        movaps 4 * SIZE(B), %xmm4

        pshufd $0xee, %xmm4, %xmm6
        pshufd $0xbb, %xmm4, %xmm7

        pshufd $0xa0, %xmm5, %xmm3
        pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
        xorps %xmm0, %xmm5
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm5
        addps %xmm3, %xmm5

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm5, %xmm3
        pshufd $0xf5, %xmm5, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm2
        subps %xmm3, %xmm1
        subps %xmm2, %xmm1

        movaps 0 * SIZE(B), %xmm4

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm1
        addps %xmm3, %xmm1
#endif

#ifdef LN
        subl $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        movaps %xmm2, 0 * SIZE(B)

        pshufd $0x00, %xmm2, %xmm0
        pshufd $0x55, %xmm2, %xmm1
        pshufd $0xaa, %xmm2, %xmm4
        pshufd $0xff, %xmm2, %xmm5
        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm4, 8 * SIZE(BB)
        movaps %xmm5, 12 * SIZE(BB)

        movlps %xmm2, 0 * SIZE(CO1)
        movhps %xmm2, 0 * SIZE(CO1, LDC)
#else
        movlps %xmm1, 0 * SIZE(AA)
        movlps %xmm5, 2 * SIZE(AA)

        movlps %xmm1, 0 * SIZE(CO1)
        movlps %xmm5, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
        addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
#ifdef LT
        addl $4 * SIZE, B
#endif
#endif

#ifdef LN
        subl $1, KK
        movl BORIG, B
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        movl BORIG, B
        sall $ZBASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4

.L99:
#ifdef LN
        movl K, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, B
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, B
#endif

#ifdef RN
        addl $2, KK
#endif
#ifdef RT
        subl $2, KK
#endif

        decl J # j --
        jg .L01
        ALIGN_4
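
/*
 * .L100/.L101: tail for odd N. A single column remains, so C advances
 * by one LDC per pass, only one pair of accumulators is combined, and
 * the packed-B stride halves (hence the sall $2 + ZBASE_SHIFT when
 * rewinding BUFFER, against $3 + ZBASE_SHIFT in the two-column path).
 */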
.L100:
        movl N, %eax
        andl $1, %eax
        jle .L999
        ALIGN_4

.L101:
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif

        leal BUFFER, %ecx

#ifdef RT
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        subl %eax, B
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl B, BORIG
        sall $ZBASE_SHIFT, %eax
        addl %eax, B
        leal (BB, %eax, 4), BB
#endif

#if defined(LT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $2, %eax
        jle .L103
        ALIGN_4

.L102:
        movaps 0 * SIZE(B), %xmm3
        movaps 4 * SIZE(B), %xmm7

        pshufd $0x00, %xmm3, %xmm0
        pshufd $0x55, %xmm3, %xmm1
        pshufd $0xaa, %xmm3, %xmm2
        pshufd $0xff, %xmm3, %xmm3

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm2, 8 * SIZE(BB)
        movaps %xmm3, 12 * SIZE(BB)

        pshufd $0x00, %xmm7, %xmm4
        pshufd $0x55, %xmm7, %xmm5
        pshufd $0xaa, %xmm7, %xmm6
        pshufd $0xff, %xmm7, %xmm7

        movaps %xmm4, 16 * SIZE(BB)
        movaps %xmm5, 20 * SIZE(BB)
        movaps %xmm6, 24 * SIZE(BB)
        movaps %xmm7, 28 * SIZE(BB)

        addl $ 8 * SIZE, B
        addl $32 * SIZE, BB
        decl %eax
        jne .L102
        ALIGN_4

.L103:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $3, %eax
        BRANCH
        jle .L105
        ALIGN_4

.L104:
#ifdef movsd
        xorps %xmm3, %xmm3
#endif
        movsd 0 * SIZE(B), %xmm3

        pshufd $0x00, %xmm3, %xmm0
        pshufd $0x55, %xmm3, %xmm1

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)

        addl $ 2 * SIZE, %edi
        addl $ 8 * SIZE, %ecx
        decl %eax
        jne .L104
        ALIGN_4

.L105:
#if defined(LT) || defined(RN)
        movl A, %eax
        movl %eax, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif

#ifdef RT
        subl LDC, C
#endif
        movl C, CO1
#ifndef RT
        addl LDC, C
#endif

        movl M, %ebx
        sarl $1, %ebx
        jle .L130
        ALIGN_4

.L110:
#ifdef LN
        movl K, %eax
        sall $1 + ZBASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl AORIG, %eax
        movl %eax, AA
        movl KK, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, AA
#endif

        leal BUFFER, BB # boffset1 = boffset

#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $2 + ZBASE_SHIFT, %eax
        addl %eax, BB
#endif

        xorps %xmm4, %xmm4
        xorps %xmm5, %xmm5
        xorps %xmm6, %xmm6
        xorps %xmm7, %xmm7

        movaps 0 * SIZE(AA), %xmm0
        movaps 16 * SIZE(AA), %xmm1
        movaps 0 * SIZE(BB), %xmm2
        movaps 16 * SIZE(BB), %xmm3

        PREFETCHW 3 * SIZE(CO1)

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L112
        ALIGN_4

.L111:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movaps 4 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm6
        movaps 12 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movaps 8 * SIZE(AA), %xmm0
        addps %xmm2, %xmm7
        movaps 32 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 20 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movaps 12 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        movaps 24 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm6
        movaps 28 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movaps 32 * SIZE(AA), %xmm0
        addps %xmm3, %xmm7
        movaps 48 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm4
        movaps 36 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movaps 20 * SIZE(AA), %xmm1
        addps %xmm2, %xmm5
        movaps 40 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm6
        movaps 44 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movaps 24 * SIZE(AA), %xmm1
        addps %xmm2, %xmm7
        movaps 64 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 52 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movaps 28 * SIZE(AA), %xmm1
        addps %xmm3, %xmm5
        movaps 56 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm6
        movaps 60 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movaps 48 * SIZE(AA), %xmm1
        addps %xmm3, %xmm7
        movaps 80 * SIZE(BB), %xmm3

        addl $ 32 * SIZE, AA
        addl $ 64 * SIZE, BB
        decl %eax
        jne .L111
        ALIGN_4

.L112:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax # k & 7 remainder
        BRANCH
        je .L114
        ALIGN_4

.L113:
        mulps %xmm0, %xmm2
        mulps 4 * SIZE(BB), %xmm0
        addps %xmm2, %xmm4
        movaps 8 * SIZE(BB), %xmm2
        addps %xmm0, %xmm5
        movaps 4 * SIZE(AA), %xmm0

        addl $ 4 * SIZE, AA
        addl $ 8 * SIZE, BB
        decl %eax
        jg .L113
        ALIGN_4

.L114:
        addps %xmm6, %xmm4
        addps %xmm7, %xmm5

        movaps POSINV, %xmm0

        shufps $0xb1, %xmm5, %xmm5

#if defined(LN) || defined(LT)
#ifndef CONJ
        xorps %xmm0, %xmm5
#else
        xorps %xmm0, %xmm4
#endif
#else
        xorps %xmm0, %xmm5
#endif

        addps %xmm5, %xmm4

#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $1, %eax
#endif
        movl AORIG, AA
        movl BORIG, B
        leal BUFFER, BB
        sall $ZBASE_SHIFT, %eax
        leal (AA, %eax, 2), AA
        leal (B, %eax, 1), B
        leal (BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
        movaps %xmm4, %xmm5
        unpcklpd %xmm6, %xmm4
        unpckhpd %xmm6, %xmm5

#ifdef movsd
        xorps %xmm2, %xmm2
#endif
        movsd 0 * SIZE(B), %xmm2
#ifdef movsd
        xorps %xmm3, %xmm3
#endif
        movsd 2 * SIZE(B), %xmm3

        subps %xmm4, %xmm2
        subps %xmm5, %xmm3
#else
        movaps 0 * SIZE(AA), %xmm1
        subps %xmm4, %xmm1
#endif

#ifdef LN
        movaps 4 * SIZE(AA), %xmm5

        pshufd $0xee, %xmm5, %xmm6
        pshufd $0xbb, %xmm5, %xmm7

        pshufd $0xa0, %xmm3, %xmm4
        pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
        xorps %xmm0, %xmm3
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm3
        addps %xmm4, %xmm3

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm3, %xmm4
        pshufd $0xf5, %xmm3, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm1
        subps %xmm4, %xmm2
        subps %xmm1, %xmm2

        movaps 0 * SIZE(AA), %xmm5

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm2
        addps %xmm4, %xmm2
#endif

#ifdef LT
        movaps 0 * SIZE(AA), %xmm5

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm2
        addps %xmm4, %xmm2

        pshufd $0xee, %xmm5, %xmm6
        pshufd $0xbb, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm1
        subps %xmm4, %xmm3
        subps %xmm1, %xmm3

        movaps 4 * SIZE(AA), %xmm5

        pshufd $0xee, %xmm5, %xmm6
        pshufd $0xbb, %xmm5, %xmm7

        pshufd $0xa0, %xmm3, %xmm4
        pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
        xorps %xmm0, %xmm3
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm3
        addps %xmm4, %xmm3
#endif

#if defined(RN) || defined(RT)
        movaps 0 * SIZE(B), %xmm4

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm1
        addps %xmm3, %xmm1
#endif

#ifdef LN
        subl $4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        movlps %xmm2, 0 * SIZE(B)
        movlps %xmm3, 2 * SIZE(B)

        pshufd $0x00, %xmm2, %xmm0
        pshufd $0x55, %xmm2, %xmm1
        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)

        pshufd $0x00, %xmm3, %xmm0
        pshufd $0x55, %xmm3, %xmm1
        movaps %xmm0, 8 * SIZE(BB)
        movaps %xmm1, 12 * SIZE(BB)

        movlps %xmm2, 0 * SIZE(CO1)
        movlps %xmm3, 2 * SIZE(CO1)
#else
        movaps %xmm1, 0 * SIZE(AA)

        movlps %xmm1, 0 * SIZE(CO1)
        movhps %xmm1, 2 * SIZE(CO1)
#endif

#ifndef LN
        addl $4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, AA
#ifdef LT
        addl $4 * SIZE, B
#endif
#endif

#ifdef LN
        subl $2, KK
        movl BORIG, B
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        movl BORIG, B
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, AORIG
#endif

        decl %ebx # i --
        jg .L110
        ALIGN_4

.L130:
        movl M, %ebx
        andl $1, %ebx
        jle .L149

#ifdef LN
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl AORIG, %eax
        movl %eax, AA
        movl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
#endif

        leal BUFFER, BB # boffset1 = boffset

#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $2 + ZBASE_SHIFT, %eax
        addl %eax, BB
#endif

#ifdef movsd
        xorps %xmm0, %xmm0
#endif
        movsd 0 * SIZE(AA), %xmm0
        xorps %xmm4, %xmm4
#ifdef movsd
        xorps %xmm1, %xmm1
#endif
        movsd 8 * SIZE(AA), %xmm1
        xorps %xmm5, %xmm5
        movaps 0 * SIZE(BB), %xmm2
        xorps %xmm6, %xmm6
        movaps 16 * SIZE(BB), %xmm3
        xorps %xmm7, %xmm7

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L142
        ALIGN_4

.L141:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movsd 2 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm6
        movaps 12 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movsd 4 * SIZE(AA), %xmm0
        addps %xmm2, %xmm7
        movaps 32 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 20 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movsd 6 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        movaps 24 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm6
        movaps 28 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movsd 16 * SIZE(AA), %xmm0
        addps %xmm3, %xmm7
        movaps 48 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm4
        movaps 36 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movsd 10 * SIZE(AA), %xmm1
        addps %xmm2, %xmm5
        movaps 40 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm6
        movaps 44 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movsd 12 * SIZE(AA), %xmm1
        addps %xmm2, %xmm7
        movaps 64 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 52 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movsd 14 * SIZE(AA), %xmm1
        addps %xmm3, %xmm5
        movaps 56 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm6
        movaps 60 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movsd 24 * SIZE(AA), %xmm1
        addps %xmm3, %xmm7
        movaps 80 * SIZE(BB), %xmm3

        addl $ 16 * SIZE, AA
        addl $ 64 * SIZE, BB
        decl %eax
        jne .L141
        ALIGN_4

.L142:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax # k & 7 remainder
        BRANCH
        je .L144
        ALIGN_4

.L143:
        mulps %xmm0, %xmm2
        mulps 4 * SIZE(BB), %xmm0
        addps %xmm2, %xmm4
        movaps 8 * SIZE(BB), %xmm2
        addps %xmm0, %xmm5
        movsd 2 * SIZE(AA), %xmm0

        addl $2 * SIZE, AA
        addl $8 * SIZE, BB
        decl %eax
        jg .L143
        ALIGN_4

.L144:
        addps %xmm6, %xmm4
        addps %xmm7, %xmm5

        movaps POSINV, %xmm0

        shufps $0xb1, %xmm5, %xmm5

#if defined(LN) || defined(LT)
#ifndef CONJ
        xorps %xmm0, %xmm5
#else
        xorps %xmm0, %xmm4
#endif
#else
        xorps %xmm0, %xmm5
#endif

        addps %xmm5, %xmm4

#if defined(LN) || defined(RT)
        movl KK, %eax
        subl $1, %eax
        movl AORIG, AA
        movl BORIG, B
        leal BUFFER, BB
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
        addl %eax, B
        leal (BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
#ifdef movsd
        xorps %xmm2, %xmm2
#endif
        movsd 0 * SIZE(B), %xmm2
        subps %xmm4, %xmm2
#else
#ifdef movsd
        xorps %xmm1, %xmm1
#endif
        movsd 0 * SIZE(AA), %xmm1
        subps %xmm4, %xmm1
#endif

#if defined(LN) || defined(LT)
        movaps 0 * SIZE(AA), %xmm5

        pshufd $0x44, %xmm5, %xmm6
        pshufd $0x11, %xmm5, %xmm7

        pshufd $0xa0, %xmm2, %xmm4
        pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
        xorps %xmm0, %xmm2
#else
        xorps %xmm0, %xmm4
#endif

        mulps %xmm6, %xmm4
        mulps %xmm7, %xmm2
        addps %xmm4, %xmm2
#endif

#if defined(RN) || defined(RT)
        movaps 0 * SIZE(B), %xmm4

        pshufd $0x44, %xmm4, %xmm6
        pshufd $0x11, %xmm4, %xmm7

        pshufd $0xa0, %xmm1, %xmm3
        pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
        xorps %xmm0, %xmm1
#else
        xorps %xmm0, %xmm3
#endif

        mulps %xmm6, %xmm3
        mulps %xmm7, %xmm1
        addps %xmm3, %xmm1
#endif

#ifdef LN
        subl $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        movlps %xmm2, 0 * SIZE(B)

        pshufd $0x00, %xmm2, %xmm0
        pshufd $0x55, %xmm2, %xmm1
        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)

        movlps %xmm2, 0 * SIZE(CO1)
#else
        movlps %xmm1, 0 * SIZE(AA)
        movlps %xmm1, 0 * SIZE(CO1)
#endif

#ifndef LN
        addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
#ifdef LT
        addl $2 * SIZE, B
#endif
#endif

#ifdef LN
        subl $1, KK
        movl BORIG, B
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        movl BORIG, B
        sall $ZBASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4

.L149:
#ifdef LN
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, B
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, B
#endif

#ifdef RN
        addl $1, KK
#endif
#ifdef RT
        subl $1, KK
#endif
        ALIGN_4
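
/*
 * .L999: common epilogue. Restore the caller's stack pointer from
 * OLD_STACK and pop the callee-saved registers.
 */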
.L999:
        EMMS

        movl OLD_STACK, %esp

        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret

EPILOGUE