
ztrsm_kernel_LN_2x2_sse.S 39 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
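# Overview: x86-32 SSE triangular-solve (TRSM) inner kernel for complex data
# with 2x2 register blocking.  The LN/LT/RN/RT and CONJ preprocessor switches
# select the side/transpose/conjugation variant; POSINV holds the sign mask
# used to flip imaginary parts in the complex multiplies.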
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define STACK_M 4 + STACK + ARGS(%esi)
  43. #define STACK_N 8 + STACK + ARGS(%esi)
  44. #define STACK_K 12 + STACK + ARGS(%esi)
  45. #define STACK_A 24 + STACK + ARGS(%esi)
  46. #define STACK_B 28 + STACK + ARGS(%esi)
  47. #define STACK_C 32 + STACK + ARGS(%esi)
  48. #define STACK_LDC 36 + STACK + ARGS(%esi)
  49. #define STACK_OFFT 40 + STACK + ARGS(%esi)
  50. #define POSINV 0(%esp)
  51. #define K 16(%esp)
  52. #define N 20(%esp)
  53. #define M 24(%esp)
  54. #define A 28(%esp)
  55. #define C 32(%esp)
  56. #define J 36(%esp)
  57. #define OLD_STACK 40(%esp)
  58. #define OFFSET 48(%esp)
  59. #define KK 52(%esp)
  60. #define KKK 56(%esp)
  61. #define AORIG 60(%esp)
  62. #define BORIG 64(%esp)
  63. #define BUFFER 128(%esp)
  64. #define B %edi
  65. #define LDC %ebp
  66. #define AA %edx
  67. #define BB %ecx
  68. #define CO1 %esi
  69. #define STACK_ALIGN 4096
  70. #define STACK_OFFSET 1024
  71. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
  72. #define PREFETCHSIZE (16 * 10 + 8)
  73. #define WPREFETCHSIZE 112
  74. #define PREFETCH prefetch
  75. #define PREFETCHW prefetchw
  76. #endif
  77. #if defined(PENTIUM4) || defined(PENTIUMM)
  78. #define PREFETCH prefetcht1
  79. #define PREFETCHSIZE 168
  80. #define PREFETCHW prefetcht0
  81. #endif
  82. #if defined(PENRYN) || defined(DUNNINGTON)
  83. #define PREFETCH prefetcht1
  84. #define PREFETCHSIZE 168
  85. #define PREFETCHW prefetcht0
  86. #endif
  87. #if defined(OPTERON) || !defined(HAVE_SSE2)
  88. #define movsd movlps
  89. #endif
  90. #ifdef HAVE_SSE2
  91. #define xorps pxor
  92. #endif
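# KERNEL1..KERNEL8 expand one step each of the inner k-loop, unrolled by 8:
# every macro issues four multiply/accumulate pairs against the broadcast B
# panel at (BB) and accumulates the 2x2 complex block in %xmm4..%xmm7.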
  93. #define KERNEL1(address) \
  94. mulps %xmm0, %xmm2; \
  95. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  96. addps %xmm2, %xmm4; \
  97. movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  98. mulps %xmm0, %xmm2; \
  99. addps %xmm2, %xmm5; \
  100. movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  101. mulps %xmm0, %xmm2; \
  102. mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  103. addps %xmm2, %xmm6; \
  104. movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  105. addps %xmm0, %xmm7; \
  106. movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  107. #define KERNEL2(address) \
  108. mulps %xmm0, %xmm3; \
  109. addps %xmm3, %xmm4; \
  110. movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  111. mulps %xmm0, %xmm3; \
  112. addps %xmm3, %xmm5; \
  113. movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  114. mulps %xmm0, %xmm3; \
  115. mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  116. addps %xmm3, %xmm6; \
  117. movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  118. addps %xmm0, %xmm7; \
  119. movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  120. #define KERNEL3(address) \
  121. mulps %xmm0, %xmm2; \
  122. addps %xmm2, %xmm4; \
  123. movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  124. mulps %xmm0, %xmm2; \
  125. addps %xmm2, %xmm5; \
  126. movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  127. mulps %xmm0, %xmm2; \
  128. mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  129. addps %xmm2, %xmm6; \
  130. movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  131. addps %xmm0, %xmm7; \
  132. movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  133. #define KERNEL4(address) \
  134. mulps %xmm0, %xmm3; \
  135. addps %xmm3, %xmm4; \
  136. movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  137. mulps %xmm0, %xmm3; \
  138. addps %xmm3, %xmm5; \
  139. movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  140. mulps %xmm0, %xmm3; \
  141. mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  142. addps %xmm3, %xmm6; \
  143. movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  144. addps %xmm0, %xmm7; \
  145. movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  146. #define KERNEL5(address) \
  147. mulps %xmm1, %xmm2; \
  148. addps %xmm2, %xmm4; \
  149. movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  150. mulps %xmm1, %xmm2; \
  151. addps %xmm2, %xmm5; \
  152. movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  153. mulps %xmm1, %xmm2; \
  154. mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  155. addps %xmm2, %xmm6; \
  156. movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  157. addps %xmm1, %xmm7; \
  158. movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  159. #define KERNEL6(address) \
  160. mulps %xmm1, %xmm3; \
  161. addps %xmm3, %xmm4; \
  162. movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  163. mulps %xmm1, %xmm3; \
  164. addps %xmm3, %xmm5; \
  165. movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  166. mulps %xmm1, %xmm3; \
  167. mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  168. addps %xmm3, %xmm6; \
  169. movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  170. addps %xmm1, %xmm7; \
  171. movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  172. #define KERNEL7(address) \
  173. mulps %xmm1, %xmm2; \
  174. addps %xmm2, %xmm4; \
  175. movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  176. mulps %xmm1, %xmm2; \
  177. addps %xmm2, %xmm5; \
  178. movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  179. mulps %xmm1, %xmm2; \
  180. mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  181. addps %xmm2, %xmm6; \
  182. movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  183. addps %xmm1, %xmm7; \
  184. movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  185. #define KERNEL8(address) \
  186. mulps %xmm1, %xmm3; \
  187. addps %xmm3, %xmm4; \
  188. movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  189. mulps %xmm1, %xmm3; \
  190. addps %xmm3, %xmm5; \
  191. movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  192. mulps %xmm1, %xmm3; \
  193. mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  194. addps %xmm3, %xmm6; \
  195. movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  196. addps %xmm1, %xmm7; \
  197. movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
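# Entry: save the callee-saved registers, carve out a page-aligned local area
# (POSINV, saved scalars, and BUFFER for the expanded B panel), then copy the
# stack arguments into it.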
  198. PROLOGUE
  199. pushl %ebp
  200. pushl %edi
  201. pushl %esi
  202. pushl %ebx
  203. PROFCODE
  204. movl %esp, %esi # save old stack
  205. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  206. andl $-STACK_ALIGN, %esp # align stack
  207. addl $STACK_OFFSET, %esp
  208. STACK_TOUCHING
  209. movl STACK_M, %ebx
  210. movl STACK_N, %eax
  211. movl STACK_K, %ecx
  212. movl STACK_A, %edx
  213. movl %ebx, M
  214. movl %eax, N
  215. movl %ecx, K
  216. movl %edx, A
  217. movl %esi, OLD_STACK
  218. movl STACK_B, %edi
  219. movl STACK_C, %ebx
  220. movss STACK_OFFT, %xmm4
  221. xorps %xmm7, %xmm7
  222. pcmpeqb %xmm7, %xmm7
  223. pslld $31, %xmm7
  224. xorps %xmm2, %xmm2
  225. #ifndef CONJ
  226. movss %xmm7, 0 + POSINV
  227. movss %xmm2, 4 + POSINV
  228. movss %xmm7, 8 + POSINV
  229. movss %xmm2, 12 + POSINV
  230. #else
  231. movss %xmm2, 0 + POSINV
  232. movss %xmm7, 4 + POSINV
  233. movss %xmm2, 8 + POSINV
  234. movss %xmm7, 12 + POSINV
  235. #endif
  236. EMMS
  237. movl %ebx, C
  238. movl STACK_LDC, LDC
  239. movss %xmm4, OFFSET
  240. movss %xmm4, KK
  241. sall $ZBASE_SHIFT, LDC
  242. #ifdef LN
  243. movl M, %eax
  244. sall $ZBASE_SHIFT, %eax
  245. addl %eax, C
  246. imull K, %eax
  247. addl %eax, A
  248. #endif
  249. #ifdef RT
  250. movl N, %eax
  251. sall $ZBASE_SHIFT, %eax
  252. imull K, %eax
  253. addl %eax, B
  254. movl N, %eax
  255. imull LDC, %eax
  256. addl %eax, C
  257. #endif
  258. #ifdef RN
  259. negl KK
  260. #endif
  261. #ifdef RT
  262. movl N, %eax
  263. subl OFFSET, %eax
  264. movl %eax, KK
  265. #endif
  266. movl N, %eax
  267. movl %eax, J
  268. sarl $1, J
  269. jle .L100
  270. ALIGN_4
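# .L01: outer loop over pairs of B columns (J = N / 2).  Each iteration first
# expands the current B panel into BUFFER, broadcasting every scalar across a
# full xmm register with pshufd, then walks the M dimension.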
  271. .L01:
  272. #ifdef LN
  273. movl OFFSET, %eax
  274. addl M, %eax
  275. movl %eax, KK
  276. #endif
  277. leal BUFFER, %ecx
  278. #ifdef RT
  279. movl K, %eax
  280. sall $1 + ZBASE_SHIFT, %eax
  281. subl %eax, B
  282. #endif
  283. #if defined(LN) || defined(RT)
  284. movl KK, %eax
  285. movl B, BORIG
  286. sall $1 + ZBASE_SHIFT, %eax
  287. addl %eax, B
  288. leal (BB, %eax, 4), BB
  289. #endif
  290. #if defined(LT)
  291. movl OFFSET, %eax
  292. movl %eax, KK
  293. #endif
  294. #if defined(LT) || defined(RN)
  295. movl KK, %eax
  296. #else
  297. movl K, %eax
  298. subl KK, %eax
  299. #endif
  300. sarl $1, %eax
  301. jle .L03
  302. ALIGN_4
  303. .L02:
  304. movaps 0 * SIZE(B), %xmm3
  305. movaps 4 * SIZE(B), %xmm7
  306. pshufd $0x00, %xmm3, %xmm0
  307. pshufd $0x55, %xmm3, %xmm1
  308. pshufd $0xaa, %xmm3, %xmm2
  309. pshufd $0xff, %xmm3, %xmm3
  310. movaps %xmm0, 0 * SIZE(BB)
  311. movaps %xmm1, 4 * SIZE(BB)
  312. movaps %xmm2, 8 * SIZE(BB)
  313. movaps %xmm3, 12 * SIZE(BB)
  314. pshufd $0x00, %xmm7, %xmm4
  315. pshufd $0x55, %xmm7, %xmm5
  316. pshufd $0xaa, %xmm7, %xmm6
  317. pshufd $0xff, %xmm7, %xmm7
  318. movaps %xmm4, 16 * SIZE(BB)
  319. movaps %xmm5, 20 * SIZE(BB)
  320. movaps %xmm6, 24 * SIZE(BB)
  321. movaps %xmm7, 28 * SIZE(BB)
  322. addl $ 8 * SIZE, B
  323. addl $32 * SIZE, BB
  324. decl %eax
  325. jne .L02
  326. ALIGN_4
  327. .L03:
  328. #if defined(LT) || defined(RN)
  329. movl KK, %eax
  330. #else
  331. movl K, %eax
  332. subl KK, %eax
  333. #endif
  334. andl $1, %eax
  335. BRANCH
  336. jle .L05
  337. ALIGN_4
  338. .L04:
  339. movaps 0 * SIZE(B), %xmm3
  340. pshufd $0x00, %xmm3, %xmm0
  341. pshufd $0x55, %xmm3, %xmm1
  342. pshufd $0xaa, %xmm3, %xmm2
  343. pshufd $0xff, %xmm3, %xmm3
  344. movaps %xmm0, 0 * SIZE(BB)
  345. movaps %xmm1, 4 * SIZE(BB)
  346. movaps %xmm2, 8 * SIZE(BB)
  347. movaps %xmm3, 12 * SIZE(BB)
  348. addl $ 4 * SIZE, B
  349. ALIGN_4
  350. .L05:
  351. #if defined(LT) || defined(RN)
  352. movl A, %eax
  353. movl %eax, AA
  354. #else
  355. movl A, %eax
  356. movl %eax, AORIG
  357. #endif
  358. leal (, LDC, 2), %eax
  359. #ifdef RT
  360. subl %eax, C
  361. #endif
  362. movl C, CO1
  363. #ifndef RT
  364. addl %eax, C
  365. #endif
  366. movl M, %ebx
  367. andl $1, %ebx
  368. jle .L30
  369. ALIGN_4
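# .L40: edge case of the two-column block for odd M; the single leftover row
# is processed with half-width (movsd) loads of A.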
  370. .L40:
  371. #ifdef LN
  372. movl K, %eax
  373. sall $ZBASE_SHIFT, %eax
  374. subl %eax, AORIG
  375. #endif
  376. #if defined(LN) || defined(RT)
  377. movl AORIG, %eax
  378. movl %eax, AA
  379. movl KK, %eax
  380. sall $ZBASE_SHIFT, %eax
  381. addl %eax, AA
  382. #endif
  383. leal BUFFER, BB # boffset1 = boffset
  384. #if defined(LN) || defined(RT)
  385. movl KK, %eax
  386. sall $3 + ZBASE_SHIFT, %eax
  387. addl %eax, BB
  388. #endif
  389. xorps %xmm4, %xmm4
  390. xorps %xmm5, %xmm5
  391. xorps %xmm6, %xmm6
  392. xorps %xmm7, %xmm7
  393. #ifdef movsd
  394. xorps %xmm0, %xmm0
  395. #endif
  396. movsd 0 * SIZE(AA), %xmm0
  397. #ifdef movsd
  398. xorps %xmm1, %xmm1
  399. #endif
  400. movsd 8 * SIZE(AA), %xmm1
  401. movaps 0 * SIZE(BB), %xmm2
  402. movaps 16 * SIZE(BB), %xmm3
  403. #if defined(LT) || defined(RN)
  404. movl KK, %eax
  405. #else
  406. movl K, %eax
  407. subl KK, %eax
  408. #endif
  409. sarl $3, %eax
  410. je .L42
  411. ALIGN_4
  412. .L41:
  413. mulps %xmm0, %xmm2
  414. prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA)
  415. addps %xmm2, %xmm4
  416. movaps 4 * SIZE(BB), %xmm2
  417. mulps %xmm0, %xmm2
  418. addps %xmm2, %xmm5
  419. movaps 8 * SIZE(BB), %xmm2
  420. mulps %xmm0, %xmm2
  421. mulps 12 * SIZE(BB), %xmm0
  422. addps %xmm2, %xmm6
  423. movaps 32 * SIZE(BB), %xmm2
  424. addps %xmm0, %xmm7
  425. movsd 2 * SIZE(AA), %xmm0
  426. mulps %xmm0, %xmm3
  427. addps %xmm3, %xmm4
  428. movaps 20 * SIZE(BB), %xmm3
  429. mulps %xmm0, %xmm3
  430. addps %xmm3, %xmm5
  431. movaps 24 * SIZE(BB), %xmm3
  432. mulps %xmm0, %xmm3
  433. mulps 28 * SIZE(BB), %xmm0
  434. addps %xmm3, %xmm6
  435. movaps 48 * SIZE(BB), %xmm3
  436. addps %xmm0, %xmm7
  437. movsd 4 * SIZE(AA), %xmm0
  438. mulps %xmm0, %xmm2
  439. addps %xmm2, %xmm4
  440. movaps 36 * SIZE(BB), %xmm2
  441. mulps %xmm0, %xmm2
  442. addps %xmm2, %xmm5
  443. movaps 40 * SIZE(BB), %xmm2
  444. mulps %xmm0, %xmm2
  445. mulps 44 * SIZE(BB), %xmm0
  446. addps %xmm2, %xmm6
  447. movaps 64 * SIZE(BB), %xmm2
  448. addps %xmm0, %xmm7
  449. movsd 6 * SIZE(AA), %xmm0
  450. mulps %xmm0, %xmm3
  451. addps %xmm3, %xmm4
  452. movaps 52 * SIZE(BB), %xmm3
  453. mulps %xmm0, %xmm3
  454. addps %xmm3, %xmm5
  455. movaps 56 * SIZE(BB), %xmm3
  456. mulps %xmm0, %xmm3
  457. mulps 60 * SIZE(BB), %xmm0
  458. addps %xmm3, %xmm6
  459. movaps 80 * SIZE(BB), %xmm3
  460. addps %xmm0, %xmm7
  461. movsd 16 * SIZE(AA), %xmm0
  462. mulps %xmm1, %xmm2
  463. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
  464. prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
  465. #endif
  466. addps %xmm2, %xmm4
  467. movaps 68 * SIZE(BB), %xmm2
  468. mulps %xmm1, %xmm2
  469. addps %xmm2, %xmm5
  470. movaps 72 * SIZE(BB), %xmm2
  471. mulps %xmm1, %xmm2
  472. mulps 76 * SIZE(BB), %xmm1
  473. addps %xmm2, %xmm6
  474. movaps 96 * SIZE(BB), %xmm2
  475. addps %xmm1, %xmm7
  476. movsd 10 * SIZE(AA), %xmm1
  477. mulps %xmm1, %xmm3
  478. addps %xmm3, %xmm4
  479. movaps 84 * SIZE(BB), %xmm3
  480. mulps %xmm1, %xmm3
  481. addps %xmm3, %xmm5
  482. movaps 88 * SIZE(BB), %xmm3
  483. mulps %xmm1, %xmm3
  484. mulps 92 * SIZE(BB), %xmm1
  485. addps %xmm3, %xmm6
  486. movaps 112 * SIZE(BB), %xmm3
  487. addps %xmm1, %xmm7
  488. movsd 12 * SIZE(AA), %xmm1
  489. mulps %xmm1, %xmm2
  490. addps %xmm2, %xmm4
  491. movaps 100 * SIZE(BB), %xmm2
  492. mulps %xmm1, %xmm2
  493. addps %xmm2, %xmm5
  494. movaps 104 * SIZE(BB), %xmm2
  495. mulps %xmm1, %xmm2
  496. mulps 108 * SIZE(BB), %xmm1
  497. addps %xmm2, %xmm6
  498. movaps 128 * SIZE(BB), %xmm2
  499. addps %xmm1, %xmm7
  500. movsd 14 * SIZE(AA), %xmm1
  501. mulps %xmm1, %xmm3
  502. addps %xmm3, %xmm4
  503. movaps 116 * SIZE(BB), %xmm3
  504. mulps %xmm1, %xmm3
  505. addps %xmm3, %xmm5
  506. movaps 120 * SIZE(BB), %xmm3
  507. mulps %xmm1, %xmm3
  508. mulps 124 * SIZE(BB), %xmm1
  509. addps %xmm3, %xmm6
  510. movaps 144 * SIZE(BB), %xmm3
  511. addps %xmm1, %xmm7
  512. movsd 24 * SIZE(AA), %xmm1
  513. addl $ 16 * SIZE, AA
  514. addl $128 * SIZE, BB
  515. decl %eax
  516. jne .L41
  517. ALIGN_4
  518. .L42:
  519. #if defined(LT) || defined(RN)
  520. movl KK, %eax
  521. #else
  522. movl K, %eax
  523. subl KK, %eax
  524. #endif
  525. andl $7, %eax # if (k & 7)
  526. BRANCH
  527. je .L44
  528. ALIGN_4
  529. .L43:
  530. mulps %xmm0, %xmm2
  531. addps %xmm2, %xmm4
  532. movaps 4 * SIZE(BB), %xmm2
  533. mulps %xmm0, %xmm2
  534. addps %xmm2, %xmm5
  535. movaps 8 * SIZE(BB), %xmm2
  536. mulps %xmm0, %xmm2
  537. mulps 12 * SIZE(BB), %xmm0
  538. addps %xmm2, %xmm6
  539. movaps 16 * SIZE(BB), %xmm2
  540. addps %xmm0, %xmm7
  541. movsd 2 * SIZE(AA), %xmm0
  542. addl $ 2 * SIZE, AA
  543. addl $16 * SIZE, BB
  544. decl %eax
  545. jg .L43
  546. ALIGN_4
  547. .L44:
  548. movaps POSINV, %xmm0
  549. shufps $0xb1, %xmm5, %xmm5
  550. shufps $0xb1, %xmm7, %xmm7
  551. #if defined(LN) || defined(LT)
  552. #ifndef CONJ
  553. xorps %xmm0, %xmm5
  554. xorps %xmm0, %xmm7
  555. #else
  556. xorps %xmm0, %xmm4
  557. xorps %xmm0, %xmm6
  558. #endif
  559. #else
  560. xorps %xmm0, %xmm5
  561. xorps %xmm0, %xmm7
  562. #endif
  563. addps %xmm5, %xmm4
  564. addps %xmm7, %xmm6
  565. #if defined(LN) || defined(RT)
  566. movl KK, %eax
  567. #ifdef LN
  568. subl $1, %eax
  569. #else
  570. subl $2, %eax
  571. #endif
  572. movl AORIG, AA
  573. movl BORIG, B
  574. leal BUFFER, BB
  575. sall $ZBASE_SHIFT, %eax
  576. leal (AA, %eax, 1), AA
  577. leal (B, %eax, 2), B
  578. leal (BB, %eax, 8), BB
  579. #endif
  580. #if defined(LN) || defined(LT)
  581. unpcklpd %xmm6, %xmm4
  582. movaps 0 * SIZE(B), %xmm2
  583. subps %xmm4, %xmm2
  584. #else
  585. #ifdef movsd
  586. xorps %xmm1, %xmm1
  587. #endif
  588. movsd 0 * SIZE(AA), %xmm1
  589. #ifdef movsd
  590. xorps %xmm5, %xmm5
  591. #endif
  592. movsd 2 * SIZE(AA), %xmm5
  593. subps %xmm4, %xmm1
  594. subps %xmm6, %xmm5
  595. #endif
  596. #if defined(LN) || defined(LT)
  597. movaps 0 * SIZE(AA), %xmm5
  598. pshufd $0x44, %xmm5, %xmm6
  599. pshufd $0x11, %xmm5, %xmm7
  600. pshufd $0xa0, %xmm2, %xmm4
  601. pshufd $0xf5, %xmm2, %xmm2
  602. #ifndef CONJ
  603. xorps %xmm0, %xmm2
  604. #else
  605. xorps %xmm0, %xmm4
  606. #endif
  607. mulps %xmm6, %xmm4
  608. mulps %xmm7, %xmm2
  609. addps %xmm4, %xmm2
  610. #endif
  611. #ifdef RN
  612. movaps 0 * SIZE(B), %xmm4
  613. pshufd $0x44, %xmm4, %xmm6
  614. pshufd $0x11, %xmm4, %xmm7
  615. pshufd $0xa0, %xmm1, %xmm3
  616. pshufd $0xf5, %xmm1, %xmm1
  617. #ifndef CONJ
  618. xorps %xmm0, %xmm1
  619. #else
  620. xorps %xmm0, %xmm3
  621. #endif
  622. mulps %xmm6, %xmm3
  623. mulps %xmm7, %xmm1
  624. addps %xmm3, %xmm1
  625. pshufd $0xee, %xmm4, %xmm6
  626. pshufd $0xbb, %xmm4, %xmm7
  627. pshufd $0xa0, %xmm1, %xmm3
  628. pshufd $0xf5, %xmm1, %xmm2
  629. #ifndef CONJ
  630. xorps %xmm0, %xmm2
  631. #else
  632. xorps %xmm0, %xmm3
  633. #endif
  634. mulps %xmm6, %xmm3
  635. mulps %xmm7, %xmm2
  636. subps %xmm3, %xmm5
  637. subps %xmm2, %xmm5
  638. movaps 4 * SIZE(B), %xmm4
  639. pshufd $0xee, %xmm4, %xmm6
  640. pshufd $0xbb, %xmm4, %xmm7
  641. pshufd $0xa0, %xmm5, %xmm3
  642. pshufd $0xf5, %xmm5, %xmm5
  643. #ifndef CONJ
  644. xorps %xmm0, %xmm5
  645. #else
  646. xorps %xmm0, %xmm3
  647. #endif
  648. mulps %xmm6, %xmm3
  649. mulps %xmm7, %xmm5
  650. addps %xmm3, %xmm5
  651. #endif
  652. #ifdef RT
  653. movaps 4 * SIZE(B), %xmm4
  654. pshufd $0xee, %xmm4, %xmm6
  655. pshufd $0xbb, %xmm4, %xmm7
  656. pshufd $0xa0, %xmm5, %xmm3
  657. pshufd $0xf5, %xmm5, %xmm5
  658. #ifndef CONJ
  659. xorps %xmm0, %xmm5
  660. #else
  661. xorps %xmm0, %xmm3
  662. #endif
  663. mulps %xmm6, %xmm3
  664. mulps %xmm7, %xmm5
  665. addps %xmm3, %xmm5
  666. pshufd $0x44, %xmm4, %xmm6
  667. pshufd $0x11, %xmm4, %xmm7
  668. pshufd $0xa0, %xmm5, %xmm3
  669. pshufd $0xf5, %xmm5, %xmm2
  670. #ifndef CONJ
  671. xorps %xmm0, %xmm2
  672. #else
  673. xorps %xmm0, %xmm3
  674. #endif
  675. mulps %xmm6, %xmm3
  676. mulps %xmm7, %xmm2
  677. subps %xmm3, %xmm1
  678. subps %xmm2, %xmm1
  679. movaps 0 * SIZE(B), %xmm4
  680. pshufd $0x44, %xmm4, %xmm6
  681. pshufd $0x11, %xmm4, %xmm7
  682. pshufd $0xa0, %xmm1, %xmm3
  683. pshufd $0xf5, %xmm1, %xmm1
  684. #ifndef CONJ
  685. xorps %xmm0, %xmm1
  686. #else
  687. xorps %xmm0, %xmm3
  688. #endif
  689. mulps %xmm6, %xmm3
  690. mulps %xmm7, %xmm1
  691. addps %xmm3, %xmm1
  692. #endif
  693. #ifdef LN
  694. subl $2 * SIZE, CO1
  695. #endif
  696. #if defined(LN) || defined(LT)
  697. movaps %xmm2, 0 * SIZE(B)
  698. pshufd $0x00, %xmm2, %xmm0
  699. pshufd $0x55, %xmm2, %xmm1
  700. pshufd $0xaa, %xmm2, %xmm4
  701. pshufd $0xff, %xmm2, %xmm5
  702. movaps %xmm0, 0 * SIZE(BB)
  703. movaps %xmm1, 4 * SIZE(BB)
  704. movaps %xmm4, 8 * SIZE(BB)
  705. movaps %xmm5, 12 * SIZE(BB)
  706. movlps %xmm2, 0 * SIZE(CO1)
  707. movhps %xmm2, 0 * SIZE(CO1, LDC)
  708. #else
  709. movlps %xmm1, 0 * SIZE(AA)
  710. movlps %xmm5, 2 * SIZE(AA)
  711. movlps %xmm1, 0 * SIZE(CO1)
  712. movlps %xmm5, 0 * SIZE(CO1, LDC)
  713. #endif
  714. #ifndef LN
  715. addl $2 * SIZE, CO1
  716. #endif
  717. #if defined(LT) || defined(RN)
  718. movl K, %eax
  719. subl KK, %eax
  720. sall $ZBASE_SHIFT, %eax
  721. addl %eax, AA
  722. #ifdef LT
  723. addl $4 * SIZE, B
  724. #endif
  725. #endif
  726. #ifdef LN
  727. subl $1, KK
  728. movl BORIG, B
  729. #endif
  730. #ifdef LT
  731. addl $1, KK
  732. #endif
  733. #ifdef RT
  734. movl K, %eax
  735. movl BORIG, B
  736. sall $ZBASE_SHIFT, %eax
  737. addl %eax, AORIG
  738. #endif
  739. ALIGN_4
  740. .L30:
  741. movl M, %ebx
  742. sarl $1, %ebx
  743. jle .L99
  744. ALIGN_4
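# .L10: main loop over M / 2 row pairs of the two-column block.  KERNEL1..8
# accumulate the partial product in %xmm4..%xmm7; the product is subtracted
# from the stored tile, the triangular substitution with the diagonal block
# is applied, and the 2x2 result is written back to the packed panel and C.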
  745. .L10:
  746. #ifdef LN
  747. movl K, %eax
  748. sall $1 + ZBASE_SHIFT, %eax
  749. subl %eax, AORIG
  750. #endif
  751. #if defined(LN) || defined(RT)
  752. movl AORIG, %eax
  753. movl %eax, AA
  754. movl KK, %eax
  755. sall $1 + ZBASE_SHIFT, %eax
  756. addl %eax, AA
  757. #endif
  758. leal BUFFER, BB # boffset1 = boffset
  759. #if defined(LN) || defined(RT)
  760. movl KK, %eax
  761. sall $3 + ZBASE_SHIFT, %eax
  762. addl %eax, BB
  763. #endif
  764. movaps 0 * SIZE(AA), %xmm0
  765. xorps %xmm4, %xmm4
  766. movaps 16 * SIZE(AA), %xmm1
  767. xorps %xmm5, %xmm5
  768. movaps 0 * SIZE(BB), %xmm2
  769. xorps %xmm6, %xmm6
  770. movaps 16 * SIZE(BB), %xmm3
  771. xorps %xmm7, %xmm7
  772. PREFETCHW -4 * SIZE(CO1)
  773. PREFETCHW -4 * SIZE(CO1, LDC)
  774. #if defined(LT) || defined(RN)
  775. movl KK, %eax
  776. #else
  777. movl K, %eax
  778. subl KK, %eax
  779. #endif
  780. sarl $3, %eax
  781. je .L15
  782. ALIGN_4
  783. .L11:
  784. KERNEL1(0 * 16)
  785. KERNEL2(0 * 16)
  786. KERNEL3(0 * 16)
  787. KERNEL4(0 * 16)
  788. KERNEL5(0 * 16)
  789. KERNEL6(0 * 16)
  790. KERNEL7(0 * 16)
  791. KERNEL8(0 * 16)
  792. addl $ 32 * SIZE, AA
  793. addl $128 * SIZE, BB
  794. decl %eax
  795. jne .L11
  796. ALIGN_4
  797. .L15:
  798. #if defined(LT) || defined(RN)
  799. movl KK, %eax
  800. #else
  801. movl K, %eax
  802. subl KK, %eax
  803. #endif
  804. andl $7, %eax # if (k & 7)
  805. BRANCH
  806. je .L14
  807. ALIGN_4
  808. .L13:
  809. mulps %xmm0, %xmm2
  810. addps %xmm2, %xmm4
  811. movaps 4 * SIZE(BB), %xmm2
  812. mulps %xmm0, %xmm2
  813. addps %xmm2, %xmm5
  814. movaps 8 * SIZE(BB), %xmm2
  815. mulps %xmm0, %xmm2
  816. mulps 12 * SIZE(BB), %xmm0
  817. addps %xmm2, %xmm6
  818. movaps 16 * SIZE(BB), %xmm2
  819. addps %xmm0, %xmm7
  820. movaps 4 * SIZE(AA), %xmm0
  821. addl $ 4 * SIZE, AA
  822. addl $16 * SIZE, BB
  823. decl %eax
  824. jg .L13
  825. ALIGN_4
  826. .L14:
  827. movaps POSINV, %xmm0
  828. shufps $0xb1, %xmm5, %xmm5
  829. shufps $0xb1, %xmm7, %xmm7
  830. #if defined(LN) || defined(LT)
  831. #ifndef CONJ
  832. xorps %xmm0, %xmm5
  833. xorps %xmm0, %xmm7
  834. #else
  835. xorps %xmm0, %xmm4
  836. xorps %xmm0, %xmm6
  837. #endif
  838. #else
  839. xorps %xmm0, %xmm5
  840. xorps %xmm0, %xmm7
  841. #endif
  842. addps %xmm5, %xmm4
  843. addps %xmm7, %xmm6
  844. #if defined(LN) || defined(RT)
  845. movl KK, %eax
  846. #ifdef LN
  847. subl $2, %eax
  848. #else
  849. subl $2, %eax
  850. #endif
  851. movl AORIG, AA
  852. movl BORIG, B
  853. leal BUFFER, BB
  854. sall $ZBASE_SHIFT, %eax
  855. leal (AA, %eax, 2), AA
  856. leal (B, %eax, 2), B
  857. leal (BB, %eax, 8), BB
  858. #endif
  859. #if defined(LN) || defined(LT)
  860. movaps %xmm4, %xmm5
  861. unpcklpd %xmm6, %xmm4
  862. unpckhpd %xmm6, %xmm5
  863. movaps 0 * SIZE(B), %xmm2
  864. movaps 4 * SIZE(B), %xmm3
  865. subps %xmm4, %xmm2
  866. subps %xmm5, %xmm3
  867. #else
  868. movaps 0 * SIZE(AA), %xmm1
  869. movaps 4 * SIZE(AA), %xmm5
  870. subps %xmm4, %xmm1
  871. subps %xmm6, %xmm5
  872. #endif
  873. #ifdef LN
  874. movaps 4 * SIZE(AA), %xmm5
  875. pshufd $0xee, %xmm5, %xmm6
  876. pshufd $0xbb, %xmm5, %xmm7
  877. pshufd $0xa0, %xmm3, %xmm4
  878. pshufd $0xf5, %xmm3, %xmm3
  879. #ifndef CONJ
  880. xorps %xmm0, %xmm3
  881. #else
  882. xorps %xmm0, %xmm4
  883. #endif
  884. mulps %xmm6, %xmm4
  885. mulps %xmm7, %xmm3
  886. addps %xmm4, %xmm3
  887. pshufd $0x44, %xmm5, %xmm6
  888. pshufd $0x11, %xmm5, %xmm7
  889. pshufd $0xa0, %xmm3, %xmm4
  890. pshufd $0xf5, %xmm3, %xmm1
  891. #ifndef CONJ
  892. xorps %xmm0, %xmm1
  893. #else
  894. xorps %xmm0, %xmm4
  895. #endif
  896. mulps %xmm6, %xmm4
  897. mulps %xmm7, %xmm1
  898. subps %xmm4, %xmm2
  899. subps %xmm1, %xmm2
  900. movaps 0 * SIZE(AA), %xmm5
  901. pshufd $0x44, %xmm5, %xmm6
  902. pshufd $0x11, %xmm5, %xmm7
  903. pshufd $0xa0, %xmm2, %xmm4
  904. pshufd $0xf5, %xmm2, %xmm2
  905. #ifndef CONJ
  906. xorps %xmm0, %xmm2
  907. #else
  908. xorps %xmm0, %xmm4
  909. #endif
  910. mulps %xmm6, %xmm4
  911. mulps %xmm7, %xmm2
  912. addps %xmm4, %xmm2
  913. #endif
  914. #ifdef LT
  915. movaps 0 * SIZE(AA), %xmm5
  916. pshufd $0x44, %xmm5, %xmm6
  917. pshufd $0x11, %xmm5, %xmm7
  918. pshufd $0xa0, %xmm2, %xmm4
  919. pshufd $0xf5, %xmm2, %xmm2
  920. #ifndef CONJ
  921. xorps %xmm0, %xmm2
  922. #else
  923. xorps %xmm0, %xmm4
  924. #endif
  925. mulps %xmm6, %xmm4
  926. mulps %xmm7, %xmm2
  927. addps %xmm4, %xmm2
  928. pshufd $0xee, %xmm5, %xmm6
  929. pshufd $0xbb, %xmm5, %xmm7
  930. pshufd $0xa0, %xmm2, %xmm4
  931. pshufd $0xf5, %xmm2, %xmm1
  932. #ifndef CONJ
  933. xorps %xmm0, %xmm1
  934. #else
  935. xorps %xmm0, %xmm4
  936. #endif
  937. mulps %xmm6, %xmm4
  938. mulps %xmm7, %xmm1
  939. subps %xmm4, %xmm3
  940. subps %xmm1, %xmm3
  941. movaps 4 * SIZE(AA), %xmm5
  942. pshufd $0xee, %xmm5, %xmm6
  943. pshufd $0xbb, %xmm5, %xmm7
  944. pshufd $0xa0, %xmm3, %xmm4
  945. pshufd $0xf5, %xmm3, %xmm3
  946. #ifndef CONJ
  947. xorps %xmm0, %xmm3
  948. #else
  949. xorps %xmm0, %xmm4
  950. #endif
  951. mulps %xmm6, %xmm4
  952. mulps %xmm7, %xmm3
  953. addps %xmm4, %xmm3
  954. #endif
  955. #ifdef RN
  956. movaps 0 * SIZE(B), %xmm4
  957. pshufd $0x44, %xmm4, %xmm6
  958. pshufd $0x11, %xmm4, %xmm7
  959. pshufd $0xa0, %xmm1, %xmm3
  960. pshufd $0xf5, %xmm1, %xmm1
  961. #ifndef CONJ
  962. xorps %xmm0, %xmm1
  963. #else
  964. xorps %xmm0, %xmm3
  965. #endif
  966. mulps %xmm6, %xmm3
  967. mulps %xmm7, %xmm1
  968. addps %xmm3, %xmm1
  969. pshufd $0xee, %xmm4, %xmm6
  970. pshufd $0xbb, %xmm4, %xmm7
  971. pshufd $0xa0, %xmm1, %xmm3
  972. pshufd $0xf5, %xmm1, %xmm2
  973. #ifndef CONJ
  974. xorps %xmm0, %xmm2
  975. #else
  976. xorps %xmm0, %xmm3
  977. #endif
  978. mulps %xmm6, %xmm3
  979. mulps %xmm7, %xmm2
  980. subps %xmm3, %xmm5
  981. subps %xmm2, %xmm5
  982. movaps 4 * SIZE(B), %xmm4
  983. pshufd $0xee, %xmm4, %xmm6
  984. pshufd $0xbb, %xmm4, %xmm7
  985. pshufd $0xa0, %xmm5, %xmm3
  986. pshufd $0xf5, %xmm5, %xmm5
  987. #ifndef CONJ
  988. xorps %xmm0, %xmm5
  989. #else
  990. xorps %xmm0, %xmm3
  991. #endif
  992. mulps %xmm6, %xmm3
  993. mulps %xmm7, %xmm5
  994. addps %xmm3, %xmm5
  995. #endif
  996. #ifdef RT
  997. movaps 4 * SIZE(B), %xmm4
  998. pshufd $0xee, %xmm4, %xmm6
  999. pshufd $0xbb, %xmm4, %xmm7
  1000. pshufd $0xa0, %xmm5, %xmm3
  1001. pshufd $0xf5, %xmm5, %xmm5
  1002. #ifndef CONJ
  1003. xorps %xmm0, %xmm5
  1004. #else
  1005. xorps %xmm0, %xmm3
  1006. #endif
  1007. mulps %xmm6, %xmm3
  1008. mulps %xmm7, %xmm5
  1009. addps %xmm3, %xmm5
  1010. pshufd $0x44, %xmm4, %xmm6
  1011. pshufd $0x11, %xmm4, %xmm7
  1012. pshufd $0xa0, %xmm5, %xmm3
  1013. pshufd $0xf5, %xmm5, %xmm2
  1014. #ifndef CONJ
  1015. xorps %xmm0, %xmm2
  1016. #else
  1017. xorps %xmm0, %xmm3
  1018. #endif
  1019. mulps %xmm6, %xmm3
  1020. mulps %xmm7, %xmm2
  1021. subps %xmm3, %xmm1
  1022. subps %xmm2, %xmm1
  1023. movaps 0 * SIZE(B), %xmm4
  1024. pshufd $0x44, %xmm4, %xmm6
  1025. pshufd $0x11, %xmm4, %xmm7
  1026. pshufd $0xa0, %xmm1, %xmm3
  1027. pshufd $0xf5, %xmm1, %xmm1
  1028. #ifndef CONJ
  1029. xorps %xmm0, %xmm1
  1030. #else
  1031. xorps %xmm0, %xmm3
  1032. #endif
  1033. mulps %xmm6, %xmm3
  1034. mulps %xmm7, %xmm1
  1035. addps %xmm3, %xmm1
  1036. #endif
  1037. #ifdef LN
  1038. subl $4 * SIZE, CO1
  1039. #endif
  1040. #if defined(LN) || defined(LT)
  1041. movaps %xmm2, 0 * SIZE(B)
  1042. movaps %xmm3, 4 * SIZE(B)
  1043. pshufd $0x00, %xmm2, %xmm0
  1044. pshufd $0x55, %xmm2, %xmm1
  1045. pshufd $0xaa, %xmm2, %xmm4
  1046. pshufd $0xff, %xmm2, %xmm5
  1047. movaps %xmm0, 0 * SIZE(BB)
  1048. movaps %xmm1, 4 * SIZE(BB)
  1049. movaps %xmm4, 8 * SIZE(BB)
  1050. movaps %xmm5, 12 * SIZE(BB)
  1051. pshufd $0x00, %xmm3, %xmm0
  1052. pshufd $0x55, %xmm3, %xmm1
  1053. pshufd $0xaa, %xmm3, %xmm4
  1054. pshufd $0xff, %xmm3, %xmm5
  1055. movaps %xmm0, 16 * SIZE(BB)
  1056. movaps %xmm1, 20 * SIZE(BB)
  1057. movaps %xmm4, 24 * SIZE(BB)
  1058. movaps %xmm5, 28 * SIZE(BB)
  1059. movlps %xmm2, 0 * SIZE(CO1)
  1060. movlps %xmm3, 2 * SIZE(CO1)
  1061. movhps %xmm2, 0 * SIZE(CO1, LDC)
  1062. movhps %xmm3, 2 * SIZE(CO1, LDC)
  1063. #else
  1064. movaps %xmm1, 0 * SIZE(AA)
  1065. movaps %xmm5, 4 * SIZE(AA)
  1066. movlps %xmm1, 0 * SIZE(CO1)
  1067. movhps %xmm1, 2 * SIZE(CO1)
  1068. movlps %xmm5, 0 * SIZE(CO1, LDC)
  1069. movhps %xmm5, 2 * SIZE(CO1, LDC)
  1070. #endif
  1071. #ifndef LN
  1072. addl $4 * SIZE, CO1
  1073. #endif
  1074. #if defined(LT) || defined(RN)
  1075. movl K, %eax
  1076. subl KK, %eax
  1077. sall $1 + ZBASE_SHIFT, %eax
  1078. addl %eax, AA
  1079. #ifdef LT
  1080. addl $8 * SIZE, B
  1081. #endif
  1082. #endif
  1083. #ifdef LN
  1084. subl $2, KK
  1085. movl BORIG, B
  1086. #endif
  1087. #ifdef LT
  1088. addl $2, KK
  1089. #endif
  1090. #ifdef RT
  1091. movl K, %eax
  1092. movl BORIG, B
  1093. sall $1 + ZBASE_SHIFT, %eax
  1094. addl %eax, AORIG
  1095. #endif
  1096. decl %ebx
  1097. jg .L10
  1098. ALIGN_4
  1099. .L99:
  1100. #ifdef LN
  1101. movl K, %eax
  1102. sall $1 + ZBASE_SHIFT, %eax
  1103. addl %eax, B
  1104. #endif
  1105. #if defined(LT) || defined(RN)
  1106. movl K, %eax
  1107. subl KK, %eax
  1108. sall $1 + ZBASE_SHIFT, %eax
  1109. addl %eax, B
  1110. #endif
  1111. #ifdef RN
  1112. addl $2, KK
  1113. #endif
  1114. #ifdef RT
  1115. subl $2, KK
  1116. #endif
  1117. decl J # j --
  1118. jg .L01
  1119. ALIGN_4
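# .L100: tail for odd N.  The single remaining column of B is handled with
# the same pack / multiply-accumulate / substitution structure as above, but
# writing only one C column (CO1).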
  1120. .L100:
  1121. movl N, %eax
  1122. andl $1, %eax
  1123. jle .L999
  1124. ALIGN_4
  1125. .L101:
  1126. #ifdef LN
  1127. movl OFFSET, %eax
  1128. addl M, %eax
  1129. movl %eax, KK
  1130. #endif
  1131. leal BUFFER, %ecx
  1132. #ifdef RT
  1133. movl K, %eax
  1134. sall $ZBASE_SHIFT, %eax
  1135. subl %eax, B
  1136. #endif
  1137. #if defined(LN) || defined(RT)
  1138. movl KK, %eax
  1139. movl B, BORIG
  1140. sall $ZBASE_SHIFT, %eax
  1141. addl %eax, B
  1142. leal (BB, %eax, 4), BB
  1143. #endif
  1144. #if defined(LT)
  1145. movl OFFSET, %eax
  1146. movl %eax, KK
  1147. #endif
  1148. #if defined(LT) || defined(RN)
  1149. movl KK, %eax
  1150. #else
  1151. movl K, %eax
  1152. subl KK, %eax
  1153. #endif
  1154. sarl $2, %eax
  1155. jle .L103
  1156. ALIGN_4
  1157. .L102:
  1158. movaps 0 * SIZE(B), %xmm3
  1159. movaps 4 * SIZE(B), %xmm7
  1160. pshufd $0x00, %xmm3, %xmm0
  1161. pshufd $0x55, %xmm3, %xmm1
  1162. pshufd $0xaa, %xmm3, %xmm2
  1163. pshufd $0xff, %xmm3, %xmm3
  1164. movaps %xmm0, 0 * SIZE(BB)
  1165. movaps %xmm1, 4 * SIZE(BB)
  1166. movaps %xmm2, 8 * SIZE(BB)
  1167. movaps %xmm3, 12 * SIZE(BB)
  1168. pshufd $0x00, %xmm7, %xmm4
  1169. pshufd $0x55, %xmm7, %xmm5
  1170. pshufd $0xaa, %xmm7, %xmm6
  1171. pshufd $0xff, %xmm7, %xmm7
  1172. movaps %xmm4, 16 * SIZE(BB)
  1173. movaps %xmm5, 20 * SIZE(BB)
  1174. movaps %xmm6, 24 * SIZE(BB)
  1175. movaps %xmm7, 28 * SIZE(BB)
  1176. addl $ 8 * SIZE, B
  1177. addl $32 * SIZE, BB
  1178. decl %eax
  1179. jne .L102
  1180. ALIGN_4
  1181. .L103:
  1182. #if defined(LT) || defined(RN)
  1183. movl KK, %eax
  1184. #else
  1185. movl K, %eax
  1186. subl KK, %eax
  1187. #endif
  1188. andl $3, %eax
  1189. BRANCH
  1190. jle .L105
  1191. ALIGN_4
  1192. .L104:
  1193. #ifdef movsd
  1194. xorps %xmm3, %xmm3
  1195. #endif
  1196. movsd 0 * SIZE(B), %xmm3
  1197. pshufd $0x00, %xmm3, %xmm0
  1198. pshufd $0x55, %xmm3, %xmm1
  1199. movaps %xmm0, 0 * SIZE(BB)
  1200. movaps %xmm1, 4 * SIZE(BB)
  1201. addl $ 2 * SIZE, %edi
  1202. addl $ 8 * SIZE, %ecx
  1203. decl %eax
  1204. jne .L104
  1205. ALIGN_4
  1206. .L105:
  1207. #if defined(LT) || defined(RN)
  1208. movl A, %eax
  1209. movl %eax, AA
  1210. #else
  1211. movl A, %eax
  1212. movl %eax, AORIG
  1213. #endif
  1214. #ifdef RT
  1215. subl LDC, C
  1216. #endif
  1217. movl C, CO1
  1218. #ifndef RT
  1219. addl LDC, C
  1220. #endif
  1221. movl M, %ebx
  1222. andl $1, %ebx
  1223. jle .L130
  1224. #ifdef LN
  1225. movl K, %eax
  1226. sall $ZBASE_SHIFT, %eax
  1227. subl %eax, AORIG
  1228. #endif
  1229. #if defined(LN) || defined(RT)
  1230. movl AORIG, %eax
  1231. movl %eax, AA
  1232. movl KK, %eax
  1233. sall $ZBASE_SHIFT, %eax
  1234. addl %eax, AA
  1235. #endif
  1236. leal BUFFER, BB # boffset1 = boffset
  1237. #if defined(LN) || defined(RT)
  1238. movl KK, %eax
  1239. sall $2 + ZBASE_SHIFT, %eax
  1240. addl %eax, BB
  1241. #endif
  1242. #ifdef movsd
  1243. xorps %xmm0, %xmm0
  1244. #endif
  1245. movsd 0 * SIZE(AA), %xmm0
  1246. xorps %xmm4, %xmm4
  1247. #ifdef movsd
  1248. xorps %xmm1, %xmm1
  1249. #endif
  1250. movsd 8 * SIZE(AA), %xmm1
  1251. xorps %xmm5, %xmm5
  1252. movaps 0 * SIZE(BB), %xmm2
  1253. xorps %xmm6, %xmm6
  1254. movaps 16 * SIZE(BB), %xmm3
  1255. xorps %xmm7, %xmm7
  1256. #if defined(LT) || defined(RN)
  1257. movl KK, %eax
  1258. #else
  1259. movl K, %eax
  1260. subl KK, %eax
  1261. #endif
  1262. sarl $3, %eax
  1263. je .L142
  1264. ALIGN_4
  1265. .L141:
  1266. mulps %xmm0, %xmm2
  1267. addps %xmm2, %xmm4
  1268. movaps 4 * SIZE(BB), %xmm2
  1269. mulps %xmm0, %xmm2
  1270. movsd 2 * SIZE(AA), %xmm0
  1271. addps %xmm2, %xmm5
  1272. movaps 8 * SIZE(BB), %xmm2
  1273. mulps %xmm0, %xmm2
  1274. addps %xmm2, %xmm6
  1275. movaps 12 * SIZE(BB), %xmm2
  1276. mulps %xmm0, %xmm2
  1277. movsd 4 * SIZE(AA), %xmm0
  1278. addps %xmm2, %xmm7
  1279. movaps 32 * SIZE(BB), %xmm2
  1280. mulps %xmm0, %xmm3
  1281. addps %xmm3, %xmm4
  1282. movaps 20 * SIZE(BB), %xmm3
  1283. mulps %xmm0, %xmm3
  1284. movsd 6 * SIZE(AA), %xmm0
  1285. addps %xmm3, %xmm5
  1286. movaps 24 * SIZE(BB), %xmm3
  1287. mulps %xmm0, %xmm3
  1288. addps %xmm3, %xmm6
  1289. movaps 28 * SIZE(BB), %xmm3
  1290. mulps %xmm0, %xmm3
  1291. movsd 16 * SIZE(AA), %xmm0
  1292. addps %xmm3, %xmm7
  1293. movaps 48 * SIZE(BB), %xmm3
  1294. mulps %xmm1, %xmm2
  1295. addps %xmm2, %xmm4
  1296. movaps 36 * SIZE(BB), %xmm2
  1297. mulps %xmm1, %xmm2
  1298. movsd 10 * SIZE(AA), %xmm1
  1299. addps %xmm2, %xmm5
  1300. movaps 40 * SIZE(BB), %xmm2
  1301. mulps %xmm1, %xmm2
  1302. addps %xmm2, %xmm6
  1303. movaps 44 * SIZE(BB), %xmm2
  1304. mulps %xmm1, %xmm2
  1305. movsd 12 * SIZE(AA), %xmm1
  1306. addps %xmm2, %xmm7
  1307. movaps 64 * SIZE(BB), %xmm2
  1308. mulps %xmm1, %xmm3
  1309. addps %xmm3, %xmm4
  1310. movaps 52 * SIZE(BB), %xmm3
  1311. mulps %xmm1, %xmm3
  1312. movsd 14 * SIZE(AA), %xmm1
  1313. addps %xmm3, %xmm5
  1314. movaps 56 * SIZE(BB), %xmm3
  1315. mulps %xmm1, %xmm3
  1316. addps %xmm3, %xmm6
  1317. movaps 60 * SIZE(BB), %xmm3
  1318. mulps %xmm1, %xmm3
  1319. movsd 24 * SIZE(AA), %xmm1
  1320. addps %xmm3, %xmm7
  1321. movaps 80 * SIZE(BB), %xmm3
  1322. addl $ 16 * SIZE, AA
  1323. addl $ 64 * SIZE, BB
  1324. decl %eax
  1325. jne .L141
  1326. ALIGN_4
  1327. .L142:
  1328. #if defined(LT) || defined(RN)
  1329. movl KK, %eax
  1330. #else
  1331. movl K, %eax
  1332. subl KK, %eax
  1333. #endif
  1334. andl $7, %eax # if (k & 7)
  1335. BRANCH
  1336. je .L144
  1337. ALIGN_4
  1338. .L143:
  1339. mulps %xmm0, %xmm2
  1340. mulps 4 * SIZE(BB), %xmm0
  1341. addps %xmm2, %xmm4
  1342. movaps 8 * SIZE(BB), %xmm2
  1343. addps %xmm0, %xmm5
  1344. movsd 2 * SIZE(AA), %xmm0
  1345. addl $2 * SIZE, AA
  1346. addl $8 * SIZE, BB
  1347. decl %eax
  1348. jg .L143
  1349. ALIGN_4
  1350. .L144:
  1351. addps %xmm6, %xmm4
  1352. addps %xmm7, %xmm5
  1353. movaps POSINV, %xmm0
  1354. shufps $0xb1, %xmm5, %xmm5
  1355. #if defined(LN) || defined(LT)
  1356. #ifndef CONJ
  1357. xorps %xmm0, %xmm5
  1358. #else
  1359. xorps %xmm0, %xmm4
  1360. #endif
  1361. #else
  1362. xorps %xmm0, %xmm5
  1363. #endif
  1364. addps %xmm5, %xmm4
  1365. #if defined(LN) || defined(RT)
  1366. movl KK, %eax
  1367. subl $1, %eax
  1368. movl AORIG, AA
  1369. movl BORIG, B
  1370. leal BUFFER, BB
  1371. sall $ZBASE_SHIFT, %eax
  1372. addl %eax, AA
  1373. addl %eax, B
  1374. leal (BB, %eax, 4), BB
  1375. #endif
  1376. #if defined(LN) || defined(LT)
  1377. #ifdef movsd
  1378. xorps %xmm2, %xmm2
  1379. #endif
  1380. movsd 0 * SIZE(B), %xmm2
  1381. subps %xmm4, %xmm2
  1382. #else
  1383. #ifdef movsd
  1384. xorps %xmm1, %xmm1
  1385. #endif
  1386. movsd 0 * SIZE(AA), %xmm1
  1387. subps %xmm4, %xmm1
  1388. #endif
  1389. #if defined(LN) || defined(LT)
  1390. movaps 0 * SIZE(AA), %xmm5
  1391. pshufd $0x44, %xmm5, %xmm6
  1392. pshufd $0x11, %xmm5, %xmm7
  1393. pshufd $0xa0, %xmm2, %xmm4
  1394. pshufd $0xf5, %xmm2, %xmm2
  1395. #ifndef CONJ
  1396. xorps %xmm0, %xmm2
  1397. #else
  1398. xorps %xmm0, %xmm4
  1399. #endif
  1400. mulps %xmm6, %xmm4
  1401. mulps %xmm7, %xmm2
  1402. addps %xmm4, %xmm2
  1403. #endif
  1404. #if defined(RN) || defined(RT)
  1405. movaps 0 * SIZE(B), %xmm4
  1406. pshufd $0x44, %xmm4, %xmm6
  1407. pshufd $0x11, %xmm4, %xmm7
  1408. pshufd $0xa0, %xmm1, %xmm3
  1409. pshufd $0xf5, %xmm1, %xmm1
  1410. #ifndef CONJ
  1411. xorps %xmm0, %xmm1
  1412. #else
  1413. xorps %xmm0, %xmm3
  1414. #endif
  1415. mulps %xmm6, %xmm3
  1416. mulps %xmm7, %xmm1
  1417. addps %xmm3, %xmm1
  1418. #endif
  1419. #ifdef LN
  1420. subl $2 * SIZE, CO1
  1421. #endif
  1422. #if defined(LN) || defined(LT)
  1423. movlps %xmm2, 0 * SIZE(B)
  1424. pshufd $0x00, %xmm2, %xmm0
  1425. pshufd $0x55, %xmm2, %xmm1
  1426. movaps %xmm0, 0 * SIZE(BB)
  1427. movaps %xmm1, 4 * SIZE(BB)
  1428. movlps %xmm2, 0 * SIZE(CO1)
  1429. #else
  1430. movlps %xmm1, 0 * SIZE(AA)
  1431. movlps %xmm1, 0 * SIZE(CO1)
  1432. #endif
  1433. #ifndef LN
  1434. addl $2 * SIZE, CO1
  1435. #endif
  1436. #if defined(LT) || defined(RN)
  1437. movl K, %eax
  1438. subl KK, %eax
  1439. sall $ZBASE_SHIFT, %eax
  1440. addl %eax, AA
  1441. #ifdef LT
  1442. addl $2 * SIZE, B
  1443. #endif
  1444. #endif
  1445. #ifdef LN
  1446. subl $1, KK
  1447. movl BORIG, B
  1448. #endif
  1449. #ifdef LT
  1450. addl $1, KK
  1451. #endif
  1452. #ifdef RT
  1453. movl K, %eax
  1454. movl BORIG, B
  1455. sall $ZBASE_SHIFT, %eax
  1456. addl %eax, AORIG
  1457. #endif
  1458. ALIGN_4
  1459. .L130:
  1460. movl M, %ebx
  1461. sarl $1, %ebx
  1462. jle .L149
  1463. ALIGN_4
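# .L110: row-pair loop of the single-column tail; mirrors .L10 with one
# output column.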
  1464. .L110:
  1465. #ifdef LN
  1466. movl K, %eax
  1467. sall $1 + ZBASE_SHIFT, %eax
  1468. subl %eax, AORIG
  1469. #endif
  1470. #if defined(LN) || defined(RT)
  1471. movl AORIG, %eax
  1472. movl %eax, AA
  1473. movl KK, %eax
  1474. sall $1 + ZBASE_SHIFT, %eax
  1475. addl %eax, AA
  1476. #endif
  1477. leal BUFFER, BB # boffset1 = boffset
  1478. #if defined(LN) || defined(RT)
  1479. movl KK, %eax
  1480. sall $2 + ZBASE_SHIFT, %eax
  1481. addl %eax, BB
  1482. #endif
  1483. xorps %xmm4, %xmm4
  1484. xorps %xmm5, %xmm5
  1485. xorps %xmm6, %xmm6
  1486. xorps %xmm7, %xmm7
  1487. movaps 0 * SIZE(AA), %xmm0
  1488. movaps 16 * SIZE(AA), %xmm1
  1489. movaps 0 * SIZE(BB), %xmm2
  1490. movaps 16 * SIZE(BB), %xmm3
  1491. PREFETCHW -4 * SIZE(CO1)
  1492. #if defined(LT) || defined(RN)
  1493. movl KK, %eax
  1494. #else
  1495. movl K, %eax
  1496. subl KK, %eax
  1497. #endif
  1498. sarl $3, %eax
  1499. je .L112
  1500. ALIGN_4
  1501. .L111:
  1502. mulps %xmm0, %xmm2
  1503. addps %xmm2, %xmm4
  1504. movaps 4 * SIZE(BB), %xmm2
  1505. mulps %xmm0, %xmm2
  1506. movaps 4 * SIZE(AA), %xmm0
  1507. addps %xmm2, %xmm5
  1508. movaps 8 * SIZE(BB), %xmm2
  1509. mulps %xmm0, %xmm2
  1510. addps %xmm2, %xmm6
  1511. movaps 12 * SIZE(BB), %xmm2
  1512. mulps %xmm0, %xmm2
  1513. movaps 8 * SIZE(AA), %xmm0
  1514. addps %xmm2, %xmm7
  1515. movaps 32 * SIZE(BB), %xmm2
  1516. mulps %xmm0, %xmm3
  1517. addps %xmm3, %xmm4
  1518. movaps 20 * SIZE(BB), %xmm3
  1519. mulps %xmm0, %xmm3
  1520. movaps 12 * SIZE(AA), %xmm0
  1521. addps %xmm3, %xmm5
  1522. movaps 24 * SIZE(BB), %xmm3
  1523. mulps %xmm0, %xmm3
  1524. addps %xmm3, %xmm6
  1525. movaps 28 * SIZE(BB), %xmm3
  1526. mulps %xmm0, %xmm3
  1527. movaps 32 * SIZE(AA), %xmm0
  1528. addps %xmm3, %xmm7
  1529. movaps 48 * SIZE(BB), %xmm3
  1530. mulps %xmm1, %xmm2
  1531. addps %xmm2, %xmm4
  1532. movaps 36 * SIZE(BB), %xmm2
  1533. mulps %xmm1, %xmm2
  1534. movaps 20 * SIZE(AA), %xmm1
  1535. addps %xmm2, %xmm5
  1536. movaps 40 * SIZE(BB), %xmm2
  1537. mulps %xmm1, %xmm2
  1538. addps %xmm2, %xmm6
  1539. movaps 44 * SIZE(BB), %xmm2
  1540. mulps %xmm1, %xmm2
  1541. movaps 24 * SIZE(AA), %xmm1
  1542. addps %xmm2, %xmm7
  1543. movaps 64 * SIZE(BB), %xmm2
  1544. mulps %xmm1, %xmm3
  1545. addps %xmm3, %xmm4
  1546. movaps 52 * SIZE(BB), %xmm3
  1547. mulps %xmm1, %xmm3
  1548. movaps 28 * SIZE(AA), %xmm1
  1549. addps %xmm3, %xmm5
  1550. movaps 56 * SIZE(BB), %xmm3
  1551. mulps %xmm1, %xmm3
  1552. addps %xmm3, %xmm6
  1553. movaps 60 * SIZE(BB), %xmm3
  1554. mulps %xmm1, %xmm3
  1555. movaps 48 * SIZE(AA), %xmm1
  1556. addps %xmm3, %xmm7
  1557. movaps 80 * SIZE(BB), %xmm3
  1558. addl $ 32 * SIZE, AA
  1559. addl $ 64 * SIZE, BB
  1560. decl %eax
  1561. jne .L111
  1562. ALIGN_4
  1563. .L112:
  1564. #if defined(LT) || defined(RN)
  1565. movl KK, %eax
  1566. #else
  1567. movl K, %eax
  1568. subl KK, %eax
  1569. #endif
  1570. andl $7, %eax # if (k & 7)
  1571. BRANCH
  1572. je .L114
  1573. ALIGN_4
  1574. .L113:
  1575. mulps %xmm0, %xmm2
  1576. mulps 4 * SIZE(BB), %xmm0
  1577. addps %xmm2, %xmm4
  1578. movaps 8 * SIZE(BB), %xmm2
  1579. addps %xmm0, %xmm5
  1580. movaps 4 * SIZE(AA), %xmm0
  1581. addl $ 4 * SIZE, AA
  1582. addl $ 8 * SIZE, BB
  1583. decl %eax
  1584. jg .L113
  1585. ALIGN_4
  1586. .L114:
  1587. addps %xmm6, %xmm4
  1588. addps %xmm7, %xmm5
  1589. movaps POSINV, %xmm0
  1590. shufps $0xb1, %xmm5, %xmm5
  1591. #if defined(LN) || defined(LT)
  1592. #ifndef CONJ
  1593. xorps %xmm0, %xmm5
  1594. #else
  1595. xorps %xmm0, %xmm4
  1596. #endif
  1597. #else
  1598. xorps %xmm0, %xmm5
  1599. #endif
  1600. addps %xmm5, %xmm4
  1601. #if defined(LN) || defined(RT)
  1602. movl KK, %eax
  1603. #ifdef LN
  1604. subl $2, %eax
  1605. #else
  1606. subl $1, %eax
  1607. #endif
  1608. movl AORIG, AA
  1609. movl BORIG, B
  1610. leal BUFFER, BB
  1611. sall $ZBASE_SHIFT, %eax
  1612. leal (AA, %eax, 2), AA
  1613. leal (B, %eax, 1), B
  1614. leal (BB, %eax, 4), BB
  1615. #endif
  1616. #if defined(LN) || defined(LT)
  1617. movaps %xmm4, %xmm5
  1618. unpcklpd %xmm6, %xmm4
  1619. unpckhpd %xmm6, %xmm5
  1620. #ifdef movsd
  1621. xorps %xmm2, %xmm2
  1622. #endif
  1623. movsd 0 * SIZE(B), %xmm2
  1624. #ifdef movsd
  1625. xorps %xmm3, %xmm3
  1626. #endif
  1627. movsd 2 * SIZE(B), %xmm3
  1628. subps %xmm4, %xmm2
  1629. subps %xmm5, %xmm3
  1630. #else
  1631. movaps 0 * SIZE(AA), %xmm1
  1632. subps %xmm4, %xmm1
  1633. #endif
  1634. #ifdef LN
  1635. movaps 4 * SIZE(AA), %xmm5
  1636. pshufd $0xee, %xmm5, %xmm6
  1637. pshufd $0xbb, %xmm5, %xmm7
  1638. pshufd $0xa0, %xmm3, %xmm4
  1639. pshufd $0xf5, %xmm3, %xmm3
  1640. #ifndef CONJ
  1641. xorps %xmm0, %xmm3
  1642. #else
  1643. xorps %xmm0, %xmm4
  1644. #endif
  1645. mulps %xmm6, %xmm4
  1646. mulps %xmm7, %xmm3
  1647. addps %xmm4, %xmm3
  1648. pshufd $0x44, %xmm5, %xmm6
  1649. pshufd $0x11, %xmm5, %xmm7
  1650. pshufd $0xa0, %xmm3, %xmm4
  1651. pshufd $0xf5, %xmm3, %xmm1
  1652. #ifndef CONJ
  1653. xorps %xmm0, %xmm1
  1654. #else
  1655. xorps %xmm0, %xmm4
  1656. #endif
  1657. mulps %xmm6, %xmm4
  1658. mulps %xmm7, %xmm1
  1659. subps %xmm4, %xmm2
  1660. subps %xmm1, %xmm2
  1661. movaps 0 * SIZE(AA), %xmm5
  1662. pshufd $0x44, %xmm5, %xmm6
  1663. pshufd $0x11, %xmm5, %xmm7
  1664. pshufd $0xa0, %xmm2, %xmm4
  1665. pshufd $0xf5, %xmm2, %xmm2
  1666. #ifndef CONJ
  1667. xorps %xmm0, %xmm2
  1668. #else
  1669. xorps %xmm0, %xmm4
  1670. #endif
  1671. mulps %xmm6, %xmm4
  1672. mulps %xmm7, %xmm2
  1673. addps %xmm4, %xmm2
  1674. #endif
  1675. #ifdef LT
  1676. movaps 0 * SIZE(AA), %xmm5
  1677. pshufd $0x44, %xmm5, %xmm6
  1678. pshufd $0x11, %xmm5, %xmm7
  1679. pshufd $0xa0, %xmm2, %xmm4
  1680. pshufd $0xf5, %xmm2, %xmm2
  1681. #ifndef CONJ
  1682. xorps %xmm0, %xmm2
  1683. #else
  1684. xorps %xmm0, %xmm4
  1685. #endif
  1686. mulps %xmm6, %xmm4
  1687. mulps %xmm7, %xmm2
  1688. addps %xmm4, %xmm2
  1689. pshufd $0xee, %xmm5, %xmm6
  1690. pshufd $0xbb, %xmm5, %xmm7
  1691. pshufd $0xa0, %xmm2, %xmm4
  1692. pshufd $0xf5, %xmm2, %xmm1
  1693. #ifndef CONJ
  1694. xorps %xmm0, %xmm1
  1695. #else
  1696. xorps %xmm0, %xmm4
  1697. #endif
  1698. mulps %xmm6, %xmm4
  1699. mulps %xmm7, %xmm1
  1700. subps %xmm4, %xmm3
  1701. subps %xmm1, %xmm3
  1702. movaps 4 * SIZE(AA), %xmm5
  1703. pshufd $0xee, %xmm5, %xmm6
  1704. pshufd $0xbb, %xmm5, %xmm7
  1705. pshufd $0xa0, %xmm3, %xmm4
  1706. pshufd $0xf5, %xmm3, %xmm3
  1707. #ifndef CONJ
  1708. xorps %xmm0, %xmm3
  1709. #else
  1710. xorps %xmm0, %xmm4
  1711. #endif
  1712. mulps %xmm6, %xmm4
  1713. mulps %xmm7, %xmm3
  1714. addps %xmm4, %xmm3
  1715. #endif
  1716. #if defined(RN) || defined(RT)
  1717. movaps 0 * SIZE(B), %xmm4
  1718. pshufd $0x44, %xmm4, %xmm6
  1719. pshufd $0x11, %xmm4, %xmm7
  1720. pshufd $0xa0, %xmm1, %xmm3
  1721. pshufd $0xf5, %xmm1, %xmm1
  1722. #ifndef CONJ
  1723. xorps %xmm0, %xmm1
  1724. #else
  1725. xorps %xmm0, %xmm3
  1726. #endif
  1727. mulps %xmm6, %xmm3
  1728. mulps %xmm7, %xmm1
  1729. addps %xmm3, %xmm1
  1730. #endif
  1731. #ifdef LN
  1732. subl $4 * SIZE, CO1
  1733. #endif
  1734. #if defined(LN) || defined(LT)
  1735. movlps %xmm2, 0 * SIZE(B)
  1736. movlps %xmm3, 2 * SIZE(B)
  1737. pshufd $0x00, %xmm2, %xmm0
  1738. pshufd $0x55, %xmm2, %xmm1
  1739. movaps %xmm0, 0 * SIZE(BB)
  1740. movaps %xmm1, 4 * SIZE(BB)
  1741. pshufd $0x00, %xmm3, %xmm0
  1742. pshufd $0x55, %xmm3, %xmm1
  1743. movaps %xmm0, 8 * SIZE(BB)
  1744. movaps %xmm1, 12 * SIZE(BB)
  1745. movlps %xmm2, 0 * SIZE(CO1)
  1746. movlps %xmm3, 2 * SIZE(CO1)
  1747. #else
  1748. movaps %xmm1, 0 * SIZE(AA)
  1749. movlps %xmm1, 0 * SIZE(CO1)
  1750. movhps %xmm1, 2 * SIZE(CO1)
  1751. #endif
  1752. #ifndef LN
  1753. addl $4 * SIZE, CO1
  1754. #endif
  1755. #if defined(LT) || defined(RN)
  1756. movl K, %eax
  1757. subl KK, %eax
  1758. sall $1 + ZBASE_SHIFT, %eax
  1759. addl %eax, AA
  1760. #ifdef LT
  1761. addl $4 * SIZE, B
  1762. #endif
  1763. #endif
  1764. #ifdef LN
  1765. subl $2, KK
  1766. movl BORIG, B
  1767. #endif
  1768. #ifdef LT
  1769. addl $2, KK
  1770. #endif
  1771. #ifdef RT
  1772. movl K, %eax
  1773. movl BORIG, B
  1774. sall $1 + ZBASE_SHIFT, %eax
  1775. addl %eax, AORIG
  1776. #endif
  1777. decl %ebx # i --
  1778. jg .L110
  1779. ALIGN_4
  1780. .L149:
  1781. #ifdef LN
  1782. movl K, %eax
  1783. sall $ZBASE_SHIFT, %eax
  1784. addl %eax, B
  1785. #endif
  1786. #if defined(LT) || defined(RN)
  1787. movl K, %eax
  1788. subl KK, %eax
  1789. sall $ZBASE_SHIFT, %eax
  1790. addl %eax, B
  1791. #endif
  1792. #ifdef RN
  1793. addl $1, KK
  1794. #endif
  1795. #ifdef RT
  1796. subl $1, KK
  1797. #endif
  1798. ALIGN_4
  1799. .L999:
  1800. EMMS
  1801. movl OLD_STACK, %esp
  1802. popl %ebx
  1803. popl %esi
  1804. popl %edi
  1805. popl %ebp
  1806. ret
  1807. EPILOGUE