You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ztrsm_kernel_LT_4x1_sse.S 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined before common.h is included; presumably selects that header's assembler-side definitions (TODO confirm) */
  39. #include "common.h"
  40. #if !defined(HAVE_SSE) || !defined(HAVE_MMX) /* build guard: this kernel requires both HAVE_SSE and HAVE_MMX to be defined */
  41. #error You have to check your configuration.
  42. #endif
  43. #define STACK 16 /* bytes; matches the four 4-byte registers pushed by the prologue (%ebp, %edi, %esi, %ebx) */
  44. #define ARGS 0 /* extra displacement added to every incoming-argument offset; zero in this file */
  45. #define STACK_M 4 + STACK + ARGS(%esi) /* incoming argument copied into M; %esi holds %esp as saved right after the pushes */
  46. #define STACK_N 8 + STACK + ARGS(%esi) /* incoming argument copied into N */
  47. #define STACK_K 12 + STACK + ARGS(%esi) /* incoming argument copied into K */
  48. #define STACK_A 24 + STACK + ARGS(%esi) /* incoming argument copied into A; offsets 16 and 20 are not read by the visible code */
  49. #define STACK_B 28 + STACK + ARGS(%esi) /* incoming argument loaded into the B register alias (%edi) */
  50. #define STACK_C 32 + STACK + ARGS(%esi) /* incoming argument copied into C */
  51. #define STACK_LDC 36 + STACK + ARGS(%esi) /* incoming argument loaded into LDC, then shifted left by ZBASE_SHIFT */
  52. #define STACK_OFFT 40 + STACK + ARGS(%esi) /* incoming argument moved with movss into both OFFSET and KK */
  53. #define POSINV 0(%esp) /* 16-byte mask of 0x80000000/0x00000000 words (order depends on CONJ); xor-ed into lanes to flip signs */
  54. #define K 16(%esp) /* local slots on the realigned stack: copy of STACK_K */
  55. #define N 20(%esp) /* copy of STACK_N */
  56. #define M 24(%esp) /* copy of STACK_M */
  57. #define A 28(%esp) /* copy of STACK_A; advanced in place under LN */
  58. #define C 32(%esp) /* copy of STACK_C; stepped by LDC for each column handled */
  59. #define J 36(%esp) /* column counter, initialised to N ("j = n") */
  60. #define OLD_STACK 40(%esp) /* %esi as captured in the prologue, i.e. the stack pointer before realignment */
  61. #define OFFSET 48(%esp) /* copy of STACK_OFFT */
  62. #define KK 52(%esp) /* running offset derived from OFFSET; adjusted by 4 after each 4-wide block under LN/LT */
  63. #define KKK 56(%esp) /* not referenced in the visible portion of this file */
  64. #define AORIG 60(%esp) /* base pointer into A used by the LN/RT paths to recompute AA */
  65. #define BORIG 64(%esp) /* saved B pointer, restored into B by the LN/RT paths */
  66. #define BUFFER 128(%esp) /* scratch area where each scalar read from B is stored replicated across four lanes */
  67. #define B %edi /* register alias: read/write pointer into the B argument */
  68. #define LDC %ebp /* register alias: scaled stride added to (or subtracted from) C per column */
  69. #define AA %edx /* register alias: read pointer into A for the inner loops */
  70. #define BB %ecx /* register alias: pointer into BUFFER */
  71. #define CO1 %esi /* register alias: output pointer into C; shares %esi with the STACK_* macros, which are only read before CO1 is first set */
  72. #define STACK_ALIGN 4096 /* the prologue masks %esp with -STACK_ALIGN to align the local frame */
  73. #define STACK_OFFSET 1024 /* included in the frame size, then added back to %esp after the alignment mask */
  74. #if !defined(HAVE_SSE2) || defined(OPTERON) /* without SSE2, or on OPTERON, every movsd is assembled as movlps */
  75. #define movsd movlps /* later code tests "#ifdef movsd" and clears the destination register before the first such load */
  76. #endif
  77. #ifdef HAVE_SSE2 /* with SSE2 available, every xorps is assembled as pxor */
  78. #define xorps pxor
  79. #endif
  80. PROLOGUE
  81. pushl %ebp
  82. pushl %edi
  83. pushl %esi
  84. pushl %ebx
  85. PROFCODE
  86. movl %esp, %esi # save old stack
  87. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  88. andl $-STACK_ALIGN, %esp # align stack
  89. addl $STACK_OFFSET, %esp
  90. STACK_TOUCHING
  91. movl STACK_M, %ebx
  92. movl STACK_N, %eax
  93. movl STACK_K, %ecx
  94. movl STACK_A, %edx
  95. movl %ebx, M
  96. movl %eax, N
  97. movl %ecx, K
  98. movl %edx, A
  99. movl %esi, OLD_STACK
  100. movl STACK_B, %edi
  101. movl STACK_C, %ebx
  102. movss STACK_OFFT, %xmm4
  103. #ifndef CONJ
  104. movl $0x80000000, 0 + POSINV
  105. movl $0x00000000, 4 + POSINV
  106. movl $0x80000000, 8 + POSINV
  107. movl $0x00000000, 12 + POSINV
  108. #else
  109. movl $0x00000000, 0 + POSINV
  110. movl $0x80000000, 4 + POSINV
  111. movl $0x00000000, 8 + POSINV
  112. movl $0x80000000, 12 + POSINV
  113. #endif
  114. movl %ebx, C
  115. movl STACK_LDC, LDC
  116. movss %xmm4, OFFSET
  117. movss %xmm4, KK
  118. sall $ZBASE_SHIFT, LDC
  119. #ifdef LN
  120. movl M, %eax
  121. sall $ZBASE_SHIFT, %eax
  122. addl %eax, C
  123. imull K, %eax
  124. addl %eax, A
  125. #endif
  126. #ifdef RT
  127. movl N, %eax
  128. sall $ZBASE_SHIFT, %eax
  129. imull K, %eax
  130. addl %eax, B
  131. movl N, %eax
  132. imull LDC, %eax
  133. addl %eax, C
  134. #endif
  135. #ifdef RN
  136. negl KK
  137. #endif
  138. #ifdef RT
  139. movl N, %eax
  140. subl OFFSET, %eax
  141. movl %eax, KK
  142. #endif
  143. movl N, %eax
  144. movl %eax, J # j = n
  145. testl %eax, %eax
  146. jle .L999
  147. .L01:
  148. #ifdef LN
  149. movl OFFSET, %eax
  150. addl M, %eax
  151. movl %eax, KK
  152. #endif
  153. leal BUFFER, BB
  154. #ifdef RT
  155. movl K, %eax
  156. sall $ZBASE_SHIFT, %eax
  157. subl %eax, B
  158. #endif
  159. #if defined(LN) || defined(RT)
  160. movl KK, %eax
  161. movl B, BORIG
  162. sall $ZBASE_SHIFT, %eax
  163. addl %eax, B
  164. leal (BB, %eax, 4), BB
  165. #endif
  166. #if defined(LT)
  167. movl OFFSET, %eax
  168. movl %eax, KK
  169. #endif
  170. #if defined(LT) || defined(RN)
  171. movl KK, %eax
  172. #else
  173. movl K, %eax
  174. subl KK, %eax
  175. #endif
  176. sarl $2, %eax
  177. jle .L03
  178. .L02:
  179. movss 0 * SIZE(B), %xmm0
  180. movss 1 * SIZE(B), %xmm1
  181. movss 2 * SIZE(B), %xmm2
  182. movss 3 * SIZE(B), %xmm3
  183. shufps $0, %xmm0, %xmm0
  184. shufps $0, %xmm1, %xmm1
  185. shufps $0, %xmm2, %xmm2
  186. shufps $0, %xmm3, %xmm3
  187. movaps %xmm0, 0 * SIZE(BB)
  188. movaps %xmm1, 4 * SIZE(BB)
  189. movaps %xmm2, 8 * SIZE(BB)
  190. movaps %xmm3, 12 * SIZE(BB)
  191. movss 4 * SIZE(B), %xmm0
  192. movss 5 * SIZE(B), %xmm1
  193. movss 6 * SIZE(B), %xmm2
  194. movss 7 * SIZE(B), %xmm3
  195. shufps $0, %xmm0, %xmm0
  196. shufps $0, %xmm1, %xmm1
  197. shufps $0, %xmm2, %xmm2
  198. shufps $0, %xmm3, %xmm3
  199. movaps %xmm0, 16 * SIZE(BB)
  200. movaps %xmm1, 20 * SIZE(BB)
  201. movaps %xmm2, 24 * SIZE(BB)
  202. movaps %xmm3, 28 * SIZE(BB)
  203. prefetcht0 104 * SIZE(B)
  204. addl $ 8 * SIZE, B
  205. addl $32 * SIZE, BB
  206. decl %eax
  207. jne .L02
  208. .L03:
  209. #if defined(LT) || defined(RN)
  210. movl KK, %eax
  211. #else
  212. movl K, %eax
  213. subl KK, %eax
  214. #endif
  215. andl $3, %eax
  216. BRANCH
  217. jle .L05
  218. .L04:
  219. movss 0 * SIZE(B), %xmm0
  220. movss 1 * SIZE(B), %xmm1
  221. shufps $0, %xmm0, %xmm0
  222. shufps $0, %xmm1, %xmm1
  223. movaps %xmm0, 0 * SIZE(BB)
  224. movaps %xmm1, 4 * SIZE(BB)
  225. addl $2 * SIZE, B
  226. addl $8 * SIZE, BB
  227. decl %eax
  228. jne .L04
  229. ALIGN_4
  230. .L05:
  231. #if defined(LT) || defined(RN)
  232. movl A, %eax
  233. movl %eax, AA
  234. #else
  235. movl A, %eax
  236. movl %eax, AORIG
  237. #endif
  238. #ifdef RT
  239. subl LDC, C
  240. #endif
  241. movl C, CO1
  242. #ifndef RT
  243. addl LDC, C
  244. #endif
  245. movl M, %ebx
  246. sarl $2, %ebx
  247. jle .L50
  248. ALIGN_4
  249. .L10:
  250. #ifdef LN
  251. movl K, %eax
  252. sall $2 + ZBASE_SHIFT, %eax
  253. subl %eax, AORIG
  254. #endif
  255. #if defined(LN) || defined(RT)
  256. movl AORIG, %eax
  257. movl %eax, AA
  258. movl KK, %eax
  259. sall $2 + ZBASE_SHIFT, %eax
  260. addl %eax, AA
  261. #endif
  262. leal BUFFER, BB
  263. #if defined(LN) || defined(RT)
  264. movl KK, %eax
  265. sall $2 + ZBASE_SHIFT, %eax
  266. addl %eax, BB
  267. #endif
  268. movaps 0 * SIZE(BB), %xmm2
  269. xorps %xmm4, %xmm4
  270. movaps 0 * SIZE(AA), %xmm0
  271. xorps %xmm5, %xmm5
  272. movaps 8 * SIZE(BB), %xmm3
  273. xorps %xmm6, %xmm6
  274. movaps 8 * SIZE(AA), %xmm1
  275. xorps %xmm7, %xmm7
  276. #if defined(LT) || defined(RN)
  277. movl KK, %eax
  278. #else
  279. movl K, %eax
  280. subl KK, %eax
  281. #endif
  282. sarl $3, %eax
  283. prefetcht0 8 * SIZE(CO1)
  284. je .L12
  285. ALIGN_4
  286. #define PREFETCHSIZE 48
  287. .L11:
  288. #ifdef CORE_KATMAI
  289. prefetcht0 PREFETCHSIZE * SIZE(AA)
  290. #endif
  291. mulps %xmm0, %xmm2
  292. mulps 4 * SIZE(BB), %xmm0
  293. addps %xmm2, %xmm4
  294. movaps 0 * SIZE(BB), %xmm2
  295. addps %xmm0, %xmm5
  296. movaps 4 * SIZE(AA), %xmm0
  297. mulps %xmm0, %xmm2
  298. mulps 4 * SIZE(BB), %xmm0
  299. addps %xmm2, %xmm6
  300. movaps 16 * SIZE(BB), %xmm2
  301. addps %xmm0, %xmm7
  302. movaps 16 * SIZE(AA), %xmm0
  303. #ifdef CORE_KATMAI
  304. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
  305. #endif
  306. mulps %xmm1, %xmm3
  307. mulps 12 * SIZE(BB), %xmm1
  308. addps %xmm3, %xmm4
  309. movaps 8 * SIZE(BB), %xmm3
  310. addps %xmm1, %xmm5
  311. movaps 12 * SIZE(AA), %xmm1
  312. mulps %xmm1, %xmm3
  313. mulps 12 * SIZE(BB), %xmm1
  314. addps %xmm3, %xmm6
  315. movaps 24 * SIZE(BB), %xmm3
  316. addps %xmm1, %xmm7
  317. movaps 24 * SIZE(AA), %xmm1
  318. #ifdef CORE_KATMAI
  319. prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
  320. #endif
  321. mulps %xmm0, %xmm2
  322. mulps 20 * SIZE(BB), %xmm0
  323. addps %xmm2, %xmm4
  324. movaps 16 * SIZE(BB), %xmm2
  325. addps %xmm0, %xmm5
  326. movaps 20 * SIZE(AA), %xmm0
  327. mulps %xmm0, %xmm2
  328. mulps 20 * SIZE(BB), %xmm0
  329. addps %xmm2, %xmm6
  330. movaps 32 * SIZE(BB), %xmm2
  331. addps %xmm0, %xmm7
  332. movaps 32 * SIZE(AA), %xmm0
  333. #ifdef CORE_KATMAI
  334. prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA)
  335. #endif
  336. mulps %xmm1, %xmm3
  337. mulps 28 * SIZE(BB), %xmm1
  338. addps %xmm3, %xmm4
  339. movaps 24 * SIZE(BB), %xmm3
  340. addps %xmm1, %xmm5
  341. movaps 28 * SIZE(AA), %xmm1
  342. mulps %xmm1, %xmm3
  343. mulps 28 * SIZE(BB), %xmm1
  344. addps %xmm3, %xmm6
  345. movaps 40 * SIZE(BB), %xmm3
  346. addps %xmm1, %xmm7
  347. movaps 40 * SIZE(AA), %xmm1
  348. #ifdef CORE_KATMAI
  349. prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA)
  350. #endif
  351. mulps %xmm0, %xmm2
  352. mulps 36 * SIZE(BB), %xmm0
  353. addps %xmm2, %xmm4
  354. movaps 32 * SIZE(BB), %xmm2
  355. addps %xmm0, %xmm5
  356. movaps 36 * SIZE(AA), %xmm0
  357. mulps %xmm0, %xmm2
  358. mulps 36 * SIZE(BB), %xmm0
  359. addps %xmm2, %xmm6
  360. movaps 48 * SIZE(BB), %xmm2
  361. addps %xmm0, %xmm7
  362. movaps 48 * SIZE(AA), %xmm0
  363. #ifdef CORE_KATMAI
  364. prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA)
  365. #endif
  366. mulps %xmm1, %xmm3
  367. mulps 44 * SIZE(BB), %xmm1
  368. addps %xmm3, %xmm4
  369. movaps 40 * SIZE(BB), %xmm3
  370. addps %xmm1, %xmm5
  371. movaps 44 * SIZE(AA), %xmm1
  372. mulps %xmm1, %xmm3
  373. mulps 44 * SIZE(BB), %xmm1
  374. addps %xmm3, %xmm6
  375. movaps 56 * SIZE(BB), %xmm3
  376. addps %xmm1, %xmm7
  377. movaps 56 * SIZE(AA), %xmm1
  378. #ifdef CORE_KATMAI
  379. prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA)
  380. #endif
  381. mulps %xmm0, %xmm2
  382. mulps 52 * SIZE(BB), %xmm0
  383. addps %xmm2, %xmm4
  384. movaps 48 * SIZE(BB), %xmm2
  385. addps %xmm0, %xmm5
  386. movaps 52 * SIZE(AA), %xmm0
  387. mulps %xmm0, %xmm2
  388. mulps 52 * SIZE(BB), %xmm0
  389. addps %xmm2, %xmm6
  390. movaps 64 * SIZE(BB), %xmm2
  391. addps %xmm0, %xmm7
  392. movaps 64 * SIZE(AA), %xmm0
  393. #ifdef CORE_KATMAI
  394. prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA)
  395. #endif
  396. mulps %xmm1, %xmm3
  397. mulps 60 * SIZE(BB), %xmm1
  398. addps %xmm3, %xmm4
  399. movaps 56 * SIZE(BB), %xmm3
  400. addps %xmm1, %xmm5
  401. movaps 60 * SIZE(AA), %xmm1
  402. mulps %xmm1, %xmm3
  403. mulps 60 * SIZE(BB), %xmm1
  404. addps %xmm3, %xmm6
  405. movaps 72 * SIZE(BB), %xmm3
  406. addps %xmm1, %xmm7
  407. movaps 72 * SIZE(AA), %xmm1
  408. addl $64 * SIZE, BB
  409. addl $64 * SIZE, AA
  410. decl %eax
  411. jne .L11
  412. .L12:
  413. #if defined(LT) || defined(RN)
  414. movl KK, %eax
  415. #else
  416. movl K, %eax
  417. subl KK, %eax
  418. #endif
  419. andl $7, %eax # if (k & 1)
  420. BRANCH
  421. je .L14
  422. .L13:
  423. mulps %xmm0, %xmm2
  424. mulps 4 * SIZE(BB), %xmm0
  425. addps %xmm2, %xmm4
  426. movaps 0 * SIZE(BB), %xmm2
  427. addps %xmm0, %xmm5
  428. movaps 4 * SIZE(AA), %xmm0
  429. mulps %xmm0, %xmm2
  430. mulps 4 * SIZE(BB), %xmm0
  431. addps %xmm2, %xmm6
  432. movaps 8 * SIZE(BB), %xmm2
  433. addps %xmm0, %xmm7
  434. movaps 8 * SIZE(AA), %xmm0
  435. addl $8 * SIZE, AA # aoffset += 8
  436. addl $8 * SIZE, BB # boffset1 += 8
  437. decl %eax
  438. jg .L13
  439. .L14:
  440. movaps POSINV, %xmm0
  441. shufps $0xb1, %xmm5, %xmm5
  442. shufps $0xb1, %xmm7, %xmm7
  443. #if defined(LN) || defined(LT)
  444. #ifndef CONJ
  445. xorps %xmm0, %xmm5
  446. xorps %xmm0, %xmm7
  447. #else
  448. xorps %xmm0, %xmm4
  449. xorps %xmm0, %xmm6
  450. #endif
  451. #else
  452. xorps %xmm0, %xmm5
  453. xorps %xmm0, %xmm7
  454. #endif
  455. addps %xmm5, %xmm4
  456. addps %xmm7, %xmm6
  457. #if defined(LN) || defined(RT)
  458. movl KK, %eax
  459. #ifdef LN
  460. subl $4, %eax
  461. #else
  462. subl $1, %eax
  463. #endif
  464. movl AORIG, AA
  465. movl BORIG, B
  466. leal BUFFER, BB
  467. sall $ZBASE_SHIFT, %eax
  468. leal (AA, %eax, 4), AA
  469. leal (B, %eax, 1), B
  470. leal (BB, %eax, 4), BB
  471. #endif
  472. #if defined(LN) || defined(LT)
  473. movsd 0 * SIZE(B), %xmm5
  474. movhps 2 * SIZE(B), %xmm5
  475. movsd 4 * SIZE(B), %xmm7
  476. movhps 6 * SIZE(B), %xmm7
  477. #else
  478. movaps 0 * SIZE(AA), %xmm5
  479. movaps 4 * SIZE(AA), %xmm7
  480. #endif
  481. subps %xmm4, %xmm5
  482. subps %xmm6, %xmm7
  483. #if defined(LN) || defined(LT)
  484. movhlps %xmm5, %xmm4
  485. movhlps %xmm7, %xmm6
  486. #endif
  487. #ifdef LN
  488. #ifdef movsd
  489. xorps %xmm1, %xmm1
  490. #endif
  491. movsd 30 * SIZE(AA), %xmm1
  492. movaps %xmm1, %xmm0
  493. shufps $0x44, %xmm0, %xmm0
  494. shufps $0x11, %xmm1, %xmm1
  495. movaps %xmm6, %xmm3
  496. shufps $0xa0, %xmm3, %xmm3
  497. shufps $0xf5, %xmm6, %xmm6
  498. #ifndef CONJ
  499. xorps POSINV, %xmm6
  500. #else
  501. xorps POSINV, %xmm3
  502. #endif
  503. mulps %xmm0, %xmm3
  504. mulps %xmm1, %xmm6
  505. addps %xmm3, %xmm6
  506. movsd 28 * SIZE(AA), %xmm1
  507. movaps %xmm1, %xmm0
  508. shufps $0x44, %xmm0, %xmm0
  509. shufps $0x11, %xmm1, %xmm1
  510. movaps %xmm6, %xmm2
  511. shufps $0xa0, %xmm2, %xmm2
  512. movaps %xmm6, %xmm3
  513. shufps $0xf5, %xmm3, %xmm3
  514. #ifndef CONJ
  515. xorps POSINV, %xmm3
  516. #else
  517. xorps POSINV, %xmm2
  518. #endif
  519. mulps %xmm0, %xmm2
  520. mulps %xmm1, %xmm3
  521. subps %xmm2, %xmm7
  522. subps %xmm3, %xmm7
  523. movsd 26 * SIZE(AA), %xmm1
  524. movaps %xmm1, %xmm0
  525. shufps $0x44, %xmm0, %xmm0
  526. shufps $0x11, %xmm1, %xmm1
  527. movaps %xmm6, %xmm2
  528. shufps $0xa0, %xmm2, %xmm2
  529. movaps %xmm6, %xmm3
  530. shufps $0xf5, %xmm3, %xmm3
  531. #ifndef CONJ
  532. xorps POSINV, %xmm3
  533. #else
  534. xorps POSINV, %xmm2
  535. #endif
  536. mulps %xmm0, %xmm2
  537. mulps %xmm1, %xmm3
  538. subps %xmm2, %xmm4
  539. subps %xmm3, %xmm4
  540. movsd 24 * SIZE(AA), %xmm1
  541. movaps %xmm1, %xmm0
  542. shufps $0x44, %xmm0, %xmm0
  543. shufps $0x11, %xmm1, %xmm1
  544. movaps %xmm6, %xmm2
  545. shufps $0xa0, %xmm2, %xmm2
  546. movaps %xmm6, %xmm3
  547. shufps $0xf5, %xmm3, %xmm3
  548. #ifndef CONJ
  549. xorps POSINV, %xmm3
  550. #else
  551. xorps POSINV, %xmm2
  552. #endif
  553. mulps %xmm0, %xmm2
  554. mulps %xmm1, %xmm3
  555. subps %xmm2, %xmm5
  556. subps %xmm3, %xmm5
  557. movsd 20 * SIZE(AA), %xmm1
  558. movaps %xmm1, %xmm0
  559. shufps $0x44, %xmm0, %xmm0
  560. shufps $0x11, %xmm1, %xmm1
  561. movaps %xmm7, %xmm3
  562. shufps $0xa0, %xmm3, %xmm3
  563. shufps $0xf5, %xmm7, %xmm7
  564. #ifndef CONJ
  565. xorps POSINV, %xmm7
  566. #else
  567. xorps POSINV, %xmm3
  568. #endif
  569. mulps %xmm0, %xmm3
  570. mulps %xmm1, %xmm7
  571. addps %xmm3, %xmm7
  572. movsd 18 * SIZE(AA), %xmm1
  573. movaps %xmm1, %xmm0
  574. shufps $0x44, %xmm0, %xmm0
  575. shufps $0x11, %xmm1, %xmm1
  576. movaps %xmm7, %xmm2
  577. shufps $0xa0, %xmm2, %xmm2
  578. movaps %xmm7, %xmm3
  579. shufps $0xf5, %xmm3, %xmm3
  580. #ifndef CONJ
  581. xorps POSINV, %xmm3
  582. #else
  583. xorps POSINV, %xmm2
  584. #endif
  585. mulps %xmm0, %xmm2
  586. mulps %xmm1, %xmm3
  587. subps %xmm2, %xmm4
  588. subps %xmm3, %xmm4
  589. movsd 16 * SIZE(AA), %xmm1
  590. movaps %xmm1, %xmm0
  591. shufps $0x44, %xmm0, %xmm0
  592. shufps $0x11, %xmm1, %xmm1
  593. movaps %xmm7, %xmm2
  594. shufps $0xa0, %xmm2, %xmm2
  595. movaps %xmm7, %xmm3
  596. shufps $0xf5, %xmm3, %xmm3
  597. #ifndef CONJ
  598. xorps POSINV, %xmm3
  599. #else
  600. xorps POSINV, %xmm2
  601. #endif
  602. mulps %xmm0, %xmm2
  603. mulps %xmm1, %xmm3
  604. subps %xmm2, %xmm5
  605. subps %xmm3, %xmm5
  606. movsd 10 * SIZE(AA), %xmm1
  607. movaps %xmm1, %xmm0
  608. shufps $0x44, %xmm0, %xmm0
  609. shufps $0x11, %xmm1, %xmm1
  610. movaps %xmm4, %xmm3
  611. shufps $0xa0, %xmm3, %xmm3
  612. shufps $0xf5, %xmm4, %xmm4
  613. #ifndef CONJ
  614. xorps POSINV, %xmm4
  615. #else
  616. xorps POSINV, %xmm3
  617. #endif
  618. mulps %xmm0, %xmm3
  619. mulps %xmm1, %xmm4
  620. addps %xmm3, %xmm4
  621. movsd 8 * SIZE(AA), %xmm1
  622. movaps %xmm1, %xmm0
  623. shufps $0x44, %xmm0, %xmm0
  624. shufps $0x11, %xmm1, %xmm1
  625. movaps %xmm4, %xmm2
  626. shufps $0xa0, %xmm2, %xmm2
  627. movaps %xmm4, %xmm3
  628. shufps $0xf5, %xmm3, %xmm3
  629. #ifndef CONJ
  630. xorps POSINV, %xmm3
  631. #else
  632. xorps POSINV, %xmm2
  633. #endif
  634. mulps %xmm0, %xmm2
  635. mulps %xmm1, %xmm3
  636. subps %xmm2, %xmm5
  637. subps %xmm3, %xmm5
  638. movsd 0 * SIZE(AA), %xmm1
  639. movaps %xmm1, %xmm0
  640. shufps $0x44, %xmm0, %xmm0
  641. shufps $0x11, %xmm1, %xmm1
  642. movaps %xmm5, %xmm3
  643. shufps $0xa0, %xmm3, %xmm3
  644. shufps $0xf5, %xmm5, %xmm5
  645. #ifndef CONJ
  646. xorps POSINV, %xmm5
  647. #else
  648. xorps POSINV, %xmm3
  649. #endif
  650. mulps %xmm0, %xmm3
  651. mulps %xmm1, %xmm5
  652. addps %xmm3, %xmm5
  653. #endif
  654. #ifdef LT
  655. #ifdef movsd
  656. xorps %xmm1, %xmm1
  657. #endif
  658. movsd 0 * SIZE(AA), %xmm1
  659. movaps %xmm1, %xmm0
  660. shufps $0x44, %xmm0, %xmm0
  661. shufps $0x11, %xmm1, %xmm1
  662. movaps %xmm5, %xmm3
  663. shufps $0xa0, %xmm3, %xmm3
  664. shufps $0xf5, %xmm5, %xmm5
  665. #ifndef CONJ
  666. xorps POSINV, %xmm5
  667. #else
  668. xorps POSINV, %xmm3
  669. #endif
  670. mulps %xmm0, %xmm3
  671. mulps %xmm1, %xmm5
  672. addps %xmm3, %xmm5
  673. movsd 2 * SIZE(AA), %xmm1
  674. movaps %xmm1, %xmm0
  675. shufps $0x44, %xmm0, %xmm0
  676. shufps $0x11, %xmm1, %xmm1
  677. movaps %xmm5, %xmm2
  678. shufps $0xa0, %xmm2, %xmm2
  679. movaps %xmm5, %xmm3
  680. shufps $0xf5, %xmm3, %xmm3
  681. #ifndef CONJ
  682. xorps POSINV, %xmm3
  683. #else
  684. xorps POSINV, %xmm2
  685. #endif
  686. mulps %xmm0, %xmm2
  687. mulps %xmm1, %xmm3
  688. subps %xmm2, %xmm4
  689. subps %xmm3, %xmm4
  690. movsd 4 * SIZE(AA), %xmm1
  691. movaps %xmm1, %xmm0
  692. shufps $0x44, %xmm0, %xmm0
  693. shufps $0x11, %xmm1, %xmm1
  694. movaps %xmm5, %xmm2
  695. shufps $0xa0, %xmm2, %xmm2
  696. movaps %xmm5, %xmm3
  697. shufps $0xf5, %xmm3, %xmm3
  698. #ifndef CONJ
  699. xorps POSINV, %xmm3
  700. #else
  701. xorps POSINV, %xmm2
  702. #endif
  703. mulps %xmm0, %xmm2
  704. mulps %xmm1, %xmm3
  705. subps %xmm2, %xmm7
  706. subps %xmm3, %xmm7
  707. movsd 6 * SIZE(AA), %xmm1
  708. movaps %xmm1, %xmm0
  709. shufps $0x44, %xmm0, %xmm0
  710. shufps $0x11, %xmm1, %xmm1
  711. movaps %xmm5, %xmm2
  712. shufps $0xa0, %xmm2, %xmm2
  713. movaps %xmm5, %xmm3
  714. shufps $0xf5, %xmm3, %xmm3
  715. #ifndef CONJ
  716. xorps POSINV, %xmm3
  717. #else
  718. xorps POSINV, %xmm2
  719. #endif
  720. mulps %xmm0, %xmm2
  721. mulps %xmm1, %xmm3
  722. subps %xmm2, %xmm6
  723. subps %xmm3, %xmm6
  724. movsd 10 * SIZE(AA), %xmm1
  725. movaps %xmm1, %xmm0
  726. shufps $0x44, %xmm0, %xmm0
  727. shufps $0x11, %xmm1, %xmm1
  728. movaps %xmm4, %xmm3
  729. shufps $0xa0, %xmm3, %xmm3
  730. shufps $0xf5, %xmm4, %xmm4
  731. #ifndef CONJ
  732. xorps POSINV, %xmm4
  733. #else
  734. xorps POSINV, %xmm3
  735. #endif
  736. mulps %xmm0, %xmm3
  737. mulps %xmm1, %xmm4
  738. addps %xmm3, %xmm4
  739. movsd 12 * SIZE(AA), %xmm1
  740. movaps %xmm1, %xmm0
  741. shufps $0x44, %xmm0, %xmm0
  742. shufps $0x11, %xmm1, %xmm1
  743. movaps %xmm4, %xmm2
  744. shufps $0xa0, %xmm2, %xmm2
  745. movaps %xmm4, %xmm3
  746. shufps $0xf5, %xmm3, %xmm3
  747. #ifndef CONJ
  748. xorps POSINV, %xmm3
  749. #else
  750. xorps POSINV, %xmm2
  751. #endif
  752. mulps %xmm0, %xmm2
  753. mulps %xmm1, %xmm3
  754. subps %xmm2, %xmm7
  755. subps %xmm3, %xmm7
  756. movsd 14 * SIZE(AA), %xmm1
  757. movaps %xmm1, %xmm0
  758. shufps $0x44, %xmm0, %xmm0
  759. shufps $0x11, %xmm1, %xmm1
  760. movaps %xmm4, %xmm2
  761. shufps $0xa0, %xmm2, %xmm2
  762. movaps %xmm4, %xmm3
  763. shufps $0xf5, %xmm3, %xmm3
  764. #ifndef CONJ
  765. xorps POSINV, %xmm3
  766. #else
  767. xorps POSINV, %xmm2
  768. #endif
  769. mulps %xmm0, %xmm2
  770. mulps %xmm1, %xmm3
  771. subps %xmm2, %xmm6
  772. subps %xmm3, %xmm6
  773. movsd 20 * SIZE(AA), %xmm1
  774. movaps %xmm1, %xmm0
  775. shufps $0x44, %xmm0, %xmm0
  776. shufps $0x11, %xmm1, %xmm1
  777. movaps %xmm7, %xmm3
  778. shufps $0xa0, %xmm3, %xmm3
  779. shufps $0xf5, %xmm7, %xmm7
  780. #ifndef CONJ
  781. xorps POSINV, %xmm7
  782. #else
  783. xorps POSINV, %xmm3
  784. #endif
  785. mulps %xmm0, %xmm3
  786. mulps %xmm1, %xmm7
  787. addps %xmm3, %xmm7
  788. movsd 22 * SIZE(AA), %xmm1
  789. movaps %xmm1, %xmm0
  790. shufps $0x44, %xmm0, %xmm0
  791. shufps $0x11, %xmm1, %xmm1
  792. movaps %xmm7, %xmm2
  793. shufps $0xa0, %xmm2, %xmm2
  794. movaps %xmm7, %xmm3
  795. shufps $0xf5, %xmm3, %xmm3
  796. #ifndef CONJ
  797. xorps POSINV, %xmm3
  798. #else
  799. xorps POSINV, %xmm2
  800. #endif
  801. mulps %xmm0, %xmm2
  802. mulps %xmm1, %xmm3
  803. subps %xmm2, %xmm6
  804. subps %xmm3, %xmm6
  805. movsd 30 * SIZE(AA), %xmm1
  806. movaps %xmm1, %xmm0
  807. shufps $0x44, %xmm0, %xmm0
  808. shufps $0x11, %xmm1, %xmm1
  809. movaps %xmm6, %xmm3
  810. shufps $0xa0, %xmm3, %xmm3
  811. shufps $0xf5, %xmm6, %xmm6
  812. #ifndef CONJ
  813. xorps POSINV, %xmm6
  814. #else
  815. xorps POSINV, %xmm3
  816. #endif
  817. mulps %xmm0, %xmm3
  818. mulps %xmm1, %xmm6
  819. addps %xmm3, %xmm6
  820. #endif
  821. #if defined(RN) || defined(RT)
  822. movsd 0 * SIZE(B), %xmm1
  823. movhps 2 * SIZE(B), %xmm1
  824. #ifdef HAVE_SSE2
  825. pshufd $0x44, %xmm1, %xmm2
  826. pshufd $0x11, %xmm1, %xmm3
  827. pshufd $0xa0, %xmm5, %xmm4
  828. pshufd $0xf5, %xmm5, %xmm5
  829. pshufd $0xa0, %xmm7, %xmm6
  830. pshufd $0xf5, %xmm7, %xmm7
  831. #else
  832. movaps %xmm1, %xmm2
  833. shufps $0x44, %xmm2, %xmm2
  834. movaps %xmm1, %xmm3
  835. shufps $0x11, %xmm3, %xmm3
  836. movaps %xmm5, %xmm4
  837. shufps $0xa0, %xmm4, %xmm4
  838. shufps $0xf5, %xmm5, %xmm5
  839. movaps %xmm7, %xmm6
  840. shufps $0xa0, %xmm6, %xmm6
  841. shufps $0xf5, %xmm7, %xmm7
  842. #endif
  843. #ifndef CONJ
  844. xorps %xmm0, %xmm5
  845. xorps %xmm0, %xmm7
  846. #else
  847. xorps %xmm0, %xmm4
  848. xorps %xmm0, %xmm6
  849. #endif
  850. mulps %xmm2, %xmm4
  851. mulps %xmm3, %xmm5
  852. mulps %xmm2, %xmm6
  853. mulps %xmm3, %xmm7
  854. addps %xmm4, %xmm5
  855. addps %xmm6, %xmm7
  856. #endif
  857. #ifdef LN
  858. subl $8 * SIZE, CO1
  859. #endif
  860. #if defined(LN) || defined(LT)
  861. movlhps %xmm4, %xmm5
  862. movlhps %xmm6, %xmm7
  863. movlps %xmm5, 0 * SIZE(B)
  864. movhps %xmm5, 2 * SIZE(B)
  865. movlps %xmm7, 4 * SIZE(B)
  866. movhps %xmm7, 6 * SIZE(B)
  867. #ifdef HAVE_SSE2
  868. pshufd $0x00, %xmm5, %xmm0
  869. pshufd $0x55, %xmm5, %xmm1
  870. pshufd $0xaa, %xmm5, %xmm2
  871. pshufd $0xff, %xmm5, %xmm3
  872. #else
  873. movaps %xmm5, %xmm0
  874. shufps $0x00, %xmm0, %xmm0
  875. movaps %xmm5, %xmm1
  876. shufps $0x55, %xmm1, %xmm1
  877. movaps %xmm5, %xmm2
  878. shufps $0xaa, %xmm2, %xmm2
  879. movaps %xmm5, %xmm3
  880. shufps $0xff, %xmm3, %xmm3
  881. #endif
  882. movaps %xmm0, 0 * SIZE(BB)
  883. movaps %xmm1, 4 * SIZE(BB)
  884. movaps %xmm2, 8 * SIZE(BB)
  885. movaps %xmm3, 12 * SIZE(BB)
  886. #ifdef HAVE_SSE2
  887. pshufd $0x00, %xmm7, %xmm0
  888. pshufd $0x55, %xmm7, %xmm1
  889. pshufd $0xaa, %xmm7, %xmm2
  890. pshufd $0xff, %xmm7, %xmm3
  891. #else
  892. movaps %xmm7, %xmm0
  893. shufps $0x00, %xmm0, %xmm0
  894. movaps %xmm7, %xmm1
  895. shufps $0x55, %xmm1, %xmm1
  896. movaps %xmm7, %xmm2
  897. shufps $0xaa, %xmm2, %xmm2
  898. movaps %xmm7, %xmm3
  899. shufps $0xff, %xmm3, %xmm3
  900. #endif
  901. movaps %xmm0, 16 * SIZE(BB)
  902. movaps %xmm1, 20 * SIZE(BB)
  903. movaps %xmm2, 24 * SIZE(BB)
  904. movaps %xmm3, 28 * SIZE(BB)
  905. #else
  906. movaps %xmm5, 0 * SIZE(AA)
  907. movaps %xmm7, 4 * SIZE(AA)
  908. #endif
  909. movlps %xmm5, 0 * SIZE(CO1)
  910. movhps %xmm5, 2 * SIZE(CO1)
  911. movlps %xmm7, 4 * SIZE(CO1)
  912. movhps %xmm7, 6 * SIZE(CO1)
  913. #ifndef LN
  914. addl $8 * SIZE, CO1
  915. #endif
  916. #if defined(LT) || defined(RN)
  917. movl K, %eax
  918. subl KK, %eax
  919. sall $2 + ZBASE_SHIFT, %eax
  920. addl %eax, AA
  921. #ifdef LT
  922. addl $8 * SIZE, B
  923. #endif
  924. #endif
  925. #ifdef LN
  926. subl $4, KK
  927. movl BORIG, B
  928. #endif
  929. #ifdef LT
  930. addl $4, KK
  931. #endif
  932. #ifdef RT
  933. movl K, %eax
  934. movl BORIG, B
  935. sall $2 + ZBASE_SHIFT, %eax
  936. addl %eax, AORIG
  937. #endif
  938. decl %ebx # i --
  939. jg .L10
  940. ALIGN_2
  /* .L50: entered after the main row loop. Runs the path below only when
     bit 1 of M is set (presumably the two-row remainder of M; confirm
     against the code above this label). Sets up AA and BB, clears the
     accumulators xmm4..xmm7, preloads operands and computes the trip
     count for the unrolled loop at .L51. */
  941. .L50:
  942. movl M, %ebx
  943. testl $2, %ebx # isolates bit 1; the result is either 0 or 2
  944. jle .L70 # result is never negative, so this jumps only when bit 1 is clear
  945. #ifdef LN
  946. movl K, %eax
  947. sall $1 + ZBASE_SHIFT, %eax
  948. subl %eax, AORIG # move AORIG back by K << (1 + ZBASE_SHIFT)
  949. #endif
  950. #if defined(LN) || defined(RT)
  951. movl AORIG, %eax
  952. movl %eax, AA
  953. movl KK, %eax
  954. sall $1 + ZBASE_SHIFT, %eax
  955. addl %eax, AA # AA = AORIG + (KK << (1 + ZBASE_SHIFT))
  956. #endif
  957. leal BUFFER, BB # BB restarts at the address of BUFFER
  958. #if defined(LN) || defined(RT)
  959. movl KK, %eax
  960. sall $2 + ZBASE_SHIFT, %eax
  961. addl %eax, BB # BB += KK << (2 + ZBASE_SHIFT)
  962. #endif
  963. movaps 0 * SIZE(BB), %xmm2 # preload the first operands used by .L51 and .L53
  964. xorps %xmm4, %xmm4 # xmm4..xmm7 are cleared here and used as accumulators
  965. movaps 0 * SIZE(AA), %xmm0
  966. xorps %xmm5, %xmm5
  967. movaps 8 * SIZE(BB), %xmm3
  968. xorps %xmm6, %xmm6
  969. movaps 8 * SIZE(AA), %xmm1
  970. xorps %xmm7, %xmm7
  971. #if defined(LT) || defined(RN)
  972. movl KK, %eax # step count is KK for LT and RN
  973. #else
  974. movl K, %eax
  975. subl KK, %eax # step count is K - KK otherwise
  976. #endif
  977. sarl $3, %eax # number of full 8-step trips
  978. je .L52 # no full trip: go to the remainder handling
  979. ALIGN_4
  /* .L51: multiply-accumulate loop, unrolled eight steps per trip.
     Each step multiplies one 4-float vector from AA by two 4-float vectors
     from BB. Steps 1, 3, 5, 7 accumulate into xmm4 and xmm5; steps
     2, 4, 6, 8 accumulate into xmm6 and xmm7. Operands for later steps are
     loaded ahead of use, and the last four loads of a trip fetch the
     operands of the next trip (they execute on the final trip as well).
     Per trip AA advances by 32 * SIZE and BB by 64 * SIZE; eax counts trips. */
  980. .L51:
  981. mulps %xmm0, %xmm2 # step 1
  982. mulps 4 * SIZE(BB), %xmm0
  983. addps %xmm2, %xmm4
  984. movaps 16 * SIZE(BB), %xmm2
  985. addps %xmm0, %xmm5
  986. movaps 4 * SIZE(AA), %xmm0
  987. mulps %xmm0, %xmm3 # step 2
  988. mulps 12 * SIZE(BB), %xmm0
  989. addps %xmm3, %xmm6
  990. movaps 24 * SIZE(BB), %xmm3
  991. addps %xmm0, %xmm7
  992. movaps 16 * SIZE(AA), %xmm0
  993. mulps %xmm1, %xmm2 # step 3, uses the vector preloaded from 8 * SIZE(AA)
  994. mulps 20 * SIZE(BB), %xmm1
  995. addps %xmm2, %xmm4
  996. movaps 32 * SIZE(BB), %xmm2
  997. addps %xmm1, %xmm5
  998. movaps 12 * SIZE(AA), %xmm1
  999. mulps %xmm1, %xmm3 # step 4
  1000. mulps 28 * SIZE(BB), %xmm1
  1001. addps %xmm3, %xmm6
  1002. movaps 40 * SIZE(BB), %xmm3
  1003. addps %xmm1, %xmm7
  1004. movaps 24 * SIZE(AA), %xmm1
  1005. mulps %xmm0, %xmm2 # step 5
  1006. mulps 36 * SIZE(BB), %xmm0
  1007. addps %xmm2, %xmm4
  1008. movaps 48 * SIZE(BB), %xmm2
  1009. addps %xmm0, %xmm5
  1010. movaps 20 * SIZE(AA), %xmm0
  1011. mulps %xmm0, %xmm3 # step 6
  1012. mulps 44 * SIZE(BB), %xmm0
  1013. addps %xmm3, %xmm6
  1014. movaps 56 * SIZE(BB), %xmm3
  1015. addps %xmm0, %xmm7
  1016. movaps 32 * SIZE(AA), %xmm0 # becomes offset 0 after the pointer update below
  1017. mulps %xmm1, %xmm2 # step 7
  1018. mulps 52 * SIZE(BB), %xmm1
  1019. addps %xmm2, %xmm4
  1020. movaps 64 * SIZE(BB), %xmm2 # becomes offset 0 after the pointer update below
  1021. addps %xmm1, %xmm5
  1022. movaps 28 * SIZE(AA), %xmm1
  1023. mulps %xmm1, %xmm3 # step 8
  1024. mulps 60 * SIZE(BB), %xmm1
  1025. addps %xmm3, %xmm6
  1026. movaps 72 * SIZE(BB), %xmm3 # becomes offset 8 after the pointer update below
  1027. addps %xmm1, %xmm7
  1028. movaps 40 * SIZE(AA), %xmm1 # becomes offset 8 after the pointer update below
  1029. addl $32 * SIZE, AA
  1030. addl $64 * SIZE, BB
  1031. decl %eax
  1032. jne .L51
  1033. ALIGN_4
  /* .L52: recompute the step count (same selection as before .L51) and keep
     its low three bits, i.e. the steps not covered by the 8-step loop.
     Skips to .L54 when none remain. */
  1034. .L52:
  1035. #if defined(LT) || defined(RN)
  1036. movl KK, %eax
  1037. #else
  1038. movl K, %eax
  1039. subl KK, %eax
  1040. #endif
  1041. andl $7, %eax # eax = step count & 7
  1042. BRANCH
  1043. je .L54
  /* .L53: remainder loop, one step per trip. Uses xmm0 and xmm2 as left
     preloaded by the code above, accumulates into xmm4 and xmm5, then
     reloads both for the next trip. eax counts the remaining steps. */
  1044. .L53:
  1045. mulps %xmm0, %xmm2
  1046. mulps 4 * SIZE(BB), %xmm0
  1047. addps %xmm2, %xmm4
  1048. movaps 8 * SIZE(BB), %xmm2
  1049. addps %xmm0, %xmm5
  1050. movaps 4 * SIZE(AA), %xmm0
  1051. addl $4 * SIZE, AA # advance the A pointer by 4 * SIZE
  1052. addl $8 * SIZE, BB # advance the buffer pointer by 8 * SIZE
  1053. decl %eax
  1054. jg .L53
  /* .L54: finish the M & 2 path.
     1. Fold the four accumulators into xmm4.
     2. For LN and RT, recompute AA, B and BB from AORIG, BORIG and BUFFER.
     3. Load the current values (from B for LN/LT, from AA for RN/RT) and
        subtract the accumulated product.
     4. Apply the coefficients read from AA (LN/LT) or B (RN/RT).
     5. Write the result back to B and BB (LN/LT) or AA (RN/RT) and to CO1,
        then update CO1, AA, B, KK and AORIG for the next section.
     The shuffle / XOR / multiply / add groups below look like complex
     multiplications with POSINV acting as a sign mask; POSINV is defined
     outside this chunk, so confirm its lane pattern there. */
  1055. .L54:
  1056. addps %xmm6, %xmm4 # fold the second accumulator pair into the first
  1057. addps %xmm7, %xmm5
  1058. movaps POSINV, %xmm0 # xmm0 = POSINV, used as an XOR mask below
  1059. shufps $0xb1, %xmm5, %xmm5 # swap lanes 0<->1 and 2<->3 of xmm5
  1060. #if defined(LN) || defined(LT)
  1061. #ifndef CONJ
  1062. xorps %xmm0, %xmm5
  1063. #else
  1064. xorps %xmm0, %xmm4
  1065. #endif
  1066. #else
  1067. xorps %xmm0, %xmm5
  1068. #endif
  1069. addps %xmm5, %xmm4 # xmm4 = accumulated product, subtracted below
  1070. #if defined(LN) || defined(RT)
  1071. movl KK, %eax
  1072. #ifdef LN
  1073. subl $2, %eax
  1074. #else
  1075. subl $1, %eax
  1076. #endif
  1077. movl AORIG, AA
  1078. movl BORIG, B
  1079. leal BUFFER, BB
  1080. sall $ZBASE_SHIFT, %eax # eax = (KK - 2 for LN, KK - 1 for RT) << ZBASE_SHIFT
  1081. leal (AA, %eax, 2), AA # AA = AORIG + 2 * eax
  1082. leal (B, %eax, 1), B # B = BORIG + eax
  1083. leal (BB, %eax, 4), BB # BB = BUFFER + 4 * eax
  1084. #endif
  1085. #if defined(LN) || defined(LT)
  1086. movsd 0 * SIZE(B), %xmm5 # current values come from B, loaded in two halves
  1087. movhps 2 * SIZE(B), %xmm5
  1088. #else
  1089. movaps 0 * SIZE(AA), %xmm5 # current values come from AA
  1090. #endif
  1091. subps %xmm4, %xmm5 # xmm5 = loaded values minus accumulated product
  1092. #if defined(LN) || defined(LT)
  1093. movhlps %xmm5, %xmm4 # low half of xmm4 = high half of xmm5
  1094. #endif
  1095. #ifdef LN
  1096. #ifdef movsd
  1097. xorps %xmm1, %xmm1 # only when movsd is a macro: clear xmm1 before the partial load
  1098. #endif
  1099. movsd 6 * SIZE(AA), %xmm1 # coefficient at 6 * SIZE(AA), applied to xmm4
  1100. movaps %xmm1, %xmm0
  1101. shufps $0x44, %xmm0, %xmm0 # xmm0 = coefficient lanes (0, 1, 0, 1)
  1102. shufps $0x11, %xmm1, %xmm1 # xmm1 = coefficient lanes (1, 0, 1, 0)
  1103. movaps %xmm4, %xmm3
  1104. shufps $0xa0, %xmm3, %xmm3 # xmm3 = xmm4 lanes (0, 0, 2, 2)
  1105. shufps $0xf5, %xmm4, %xmm4 # xmm4 = xmm4 lanes (1, 1, 3, 3)
  1106. #ifndef CONJ
  1107. xorps POSINV, %xmm4
  1108. #else
  1109. xorps POSINV, %xmm3
  1110. #endif
  1111. mulps %xmm0, %xmm3
  1112. mulps %xmm1, %xmm4
  1113. addps %xmm3, %xmm4 # xmm4 = previous xmm4 combined with the coefficient
  1114. movsd 4 * SIZE(AA), %xmm1 # coefficient at 4 * SIZE(AA), couples xmm4 into xmm5
  1115. movaps %xmm1, %xmm0
  1116. shufps $0x44, %xmm0, %xmm0
  1117. shufps $0x11, %xmm1, %xmm1
  1118. movaps %xmm4, %xmm2
  1119. shufps $0xa0, %xmm2, %xmm2
  1120. movaps %xmm4, %xmm3 # xmm4 itself is kept intact for the store below
  1121. shufps $0xf5, %xmm3, %xmm3
  1122. #ifndef CONJ
  1123. xorps POSINV, %xmm3
  1124. #else
  1125. xorps POSINV, %xmm2
  1126. #endif
  1127. mulps %xmm0, %xmm2
  1128. mulps %xmm1, %xmm3
  1129. subps %xmm2, %xmm5 # xmm5 -= both partial products
  1130. subps %xmm3, %xmm5
  1131. movsd 0 * SIZE(AA), %xmm1 # coefficient at 0 * SIZE(AA), applied to xmm5
  1132. movaps %xmm1, %xmm0
  1133. shufps $0x44, %xmm0, %xmm0
  1134. shufps $0x11, %xmm1, %xmm1
  1135. movaps %xmm5, %xmm3
  1136. shufps $0xa0, %xmm3, %xmm3
  1137. shufps $0xf5, %xmm5, %xmm5
  1138. #ifndef CONJ
  1139. xorps POSINV, %xmm5
  1140. #else
  1141. xorps POSINV, %xmm3
  1142. #endif
  1143. mulps %xmm0, %xmm3
  1144. mulps %xmm1, %xmm5
  1145. addps %xmm3, %xmm5
  1146. #endif
  1147. #ifdef LT
  1148. #ifdef movsd
  1149. xorps %xmm1, %xmm1 # only when movsd is a macro: clear xmm1 before the partial load
  1150. #endif
  1151. movsd 0 * SIZE(AA), %xmm1 # coefficient at 0 * SIZE(AA), applied to xmm5
  1152. movaps %xmm1, %xmm0
  1153. shufps $0x44, %xmm0, %xmm0 # xmm0 = coefficient lanes (0, 1, 0, 1)
  1154. shufps $0x11, %xmm1, %xmm1 # xmm1 = coefficient lanes (1, 0, 1, 0)
  1155. movaps %xmm5, %xmm3
  1156. shufps $0xa0, %xmm3, %xmm3 # xmm3 = xmm5 lanes (0, 0, 2, 2)
  1157. shufps $0xf5, %xmm5, %xmm5 # xmm5 = xmm5 lanes (1, 1, 3, 3)
  1158. #ifndef CONJ
  1159. xorps POSINV, %xmm5
  1160. #else
  1161. xorps POSINV, %xmm3
  1162. #endif
  1163. mulps %xmm0, %xmm3
  1164. mulps %xmm1, %xmm5
  1165. addps %xmm3, %xmm5
  1166. movsd 2 * SIZE(AA), %xmm1 # coefficient at 2 * SIZE(AA), couples xmm5 into xmm4
  1167. movaps %xmm1, %xmm0
  1168. shufps $0x44, %xmm0, %xmm0
  1169. shufps $0x11, %xmm1, %xmm1
  1170. movaps %xmm5, %xmm2
  1171. shufps $0xa0, %xmm2, %xmm2
  1172. movaps %xmm5, %xmm3 # xmm5 itself is kept intact for the store below
  1173. shufps $0xf5, %xmm3, %xmm3
  1174. #ifndef CONJ
  1175. xorps POSINV, %xmm3
  1176. #else
  1177. xorps POSINV, %xmm2
  1178. #endif
  1179. mulps %xmm0, %xmm2
  1180. mulps %xmm1, %xmm3
  1181. subps %xmm2, %xmm4 # xmm4 -= both partial products
  1182. subps %xmm3, %xmm4
  1183. movsd 6 * SIZE(AA), %xmm1 # coefficient at 6 * SIZE(AA), applied to xmm4
  1184. movaps %xmm1, %xmm0
  1185. shufps $0x44, %xmm0, %xmm0
  1186. shufps $0x11, %xmm1, %xmm1
  1187. movaps %xmm4, %xmm3
  1188. shufps $0xa0, %xmm3, %xmm3
  1189. shufps $0xf5, %xmm4, %xmm4
  1190. #ifndef CONJ
  1191. xorps POSINV, %xmm4
  1192. #else
  1193. xorps POSINV, %xmm3
  1194. #endif
  1195. mulps %xmm0, %xmm3
  1196. mulps %xmm1, %xmm4
  1197. addps %xmm3, %xmm4
  1198. #endif
  1199. #if defined(RN) || defined(RT)
  1200. movsd 0 * SIZE(B), %xmm1 # coefficient vector comes from B, loaded in two halves
  1201. movhps 2 * SIZE(B), %xmm1
  1202. movaps %xmm1, %xmm2
  1203. shufps $0x44, %xmm2, %xmm2 # xmm2 = lanes (0, 1, 0, 1) of the loaded vector
  1204. movaps %xmm1, %xmm3
  1205. shufps $0x11, %xmm2, %xmm3 # xmm3 = (xmm3 lane 1, xmm3 lane 0, xmm2 lane 1, xmm2 lane 0)
  1206. movaps %xmm5, %xmm4
  1207. shufps $0xa0, %xmm4, %xmm4 # xmm4 = xmm5 lanes (0, 0, 2, 2)
  1208. shufps $0xf5, %xmm5, %xmm5 # xmm5 = xmm5 lanes (1, 1, 3, 3)
  1209. #ifndef CONJ
  1210. xorps %xmm0, %xmm5 # xmm0 still holds POSINV from the top of .L54 on this path
  1211. #else
  1212. xorps %xmm0, %xmm4
  1213. #endif
  1214. mulps %xmm2, %xmm4
  1215. mulps %xmm3, %xmm5
  1216. addps %xmm4, %xmm5
  1217. #endif
  1218. #ifdef LN
  1219. subl $4 * SIZE, CO1 # LN moves CO1 back before the store
  1220. #endif
  1221. #if defined(LN) || defined(LT)
  1222. movlhps %xmm4, %xmm5 # high half of xmm5 = low half of xmm4
  1223. movlps %xmm5, 0 * SIZE(B) # write the result back to B
  1224. movhps %xmm5, 2 * SIZE(B)
  1225. #ifdef HAVE_SSE2
  1226. pshufd $0x00, %xmm5, %xmm0 # broadcast each lane of xmm5 into its own register
  1227. pshufd $0x55, %xmm5, %xmm1
  1228. pshufd $0xaa, %xmm5, %xmm2
  1229. pshufd $0xff, %xmm5, %xmm3
  1230. #else
  1231. movaps %xmm5, %xmm0 # same broadcasts using movaps + shufps
  1232. shufps $0x00, %xmm0, %xmm0
  1233. movaps %xmm5, %xmm1
  1234. shufps $0x55, %xmm1, %xmm1
  1235. movaps %xmm5, %xmm2
  1236. shufps $0xaa, %xmm2, %xmm2
  1237. movaps %xmm5, %xmm3
  1238. shufps $0xff, %xmm3, %xmm3
  1239. #endif
  1240. movaps %xmm0, 0 * SIZE(BB) # store the four broadcast vectors into the buffer
  1241. movaps %xmm1, 4 * SIZE(BB)
  1242. movaps %xmm2, 8 * SIZE(BB)
  1243. movaps %xmm3, 12 * SIZE(BB)
  1244. #else
  1245. movaps %xmm5, 0 * SIZE(AA) # RN/RT write the result back to AA
  1246. #endif
  1247. movlps %xmm5, 0 * SIZE(CO1) # store the result through CO1
  1248. movhps %xmm5, 2 * SIZE(CO1)
  1249. #ifndef LN
  1250. addl $4 * SIZE, CO1 # all variants except LN advance CO1 after the store
  1251. #endif
  1252. #if defined(LT) || defined(RN)
  1253. movl K, %eax
  1254. subl KK, %eax
  1255. sall $1 + ZBASE_SHIFT, %eax
  1256. addl %eax, AA # AA += (K - KK) << (1 + ZBASE_SHIFT)
  1257. #ifdef LT
  1258. addl $4 * SIZE, B
  1259. #endif
  1260. #endif
  1261. #ifdef LN
  1262. subl $2, KK
  1263. movl BORIG, B
  1264. #endif
  1265. #ifdef LT
  1266. addl $2, KK
  1267. #endif
  1268. #ifdef RT
  1269. movl K, %eax
  1270. movl BORIG, B
  1271. sall $1 + ZBASE_SHIFT, %eax
  1272. addl %eax, AORIG # AORIG += K << (1 + ZBASE_SHIFT)
  1273. #endif
  1274. ALIGN_2
  /* .L70: runs the path below only when bit 0 of M is set (presumably the
     single-row remainder of M; confirm against the code above). Sets up AA
     and BB, clears the accumulators xmm4 and xmm5, preloads operands and
     computes the trip count for the unrolled loop at .L71. */
  1275. .L70:
  1276. movl M, %ebx
  1277. testl $1, %ebx # isolates bit 0; the result is either 0 or 1
  1278. jle .L99 # result is never negative, so this jumps only when bit 0 is clear
  1279. #ifdef LN
  1280. movl K, %eax
  1281. sall $ZBASE_SHIFT, %eax
  1282. subl %eax, AORIG # move AORIG back by K << ZBASE_SHIFT
  1283. #endif
  1284. #if defined(LN) || defined(RT)
  1285. movl AORIG, %eax
  1286. movl %eax, AA
  1287. movl KK, %eax
  1288. sall $ZBASE_SHIFT, %eax
  1289. addl %eax, AA # AA = AORIG + (KK << ZBASE_SHIFT)
  1290. #endif
  1291. leal BUFFER, BB # BB restarts at the address of BUFFER
  1292. #if defined(LN) || defined(RT)
  1293. movl KK, %eax
  1294. sall $2 + ZBASE_SHIFT, %eax
  1295. addl %eax, BB # BB += KK << (2 + ZBASE_SHIFT)
  1296. #endif
  1297. movaps 0 * SIZE(BB), %xmm2 # preload the first operands used by .L71 and .L73
  1298. xorps %xmm4, %xmm4 # xmm4 and xmm5 are cleared here and used as accumulators
  1299. #ifdef movsd
  1300. xorps %xmm0, %xmm0 # only when movsd is a macro: clear xmm0 before the partial load
  1301. #endif
  1302. movsd 0 * SIZE(AA), %xmm0
  1303. xorps %xmm5, %xmm5
  1304. movaps 8 * SIZE(BB), %xmm3
  1305. #ifdef movsd
  1306. xorps %xmm1, %xmm1 # only when movsd is a macro: clear xmm1 before the partial load
  1307. #endif
  1308. movsd 8 * SIZE(AA), %xmm1
  1309. #if defined(LT) || defined(RN)
  1310. movl KK, %eax # step count is KK for LT and RN
  1311. #else
  1312. movl K, %eax
  1313. subl KK, %eax # step count is K - KK otherwise
  1314. #endif
  1315. sarl $3, %eax # number of full 8-step trips
  1316. je .L72 # no full trip: go to the remainder handling
  1317. ALIGN_4
  /* .L71: multiply-accumulate loop, unrolled eight steps per trip.
     Each step takes one movsd load from AA (offsets advance by 2 * SIZE)
     and multiplies it by two 4-float vectors from BB; the first product is
     added to xmm4 and the second to xmm5. Steps 1 to 4 use xmm0, steps 5 to
     8 use xmm1. Operands are loaded ahead of use, and the loads at offsets
     16 and 24 of AA and 64 and 72 of BB fetch the operands of the next trip
     (they execute on the final trip as well).
     Per trip AA advances by 16 * SIZE and BB by 64 * SIZE; eax counts trips. */
  1318. .L71:
  1319. mulps %xmm0, %xmm2 # step 1
  1320. addps %xmm2, %xmm4
  1321. movaps 4 * SIZE(BB), %xmm2
  1322. mulps %xmm0, %xmm2
  1323. movsd 2 * SIZE(AA), %xmm0
  1324. addps %xmm2, %xmm5
  1325. movaps 16 * SIZE(BB), %xmm2
  1326. mulps %xmm0, %xmm3 # step 2
  1327. addps %xmm3, %xmm4
  1328. movaps 12 * SIZE(BB), %xmm3
  1329. mulps %xmm0, %xmm3
  1330. movsd 4 * SIZE(AA), %xmm0
  1331. addps %xmm3, %xmm5
  1332. movaps 24 * SIZE(BB), %xmm3
  1333. mulps %xmm0, %xmm2 # step 3
  1334. addps %xmm2, %xmm4
  1335. movaps 20 * SIZE(BB), %xmm2
  1336. mulps %xmm0, %xmm2
  1337. movsd 6 * SIZE(AA), %xmm0
  1338. addps %xmm2, %xmm5
  1339. movaps 32 * SIZE(BB), %xmm2
  1340. mulps %xmm0, %xmm3 # step 4
  1341. addps %xmm3, %xmm4
  1342. movaps 28 * SIZE(BB), %xmm3
  1343. mulps %xmm0, %xmm3
  1344. movsd 16 * SIZE(AA), %xmm0 # becomes offset 0 after the pointer update below
  1345. addps %xmm3, %xmm5
  1346. movaps 40 * SIZE(BB), %xmm3
  1347. mulps %xmm1, %xmm2 # step 5, uses the value preloaded from 8 * SIZE(AA)
  1348. addps %xmm2, %xmm4
  1349. movaps 36 * SIZE(BB), %xmm2
  1350. mulps %xmm1, %xmm2
  1351. movsd 10 * SIZE(AA), %xmm1
  1352. addps %xmm2, %xmm5
  1353. movaps 48 * SIZE(BB), %xmm2
  1354. mulps %xmm1, %xmm3 # step 6
  1355. addps %xmm3, %xmm4
  1356. movaps 44 * SIZE(BB), %xmm3
  1357. mulps %xmm1, %xmm3
  1358. movsd 12 * SIZE(AA), %xmm1
  1359. addps %xmm3, %xmm5
  1360. movaps 56 * SIZE(BB), %xmm3
  1361. mulps %xmm1, %xmm2 # step 7
  1362. addps %xmm2, %xmm4
  1363. movaps 52 * SIZE(BB), %xmm2
  1364. mulps %xmm1, %xmm2
  1365. movsd 14 * SIZE(AA), %xmm1
  1366. addps %xmm2, %xmm5
  1367. movaps 64 * SIZE(BB), %xmm2 # becomes offset 0 after the pointer update below
  1368. mulps %xmm1, %xmm3 # step 8
  1369. addps %xmm3, %xmm4
  1370. movaps 60 * SIZE(BB), %xmm3
  1371. mulps %xmm1, %xmm3
  1372. movsd 24 * SIZE(AA), %xmm1 # becomes offset 8 after the pointer update below
  1373. addps %xmm3, %xmm5
  1374. movaps 72 * SIZE(BB), %xmm3 # becomes offset 8 after the pointer update below
  1375. addl $16 * SIZE, AA
  1376. addl $64 * SIZE, BB
  1377. decl %eax
  1378. jne .L71
  1379. ALIGN_2
  /* .L72: recompute the step count (same selection as before .L71) and keep
     its low three bits, i.e. the steps not covered by the 8-step loop.
     Skips to .L74 when none remain. */
  1380. .L72:
  1381. #if defined(LT) || defined(RN)
  1382. movl KK, %eax
  1383. #else
  1384. movl K, %eax
  1385. subl KK, %eax
  1386. #endif
  1387. andl $7, %eax # eax = step count & 7
  1388. je .L74
  /* .L73: remainder loop, one step per trip. Uses xmm0 and xmm2 as left
     preloaded by the code above, accumulates into xmm4 and xmm5, then
     reloads both for the next trip. eax counts the remaining steps. */
  1389. .L73:
  1390. mulps %xmm0, %xmm2
  1391. addps %xmm2, %xmm4
  1392. movaps 4 * SIZE(BB), %xmm2
  1393. mulps %xmm0, %xmm2
  1394. movsd 2 * SIZE(AA), %xmm0
  1395. addps %xmm2, %xmm5
  1396. movaps 8 * SIZE(BB), %xmm2
  1397. addl $2 * SIZE, AA # advance the A pointer by 2 * SIZE
  1398. addl $8 * SIZE, BB # advance the buffer pointer by 8 * SIZE
  1399. decl %eax
  1400. jg .L73
  /* .L74: finish the M & 1 path.
     1. Fold xmm5 into xmm4 (lane swap plus XOR with POSINV).
     2. For LN and RT, recompute AA, B and BB from AORIG, BORIG and BUFFER.
     3. Load the current value (from B for LN/LT, from AA for RN/RT) and
        subtract the accumulated product.
     4. Combine it with one coefficient (from AA for LN/LT, from B for RN/RT).
     5. Write the result back to B and BB (LN/LT) or AA (RN/RT) and to CO1,
        then update CO1, AA, B, KK and AORIG.
     The shuffle / XOR / multiply / add group looks like a complex
     multiplication with POSINV acting as a sign mask; POSINV is defined
     outside this chunk, so confirm its lane pattern there. */
  1401. .L74:
  1402. movaps POSINV, %xmm0 # xmm0 = POSINV, used as an XOR mask below
  1403. shufps $0xb1, %xmm5, %xmm5 # swap lanes 0<->1 and 2<->3 of xmm5
  1404. #if defined(LN) || defined(LT)
  1405. #ifndef CONJ
  1406. xorps %xmm0, %xmm5
  1407. #else
  1408. xorps %xmm0, %xmm4
  1409. #endif
  1410. #else
  1411. xorps %xmm0, %xmm5
  1412. #endif
  1413. addps %xmm5, %xmm4 # xmm4 = accumulated product, subtracted below
  1414. #if defined(LN) || defined(RT)
  1415. movl KK, %eax
  1416. subl $1, %eax
  1417. movl AORIG, AA
  1418. movl BORIG, B
  1419. leal BUFFER, BB
  1420. sall $ZBASE_SHIFT, %eax # eax = (KK - 1) << ZBASE_SHIFT
  1421. leal (AA, %eax, 1), AA # AA = AORIG + eax
  1422. leal (B, %eax, 1), B # B = BORIG + eax
  1423. leal (BB, %eax, 4), BB # BB = BUFFER + 4 * eax
  1424. #endif
  1425. #ifdef movsd
  1426. xorps %xmm5, %xmm5 # only when movsd is a macro: clear xmm5 before the partial load
  1427. #endif
  1428. #if defined(LN) || defined(LT)
  1429. movsd 0 * SIZE(B), %xmm5 # current value comes from B
  1430. #else
  1431. movsd 0 * SIZE(AA), %xmm5 # current value comes from AA
  1432. #endif
  1433. subps %xmm4, %xmm5 # xmm5 = loaded value minus accumulated product
  1434. #ifdef movsd
  1435. xorps %xmm1, %xmm1 # only when movsd is a macro: clear xmm1 before the partial load
  1436. #endif
  1437. #if defined(LN) || defined(LT)
  1438. movsd 0 * SIZE(AA), %xmm1 # coefficient comes from AA
  1439. #else
  1440. movsd 0 * SIZE(B), %xmm1 # coefficient comes from B
  1441. #endif
  1442. movaps %xmm1, %xmm0
  1443. shufps $0x44, %xmm0, %xmm0 # xmm0 = coefficient lanes (0, 1, 0, 1)
  1444. shufps $0x11, %xmm1, %xmm1 # xmm1 = coefficient lanes (1, 0, 1, 0)
  1445. movaps %xmm5, %xmm3
  1446. shufps $0xa0, %xmm3, %xmm3 # xmm3 = xmm5 lanes (0, 0, 2, 2)
  1447. shufps $0xf5, %xmm5, %xmm5 # xmm5 = xmm5 lanes (1, 1, 3, 3)
  1448. #ifndef CONJ
  1449. xorps POSINV, %xmm5
  1450. #else
  1451. xorps POSINV, %xmm3
  1452. #endif
  1453. mulps %xmm0, %xmm3
  1454. mulps %xmm1, %xmm5
  1455. addps %xmm3, %xmm5 # xmm5 = result to store
  1456. #ifdef LN
  1457. subl $2 * SIZE, CO1 # LN moves CO1 back before the store
  1458. #endif
  1459. #if defined(LN) || defined(LT)
  1460. movlps %xmm5, 0 * SIZE(B) # write the low half of the result back to B
  1461. movaps %xmm5, %xmm0
  1462. shufps $0x00, %xmm0, %xmm0 # broadcast lane 0
  1463. movaps %xmm5, %xmm1
  1464. shufps $0x55, %xmm1, %xmm1 # broadcast lane 1
  1465. movaps %xmm0, 0 * SIZE(BB) # store both broadcast vectors into the buffer
  1466. movaps %xmm1, 4 * SIZE(BB)
  1467. #else
  1468. movlps %xmm5, 0 * SIZE(AA) # RN/RT write the low half of the result back to AA
  1469. #endif
  1470. movlps %xmm5, 0 * SIZE(CO1) # store the low half of the result through CO1
  1471. #ifndef LN
  1472. addl $2 * SIZE, CO1 # all variants except LN advance CO1 after the store
  1473. #endif
  1474. #if defined(LT) || defined(RN)
  1475. movl K, %eax
  1476. subl KK, %eax
  1477. sall $ZBASE_SHIFT, %eax
  1478. addl %eax, AA # AA += (K - KK) << ZBASE_SHIFT
  1479. #ifdef LT
  1480. addl $2 * SIZE, B
  1481. #endif
  1482. #endif
  1483. #ifdef LN
  1484. subl $1, KK
  1485. movl BORIG, B
  1486. #endif
  1487. #ifdef LT
  1488. addl $1, KK
  1489. #endif
  1490. #ifdef RT
  1491. movl K, %eax
  1492. movl BORIG, B
  1493. sall $ZBASE_SHIFT, %eax
  1494. addl %eax, AORIG # AORIG += K << ZBASE_SHIFT
  1495. #endif
  1496. ALIGN_2
  /* .L99: bookkeeping at the end of one pass of the J loop. Advances B,
     adjusts KK for the RN and RT variants, decrements J and jumps back to
     .L01 (defined above this chunk) while J stays positive. */
  1497. .L99:
  1498. #ifdef LN
  1499. movl K, %eax
  1500. sall $ZBASE_SHIFT, %eax
  1501. addl %eax, B # B += K << ZBASE_SHIFT
  1502. #endif
  1503. #if defined(LT) || defined(RN)
  1504. movl K, %eax
  1505. subl KK, %eax
  1506. sall $ZBASE_SHIFT, %eax
  1507. addl %eax, B # B += (K - KK) << ZBASE_SHIFT
  1508. #endif
  1509. #ifdef RN
  1510. addl $1, KK
  1511. #endif
  1512. #ifdef RT
  1513. subl $1, KK
  1514. #endif
  1515. decl J # j --
  1516. jg .L01
  1517. ALIGN_2
  /* .L999: common exit. Reloads the stack pointer from OLD_STACK, pops ebx,
     esi, edi and ebp in that order and returns. EMMS and EPILOGUE are
     macros defined outside this chunk. The pop order presumably mirrors
     the pushes in the prologue; confirm there. */
  1518. .L999:
  1519. movl OLD_STACK, %esp # restore the stack pointer saved in OLD_STACK
  1520. EMMS
  1521. popl %ebx
  1522. popl %esi
  1523. popl %edi
  1524. popl %ebp
  1525. ret
  1526. EPILOGUE