
ztrsm_kernel_LN_4x1_sse.S 33 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error You have to check your configuration.
#endif
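
/* Incoming arguments, addressed off %esi (the stack pointer saved
   right after the four pushes below): STACK (16) skips the saved
   registers and the extra 4 skips the return address. */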
#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
#define STACK_A 24 + STACK + ARGS(%esi)
#define STACK_B 28 + STACK + ARGS(%esi)
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
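
/* Scratch variables kept at the bottom of the aligned local frame;
   BUFFER holds the unpacked, broadcast copy of B used by the
   multiply loops. */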
#define POSINV 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 48(%esp)
#define KK 52(%esp)
#define KKK 56(%esp)
#define AORIG 60(%esp)
#define BORIG 64(%esp)
#define BUFFER 128(%esp)

#define B %edi
#define LDC %ebp
#define AA %edx
#define BB %ecx
#define CO1 %esi

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if !defined(HAVE_SSE2) || defined(OPTERON)
#define movsd movlps
#endif

#ifdef HAVE_SSE2
#define xorps pxor
#endif
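
/* Entry: save the callee-saved registers, then carve out an aligned
   local frame; the caller's %esp is kept in OLD_STACK for the
   epilogue. */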
PROLOGUE

pushl %ebp
pushl %edi
pushl %esi
pushl %ebx

PROFCODE

movl %esp, %esi # save old stack
subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
andl $-STACK_ALIGN, %esp # align stack
addl $STACK_OFFSET, %esp

STACK_TOUCHING

movl STACK_M, %ebx
movl STACK_N, %eax
movl STACK_K, %ecx
movl STACK_A, %edx

movl %ebx, M
movl %eax, N
movl %ecx, K
movl %edx, A
movl %esi, OLD_STACK

movl STACK_B, %edi
movl STACK_C, %ebx
movss STACK_OFFT, %xmm4
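
/* POSINV is a pair of sign-bit masks: XORing a vector with it
   negates alternating lanes, which implements the +/- pattern of a
   complex multiply. The pattern flips when CONJ is defined. */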
#ifndef CONJ
movl $0x80000000, 0 + POSINV
movl $0x00000000, 4 + POSINV
movl $0x80000000, 8 + POSINV
movl $0x00000000, 12 + POSINV
#else
movl $0x00000000, 0 + POSINV
movl $0x80000000, 4 + POSINV
movl $0x00000000, 8 + POSINV
movl $0x80000000, 12 + POSINV
#endif

movl %ebx, C
movl STACK_LDC, LDC

movss %xmm4, OFFSET
movss %xmm4, KK

sall $ZBASE_SHIFT, LDC

#ifdef LN
movl M, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, C
imull K, %eax
addl %eax, A
#endif

#ifdef RT
movl N, %eax
sall $ZBASE_SHIFT, %eax
imull K, %eax
addl %eax, B
movl N, %eax
imull LDC, %eax
addl %eax, C
#endif

#ifdef RN
negl KK
#endif

#ifdef RT
movl N, %eax
subl OFFSET, %eax
movl %eax, KK
#endif

movl N, %eax
movl %eax, J # j = n
testl %eax, %eax
jle .L999
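
/* Outer loop over the N dimension; each pass of .L01 handles one
   column of B and C. */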
.L01:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
#endif

leal BUFFER, BB

#ifdef RT
movl K, %eax
sall $ZBASE_SHIFT, %eax
subl %eax, B
#endif

#if defined(LN) || defined(RT)
movl KK, %eax
movl B, BORIG
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
#endif

#if defined(LT)
movl OFFSET, %eax
movl %eax, KK
#endif

#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $2, %eax
jle .L03
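
/* .L02: unpack B into BUFFER, broadcasting each float across a full
   SSE register; four complex elements (eight floats) of B per
   iteration, with .L04 handling the remainder. */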
.L02:
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
movss 2 * SIZE(B), %xmm2
movss 3 * SIZE(B), %xmm3
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
shufps $0, %xmm2, %xmm2
shufps $0, %xmm3, %xmm3
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm1, 4 * SIZE(BB)
movaps %xmm2, 8 * SIZE(BB)
movaps %xmm3, 12 * SIZE(BB)
movss 4 * SIZE(B), %xmm0
movss 5 * SIZE(B), %xmm1
movss 6 * SIZE(B), %xmm2
movss 7 * SIZE(B), %xmm3
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
shufps $0, %xmm2, %xmm2
shufps $0, %xmm3, %xmm3
movaps %xmm0, 16 * SIZE(BB)
movaps %xmm1, 20 * SIZE(BB)
movaps %xmm2, 24 * SIZE(BB)
movaps %xmm3, 28 * SIZE(BB)
prefetcht0 104 * SIZE(B)
addl $ 8 * SIZE, B
addl $32 * SIZE, BB
decl %eax
jne .L02

.L03:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $3, %eax
BRANCH
jle .L05

.L04:
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm1, 4 * SIZE(BB)
addl $2 * SIZE, B
addl $8 * SIZE, BB
decl %eax
jne .L04
ALIGN_4

.L05:
#if defined(LT) || defined(RN)
movl A, %eax
movl %eax, AA
#else
movl A, %eax
movl %eax, AORIG
#endif

#ifdef RT
subl LDC, C
#endif
movl C, CO1
#ifndef RT
addl LDC, C
#endif

movl M, %ebx
testl $1, %ebx
jle .L50
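
/* M & 1: handle a single remaining row of A (one complex element
   per k step); the M tails run before the main 4-row loop. */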
#ifdef LN
movl K, %eax
sall $ZBASE_SHIFT, %eax
subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
movl AORIG, %eax
movl %eax, AA
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, AA
#endif

leal BUFFER, BB

#if defined(LN) || defined(RT)
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
#endif

movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
#ifdef movsd
xorps %xmm0, %xmm0
#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE(BB), %xmm3
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd 8 * SIZE(AA), %xmm1

#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $3, %eax
je .L72
ALIGN_4
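
/* .L71: main multiply loop for the one-row case, unrolled eight k
   iterations deep; xmm4/xmm5 accumulate the two halves of the
   complex partial products. */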
.L71:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
movaps 16 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm4
movaps 12 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 4 * SIZE(AA), %xmm0
addps %xmm3, %xmm5
movaps 24 * SIZE(BB), %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 20 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 6 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
movaps 32 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm4
movaps 28 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 16 * SIZE(AA), %xmm0
addps %xmm3, %xmm5
movaps 40 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 36 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 10 * SIZE(AA), %xmm1
addps %xmm2, %xmm5
movaps 48 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm4
movaps 44 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 12 * SIZE(AA), %xmm1
addps %xmm3, %xmm5
movaps 56 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 52 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 14 * SIZE(AA), %xmm1
addps %xmm2, %xmm5
movaps 64 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm4
movaps 60 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 24 * SIZE(AA), %xmm1
addps %xmm3, %xmm5
movaps 72 * SIZE(BB), %xmm3
addl $16 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L71
ALIGN_2

.L72:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $7, %eax
je .L74

.L73:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
addl $2 * SIZE, AA # aoffset += 2 * SIZE
addl $8 * SIZE, BB # boffset1 += 8 * SIZE
decl %eax
jg .L73
.L74:
movaps POSINV, %xmm0
shufps $0xb1, %xmm5, %xmm5
#if defined(LN) || defined(LT)
#ifndef CONJ
xorps %xmm0, %xmm5
#else
xorps %xmm0, %xmm4
#endif
#else
xorps %xmm0, %xmm5
#endif
addps %xmm5, %xmm4

#if defined(LN) || defined(RT)
movl KK, %eax
subl $1, %eax
movl AORIG, AA
movl BORIG, B
leal BUFFER, BB
sall $ZBASE_SHIFT, %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
#endif

#ifdef movsd
xorps %xmm5, %xmm5
#endif
#if defined(LN) || defined(LT)
movsd 0 * SIZE(B), %xmm5
#else
movsd 0 * SIZE(AA), %xmm5
#endif
subps %xmm4, %xmm5

#ifdef movsd
xorps %xmm1, %xmm1
#endif
#if defined(LN) || defined(LT)
movsd 0 * SIZE(AA), %xmm1
#else
movsd 0 * SIZE(B), %xmm1
#endif
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm5, %xmm5
#ifndef CONJ
xorps POSINV, %xmm5
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm5
addps %xmm3, %xmm5

#ifdef LN
subl $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
movlps %xmm5, 0 * SIZE(B)
movaps %xmm5, %xmm0
shufps $0x00, %xmm0, %xmm0
movaps %xmm5, %xmm1
shufps $0x55, %xmm1, %xmm1
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm1, 4 * SIZE(BB)
#else
movlps %xmm5, 0 * SIZE(AA)
#endif

movlps %xmm5, 0 * SIZE(CO1)

#ifndef LN
addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, AA
#ifdef LT
addl $2 * SIZE, B
#endif
#endif

#ifdef LN
subl $1, KK
movl BORIG, B
#endif
#ifdef LT
addl $1, KK
#endif
#ifdef RT
movl K, %eax
movl BORIG, B
sall $ZBASE_SHIFT, %eax
addl %eax, AORIG
#endif
ALIGN_2

.L50:
movl M, %ebx
testl $2, %ebx
jle .L70
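
/* M & 2: same pattern, two rows of A per k step. */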
#ifdef LN
movl K, %eax
sall $1 + ZBASE_SHIFT, %eax
subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
movl AORIG, %eax
movl %eax, AA
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, AA
#endif

leal BUFFER, BB

#if defined(LN) || defined(RT)
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
#endif

movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7

#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $3, %eax
je .L52
ALIGN_4

.L51:
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 12 * SIZE(BB), %xmm0
addps %xmm3, %xmm6
movaps 24 * SIZE(BB), %xmm3
addps %xmm0, %xmm7
movaps 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
mulps 20 * SIZE(BB), %xmm1
addps %xmm2, %xmm4
movaps 32 * SIZE(BB), %xmm2
addps %xmm1, %xmm5
movaps 12 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 28 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 40 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 24 * SIZE(AA), %xmm1
mulps %xmm0, %xmm2
mulps 36 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 48 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 20 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 44 * SIZE(BB), %xmm0
addps %xmm3, %xmm6
movaps 56 * SIZE(BB), %xmm3
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
mulps 52 * SIZE(BB), %xmm1
addps %xmm2, %xmm4
movaps 64 * SIZE(BB), %xmm2
addps %xmm1, %xmm5
movaps 28 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 72 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 40 * SIZE(AA), %xmm1
addl $32 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L51
ALIGN_4

.L52:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $7, %eax # remaining k iterations (k & 7)
BRANCH
je .L54

.L53:
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 8 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
addl $4 * SIZE, AA # aoffset += 4 * SIZE
addl $8 * SIZE, BB # boffset1 += 8 * SIZE
decl %eax
jg .L53
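
/* .L54: fold the partial sums, apply the complex sign mask, then
   run the substitution for this 2x1 tile. */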
.L54:
addps %xmm6, %xmm4
addps %xmm7, %xmm5

movaps POSINV, %xmm0
shufps $0xb1, %xmm5, %xmm5
#if defined(LN) || defined(LT)
#ifndef CONJ
xorps %xmm0, %xmm5
#else
xorps %xmm0, %xmm4
#endif
#else
xorps %xmm0, %xmm5
#endif
addps %xmm5, %xmm4

#if defined(LN) || defined(RT)
movl KK, %eax
#ifdef LN
subl $2, %eax
#else
subl $1, %eax
#endif
movl AORIG, AA
movl BORIG, B
leal BUFFER, BB
sall $ZBASE_SHIFT, %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
movsd 0 * SIZE(B), %xmm5
movhps 2 * SIZE(B), %xmm5
#else
movaps 0 * SIZE(AA), %xmm5
#endif
subps %xmm4, %xmm5

#if defined(LN) || defined(LT)
movhlps %xmm5, %xmm4
#endif

#ifdef LN
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd 6 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm4, %xmm4
#ifndef CONJ
xorps POSINV, %xmm4
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm4
addps %xmm3, %xmm4
movsd 4 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm4, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm5
subps %xmm3, %xmm5
movsd 0 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm5, %xmm5
#ifndef CONJ
xorps POSINV, %xmm5
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm5
addps %xmm3, %xmm5
#endif

#ifdef LT
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd 0 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm5, %xmm5
#ifndef CONJ
xorps POSINV, %xmm5
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm5
addps %xmm3, %xmm5
movsd 2 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm5, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm4
subps %xmm3, %xmm4
movsd 6 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm4, %xmm4
#ifndef CONJ
xorps POSINV, %xmm4
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm4
addps %xmm3, %xmm4
#endif

#if defined(RN) || defined(RT)
movsd 0 * SIZE(B), %xmm1
movhps 2 * SIZE(B), %xmm1
movaps %xmm1, %xmm2
shufps $0x44, %xmm2, %xmm2
movaps %xmm1, %xmm3
shufps $0x11, %xmm2, %xmm3
movaps %xmm5, %xmm4
shufps $0xa0, %xmm4, %xmm4
shufps $0xf5, %xmm5, %xmm5
#ifndef CONJ
xorps %xmm0, %xmm5
#else
xorps %xmm0, %xmm4
#endif
mulps %xmm2, %xmm4
mulps %xmm3, %xmm5
addps %xmm4, %xmm5
#endif

#ifdef LN
subl $4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
movlhps %xmm4, %xmm5
movsd %xmm5, 0 * SIZE(B)
movhps %xmm5, 2 * SIZE(B)
#ifdef HAVE_SSE2
pshufd $0x00, %xmm5, %xmm0
pshufd $0x55, %xmm5, %xmm1
pshufd $0xaa, %xmm5, %xmm2
pshufd $0xff, %xmm5, %xmm3
#else
movaps %xmm5, %xmm0
shufps $0x00, %xmm0, %xmm0
movaps %xmm5, %xmm1
shufps $0x55, %xmm1, %xmm1
movaps %xmm5, %xmm2
shufps $0xaa, %xmm2, %xmm2
movaps %xmm5, %xmm3
shufps $0xff, %xmm3, %xmm3
#endif
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm1, 4 * SIZE(BB)
movaps %xmm2, 8 * SIZE(BB)
movaps %xmm3, 12 * SIZE(BB)
#else
movaps %xmm5, 0 * SIZE(AA)
#endif

movsd %xmm5, 0 * SIZE(CO1)
movhps %xmm5, 2 * SIZE(CO1)

#ifndef LN
addl $4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, AA
#ifdef LT
addl $4 * SIZE, B
#endif
#endif

#ifdef LN
subl $2, KK
movl BORIG, B
#endif
#ifdef LT
addl $2, KK
#endif
#ifdef RT
movl K, %eax
movl BORIG, B
sall $1 + ZBASE_SHIFT, %eax
addl %eax, AORIG
#endif
ALIGN_2

.L70:
movl M, %ebx
sarl $2, %ebx
jle .L99
ALIGN_4
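
/* .L10: main kernel, four rows of A (a 4x1 complex tile) per
   iteration of the M loop. */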
.L10:
#ifdef LN
movl K, %eax
sall $2 + ZBASE_SHIFT, %eax
subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
movl AORIG, %eax
movl %eax, AA
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, AA
#endif

leal BUFFER, BB

#if defined(LN) || defined(RT)
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
#endif

movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7

#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $3, %eax
prefetcht0 8 * SIZE(CO1)
je .L12
ALIGN_4

#define PREFETCHSIZE 48
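
/* .L11: unrolled 8x k loop for the 4x1 tile; on CORE_KATMAI the A
   panel is prefetched PREFETCHSIZE floats ahead. */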
.L11:
#ifdef CORE_KATMAI
prefetcht0 PREFETCHSIZE * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 0 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 16 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 12 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 8 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 12 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 12 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 24 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 24 * SIZE(AA), %xmm1
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 20 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 20 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 20 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 32 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 28 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 24 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 28 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 28 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 40 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 40 * SIZE(AA), %xmm1
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 36 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 32 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 36 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 36 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 48 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 48 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 44 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 40 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 44 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 44 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 56 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 56 * SIZE(AA), %xmm1
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 52 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 48 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 52 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 52 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 64 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 64 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 56 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 60 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 72 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 72 * SIZE(AA), %xmm1
addl $64 * SIZE, BB
addl $64 * SIZE, AA
decl %eax
jne .L11

.L12:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $7, %eax # remaining k iterations (k & 7)
BRANCH
je .L14

.L13:
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 0 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 8 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 8 * SIZE(AA), %xmm0
addl $8 * SIZE, AA # aoffset += 8 * SIZE
addl $8 * SIZE, BB # boffset1 += 8 * SIZE
decl %eax
jg .L13

.L14:
movaps POSINV, %xmm0
shufps $0xb1, %xmm5, %xmm5
shufps $0xb1, %xmm7, %xmm7
#if defined(LN) || defined(LT)
#ifndef CONJ
xorps %xmm0, %xmm5
xorps %xmm0, %xmm7
#else
xorps %xmm0, %xmm4
xorps %xmm0, %xmm6
#endif
#else
xorps %xmm0, %xmm5
xorps %xmm0, %xmm7
#endif
addps %xmm5, %xmm4
addps %xmm7, %xmm6

#if defined(LN) || defined(RT)
movl KK, %eax
#ifdef LN
subl $4, %eax
#else
subl $1, %eax
#endif
movl AORIG, AA
movl BORIG, B
leal BUFFER, BB
sall $ZBASE_SHIFT, %eax
leal (AA, %eax, 4), AA
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
movsd 0 * SIZE(B), %xmm5
movhps 2 * SIZE(B), %xmm5
movsd 4 * SIZE(B), %xmm7
movhps 6 * SIZE(B), %xmm7
#else
movaps 0 * SIZE(AA), %xmm5
movaps 4 * SIZE(AA), %xmm7
#endif
subps %xmm4, %xmm5
subps %xmm6, %xmm7

#if defined(LN) || defined(LT)
movhlps %xmm5, %xmm4
movhlps %xmm7, %xmm6
#endif
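
/* Substitution for the 4x1 tile: under LN the rows are solved from
   the bottom up. Each block below is one complex multiply against an
   entry of the packed triangular factor; diagonal steps scale the
   current row, off-diagonal steps are subtracted from the rows still
   to be solved. */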
#ifdef LN
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd 30 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm6, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm6, %xmm6
#ifndef CONJ
xorps POSINV, %xmm6
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm6
addps %xmm3, %xmm6
movsd 28 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm6, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm6, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm7
subps %xmm3, %xmm7
movsd 26 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm6, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm6, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm4
subps %xmm3, %xmm4
movsd 24 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm6, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm6, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm5
subps %xmm3, %xmm5
movsd 20 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm7, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm7, %xmm7
#ifndef CONJ
xorps POSINV, %xmm7
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm7
addps %xmm3, %xmm7
movsd 18 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm7, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm7, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm4
subps %xmm3, %xmm4
movsd 16 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm7, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm7, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm5
subps %xmm3, %xmm5
movsd 10 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm4, %xmm4
#ifndef CONJ
xorps POSINV, %xmm4
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm4
addps %xmm3, %xmm4
movsd 8 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm4, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm5
subps %xmm3, %xmm5
movsd 0 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm5, %xmm5
#ifndef CONJ
xorps POSINV, %xmm5
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm5
addps %xmm3, %xmm5
#endif

#ifdef LT
movsd 0 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm5, %xmm5
#ifndef CONJ
xorps POSINV, %xmm5
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm5
addps %xmm3, %xmm5
movsd 2 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm5, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm4
subps %xmm3, %xmm4
movsd 4 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm5, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm7
subps %xmm3, %xmm7
movsd 6 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm5, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm5, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm6
subps %xmm3, %xmm6
movsd 10 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm4, %xmm4
#ifndef CONJ
xorps POSINV, %xmm4
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm4
addps %xmm3, %xmm4
movsd 12 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm4, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm7
subps %xmm3, %xmm7
movsd 14 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm4, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm4, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm6
subps %xmm3, %xmm6
movsd 20 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm7, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm7, %xmm7
#ifndef CONJ
xorps POSINV, %xmm7
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm7
addps %xmm3, %xmm7
movsd 22 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm7, %xmm2
shufps $0xa0, %xmm2, %xmm2
movaps %xmm7, %xmm3
shufps $0xf5, %xmm3, %xmm3
#ifndef CONJ
xorps POSINV, %xmm3
#else
xorps POSINV, %xmm2
#endif
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
subps %xmm2, %xmm6
subps %xmm3, %xmm6
movsd 30 * SIZE(AA), %xmm1
movaps %xmm1, %xmm0
shufps $0x44, %xmm0, %xmm0
shufps $0x11, %xmm1, %xmm1
movaps %xmm6, %xmm3
shufps $0xa0, %xmm3, %xmm3
shufps $0xf5, %xmm6, %xmm6
#ifndef CONJ
xorps POSINV, %xmm6
#else
xorps POSINV, %xmm3
#endif
mulps %xmm0, %xmm3
mulps %xmm1, %xmm6
addps %xmm3, %xmm6
#endif

#if defined(RN) || defined(RT)
movsd 0 * SIZE(B), %xmm1
movhps 2 * SIZE(B), %xmm1
#ifdef HAVE_SSE2
pshufd $0x44, %xmm1, %xmm2
pshufd $0x11, %xmm1, %xmm3
pshufd $0xa0, %xmm5, %xmm4
pshufd $0xf5, %xmm5, %xmm5
pshufd $0xa0, %xmm7, %xmm6
pshufd $0xf5, %xmm7, %xmm7
#else
movaps %xmm1, %xmm2
shufps $0x44, %xmm2, %xmm2
movaps %xmm1, %xmm3
shufps $0x11, %xmm3, %xmm3
movaps %xmm5, %xmm4
shufps $0xa0, %xmm4, %xmm4
shufps $0xf5, %xmm5, %xmm5
movaps %xmm7, %xmm6
shufps $0xa0, %xmm6, %xmm6
shufps $0xf5, %xmm7, %xmm7
#endif
#ifndef CONJ
xorps %xmm0, %xmm5
xorps %xmm0, %xmm7
#else
xorps %xmm0, %xmm4
xorps %xmm0, %xmm6
#endif
mulps %xmm2, %xmm4
mulps %xmm3, %xmm5
mulps %xmm2, %xmm6
mulps %xmm3, %xmm7
addps %xmm4, %xmm5
addps %xmm6, %xmm7
#endif

#ifdef LN
subl $8 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
movlhps %xmm4, %xmm5
movlhps %xmm6, %xmm7
movsd %xmm5, 0 * SIZE(B)
movhps %xmm5, 2 * SIZE(B)
movsd %xmm7, 4 * SIZE(B)
movhps %xmm7, 6 * SIZE(B)
#ifdef HAVE_SSE2
pshufd $0x00, %xmm5, %xmm0
pshufd $0x55, %xmm5, %xmm1
pshufd $0xaa, %xmm5, %xmm2
pshufd $0xff, %xmm5, %xmm3
#else
movaps %xmm5, %xmm0
shufps $0x00, %xmm0, %xmm0
movaps %xmm5, %xmm1
shufps $0x55, %xmm1, %xmm1
movaps %xmm5, %xmm2
shufps $0xaa, %xmm2, %xmm2
movaps %xmm5, %xmm3
shufps $0xff, %xmm3, %xmm3
#endif
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm1, 4 * SIZE(BB)
movaps %xmm2, 8 * SIZE(BB)
movaps %xmm3, 12 * SIZE(BB)
#ifdef HAVE_SSE2
pshufd $0x00, %xmm7, %xmm0
pshufd $0x55, %xmm7, %xmm1
pshufd $0xaa, %xmm7, %xmm2
pshufd $0xff, %xmm7, %xmm3
#else
movaps %xmm7, %xmm0
shufps $0x00, %xmm0, %xmm0
movaps %xmm7, %xmm1
shufps $0x55, %xmm1, %xmm1
movaps %xmm7, %xmm2
shufps $0xaa, %xmm2, %xmm2
movaps %xmm7, %xmm3
shufps $0xff, %xmm3, %xmm3
#endif
movaps %xmm0, 16 * SIZE(BB)
movaps %xmm1, 20 * SIZE(BB)
movaps %xmm2, 24 * SIZE(BB)
movaps %xmm3, 28 * SIZE(BB)
#else
movaps %xmm5, 0 * SIZE(AA)
movaps %xmm7, 4 * SIZE(AA)
#endif

movlps %xmm5, 0 * SIZE(CO1)
movhps %xmm5, 2 * SIZE(CO1)
movlps %xmm7, 4 * SIZE(CO1)
movhps %xmm7, 6 * SIZE(CO1)

#ifndef LN
addl $8 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, AA
#ifdef LT
addl $8 * SIZE, B
#endif
#endif

#ifdef LN
subl $4, KK
movl BORIG, B
#endif
#ifdef LT
addl $4, KK
#endif
#ifdef RT
movl K, %eax
movl BORIG, B
sall $2 + ZBASE_SHIFT, %eax
addl %eax, AORIG
#endif

decl %ebx # i --
jg .L10
ALIGN_2
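
/* .L99: per-column pointer and KK bookkeeping before the next
   iteration of the N loop. */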
.L99:
#ifdef LN
movl K, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, B
#endif

#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, B
#endif

#ifdef RN
addl $1, KK
#endif
#ifdef RT
subl $1, KK
#endif

decl J # j --
jg .L01
ALIGN_2
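
/* .L999: restore the caller's stack pointer and the saved
   registers, then return. */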
.L999:
movl OLD_STACK, %esp

EMMS

popl %ebx
popl %esi
popl %edi
popl %ebp
ret
EPILOGUE