
ztrsm_kernel_RT_2x2_penryn.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
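
/* Annotation (added for orientation): complex TRSM (triangular      */
/* solve) inner kernel with 2x2 register blocking for x86-32 SSE.    */
/* The LN/LT/RN/RT and CONJ preprocessor symbols select the          */
/* triangular-solve variant and conjugation mode at build time.      */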
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	16

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define ARG_B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define ARG_LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)

#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)
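
/* Annotation: the prologue allocates ARGS bytes of scratch and      */
/* pushes four registers (STACK bytes).  Locals J/KK/KKK/AORIG live  */
/* in the scratch area at STACK(%esp); the caller's arguments start  */
/* one return address (4 bytes) above STACK + ARGS, hence the        */
/* "offset + STACK + ARGS(%esp)" pattern above.                      */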
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef ATOM
#define PREFETCH     prefetcht0
#define PREFETCHSIZE 84
#endif

#ifdef NANO
#define PREFETCH     prefetcht0
#define PREFETCHSIZE (16 * 2)
#endif

#define B	%edi
#define LDC	%ebp
#define AA	%edx
#define BB	%ecx
#define CO1	%esi

#define ADD1	addps
#define ADD2	addps
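
/* Annotation: register roles in the loops below: B (%edi) = packed  */
/* B panel, LDC (%ebp) = column stride of C in bytes, AA (%edx) and  */
/* BB (%ecx) = current A/B positions, CO1 (%esi) = current C column. */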
	PROLOGUE

	subl	$ARGS, %esp

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B, B
	movl	ARG_LDC, LDC
	movl	OFFSET, %eax
#ifdef RN
	negl	%eax
#endif
	movl	%eax, KK

	movl	M, %ebx
	testl	%ebx, %ebx
	jle	.L999

	subl	$-32 * SIZE, A
	subl	$-32 * SIZE, B

	sall	$ZBASE_SHIFT, LDC

#ifdef LN
	movl	M, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	movl	N, %eax
	sall	$ZBASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B

	movl	N, %eax
	imull	LDC, %eax
	addl	%eax, C
#endif

#ifdef RN
	negl	KK
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	N, %eax
	andl	$1, %eax
	jle	.L100
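
/* Annotation: N is odd -- solve the single leftover column of C     */
/* here, then fall through to the two-column main loop at .L100.     */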
#if defined(LT) || defined(RN)
	movl	A, %eax
	movl	%eax, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M, %ebx
	sarl	$1, %ebx
	jle	.L130
	ALIGN_4
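
/* Annotation: .L110 is the 2x1 micro-kernel (two complex rows of A  */
/* against one column of B); the K loop at .L111 is unrolled 8x,     */
/* with the remainder handled at .L113.                              */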
.L110:
#ifdef LN
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3
	movhps	-30 * SIZE(BB), %xmm1
	pxor	%xmm4, %xmm4

#ifdef LN
	prefetcht0	-4 * SIZE(CO1)
#else
	prefetcht0	 3 * SIZE(CO1)
#endif
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_4

.L111:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-12 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-8 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-4 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	 0 * SIZE(AA), %xmm0

	subl	$-32 * SIZE, AA
	subl	$-16 * SIZE, BB

	decl	%eax
	jne	.L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder loop: k % 8 iterations
	BRANCH
	je	.L114
	ALIGN_4

.L113:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L113
	ALIGN_4

.L114:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B, %eax, 1), BB
#endif
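
/* Annotation: the solve step below builds each complex multiply     */
/* from pshufd (duplicating real/imaginary parts), xorps with the    */
/* sign mask prepared in %xmm0, and mulps/addps; the CONJ branches   */
/* move the sign flip between operands to conjugate as required.     */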
	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm5

	pshufd	$0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	psllq	$63, %xmm0

#ifndef CONJ
	shufps	$0xb1, %xmm0, %xmm0
	pxor	%xmm0, %xmm5
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm4
#else
	pxor	%xmm0, %xmm5
#endif
#endif
	addps	%xmm5, %xmm4

#if defined(LN) || defined(LT)
	movaps	%xmm4, %xmm5
	unpcklpd	%xmm6, %xmm4
	unpckhpd	%xmm6, %xmm5

	movsd	-32 * SIZE(BB), %xmm2
	movsd	-30 * SIZE(BB), %xmm3

	subps	%xmm4, %xmm2
	subps	%xmm5, %xmm3
#else
	movaps	-32 * SIZE(AA), %xmm1
	subps	%xmm4, %xmm1
#endif

#ifdef LN
	movaps	-28 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	%xmm0, %xmm3
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm3
	addps	%xmm4, %xmm3

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4, %xmm2
	subps	%xmm1, %xmm2

	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2
#endif

#ifdef LT
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4, %xmm3
	subps	%xmm1, %xmm3

	movaps	-28 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	%xmm0, %xmm3
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm3
	addps	%xmm4, %xmm3
#endif

#if defined(RN) || defined(RT)
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1
	addps	%xmm3, %xmm1
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlps	%xmm2, -32 * SIZE(BB)
	movlps	%xmm3, -30 * SIZE(BB)

	movlps	%xmm2, 0 * SIZE(CO1)
	movlps	%xmm3, 2 * SIZE(CO1)
#else
	movaps	%xmm1, -32 * SIZE(AA)

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L110
	ALIGN_4
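
/* Annotation: .L130 handles the leftover row when M is odd          */
/* (1x1 micro-kernel for the single-column case).                    */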
.L130:
	movl	M, %ebx
	andl	$1, %ebx
	jle	.L149

#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L142
	ALIGN_4

.L141:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-26 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-22 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-22 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-18 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-18 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AA), %xmm0

	subl	$-16 * SIZE, AA
	subl	$-16 * SIZE, BB

	decl	%eax
	jne	.L141
	ALIGN_4

.L142:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder loop: k % 8 iterations
	BRANCH
	je	.L144
	ALIGN_4

.L143:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L143
	ALIGN_4

.L144:
#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$1, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B, %eax, 1), BB
#endif

	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm5

	pshufd	$0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	psllq	$63, %xmm0

#ifndef CONJ
	shufps	$0xb1, %xmm0, %xmm0
	pxor	%xmm0, %xmm5
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm4
#else
	pxor	%xmm0, %xmm5
#endif
#endif
	addps	%xmm5, %xmm4

#if defined(LN) || defined(LT)
	movsd	-32 * SIZE(BB), %xmm2
	subps	%xmm4, %xmm2
#else
	movsd	-32 * SIZE(AA), %xmm1
	subps	%xmm4, %xmm1
#endif

#if defined(LN) || defined(LT)
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2
#endif

#if defined(RN) || defined(RT)
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1
	addps	%xmm3, %xmm1
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlps	%xmm2, -32 * SIZE(BB)
	movlps	%xmm2, 0 * SIZE(CO1)
#else
	movlps	%xmm1, -32 * SIZE(AA)
	movlps	%xmm1, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

.L149:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4
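
/* Annotation: main path -- the J loop at .L01 processes the columns */
/* of C two at a time (N/2 iterations).                              */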
.L100:
	movl	N, %eax
	movl	%eax, J
	sarl	$1, J
	jle	.L999
	ALIGN_4

.L01:
#if defined(LT) || defined(RN)
	movl	A, %eax
	movl	%eax, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

	leal	(, LDC, 2), %eax
#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M, %ebx
	sarl	$1, %ebx
	jle	.L30
	ALIGN_4
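
/* Annotation: .L10 is the 2x2 micro-kernel.  ADD1/ADD2 accumulate   */
/* the partial products; the pshufd $0xb1/$0x1b shuffles permute B   */
/* so that plain mulps/addps form all four complex product terms.    */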
.L10:
#ifdef LN
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

#ifdef LN
	pxor	%xmm4, %xmm4
	prefetcht0	-4 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	-4 * SIZE(CO1, LDC)
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
#else
	pxor	%xmm4, %xmm4
	prefetcht0	 3 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	 3 * SIZE(CO1, LDC)
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L15
	ALIGN_4

.L11:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-24 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-20 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-8 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-8 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-4 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	subl	$-32 * SIZE, BB
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	subl	$-32 * SIZE, AA
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-32 * SIZE(AA), %xmm0

	decl	%eax
	jne	.L11
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder loop: k % 8 iterations
	BRANCH
	je	.L14
	ALIGN_4

.L13:
	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L13
	ALIGN_4

.L14:
#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$2, %eax		# LN and RT use the same adjustment here
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B, %eax, 2), BB
#endif

	ADD2	%xmm2, %xmm7
	pcmpeqb	%xmm0, %xmm0
	ADD1	%xmm3, %xmm6
	psllq	$63, %xmm0

#ifndef CONJ
	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6
	shufps	$0xb1, %xmm0, %xmm0
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#else
	pshufd	$0xb1, %xmm0, %xmm1
	pxor	%xmm1, %xmm5
	pxor	%xmm1, %xmm7
#endif
#endif

	haddps	%xmm5, %xmm4
	haddps	%xmm7, %xmm6

	shufps	$0xd8, %xmm4, %xmm4
	shufps	$0xd8, %xmm6, %xmm6

	movaps	%xmm4, %xmm5
	shufps	$0xe4, %xmm6, %xmm4
	shufps	$0xe4, %xmm5, %xmm6

#if defined(LN) || defined(LT)
	movaps	%xmm4, %xmm5
	unpcklpd	%xmm6, %xmm4
	unpckhpd	%xmm6, %xmm5

	movaps	-32 * SIZE(BB), %xmm2
	movaps	-28 * SIZE(BB), %xmm3

	subps	%xmm4, %xmm2
	subps	%xmm5, %xmm3
#else
	movaps	-32 * SIZE(AA), %xmm1
	movaps	-28 * SIZE(AA), %xmm5

	subps	%xmm4, %xmm1
	subps	%xmm6, %xmm5
#endif

#ifdef LN
	movaps	-28 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	%xmm0, %xmm3
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm3
	addps	%xmm4, %xmm3

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4, %xmm2
	subps	%xmm1, %xmm2

	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2
#endif

#ifdef LT
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4, %xmm3
	subps	%xmm1, %xmm3

	movaps	-28 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	%xmm0, %xmm3
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm3
	addps	%xmm4, %xmm3
#endif

#ifdef RN
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1
	addps	%xmm3, %xmm1

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm2
	subps	%xmm3, %xmm5
	subps	%xmm2, %xmm5

	movaps	-28 * SIZE(BB), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	%xmm0, %xmm5
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm5
	addps	%xmm3, %xmm5
#endif

#ifdef RT
	movaps	-28 * SIZE(BB), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	%xmm0, %xmm5
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm5
	addps	%xmm3, %xmm5

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm2
	subps	%xmm3, %xmm1
	subps	%xmm2, %xmm1

	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1
	addps	%xmm3, %xmm1
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm2, -32 * SIZE(BB)
	movaps	%xmm3, -28 * SIZE(BB)

	movlps	%xmm2, 0 * SIZE(CO1)
	movlps	%xmm3, 2 * SIZE(CO1)
	movhps	%xmm2, 0 * SIZE(CO1, LDC)
	movhps	%xmm3, 2 * SIZE(CO1, LDC)
#else
	movaps	%xmm1, -32 * SIZE(AA)
	movaps	%xmm5, -28 * SIZE(AA)

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
	movlps	%xmm5, 0 * SIZE(CO1, LDC)
	movhps	%xmm5, 2 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx
	jg	.L10
	ALIGN_4
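
/* Annotation: .L30 handles the leftover row when M is odd           */
/* (1x2 micro-kernel for the two-column case).                       */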
.L30:
	movl	M, %ebx
	andl	$1, %ebx
	jle	.L99

#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L42
	ALIGN_4

.L41:
	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-22 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-8 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-18 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	 0 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AA), %xmm0

	subl	$-16 * SIZE, AA
	subl	$-32 * SIZE, BB

	decl	%eax
	jne	.L41
	ALIGN_4

.L42:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder loop: k % 8 iterations
	BRANCH
	je	.L44
	ALIGN_4

.L43:
	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L43
	ALIGN_4

.L44:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B, %eax, 2), BB
#endif

	addps	%xmm2, %xmm6
	addps	%xmm3, %xmm7

	pshufd	$0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	pshufd	$0xb1, %xmm7, %xmm7
	psllq	$63, %xmm0

#ifndef CONJ
	shufps	$0xb1, %xmm0, %xmm0
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6
#else
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#endif
#endif

	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6

#if defined(LN) || defined(LT)
	unpcklpd	%xmm6, %xmm4

	movaps	-32 * SIZE(BB), %xmm2
	subps	%xmm4, %xmm2
#else
	movsd	-32 * SIZE(AA), %xmm1
	movsd	-30 * SIZE(AA), %xmm5

	subps	%xmm4, %xmm1
	subps	%xmm6, %xmm5
#endif

#if defined(LN) || defined(LT)
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2
#endif

#ifdef RN
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1
	addps	%xmm3, %xmm1

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm2
	subps	%xmm3, %xmm5
	subps	%xmm2, %xmm5

	movaps	-28 * SIZE(BB), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	%xmm0, %xmm5
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm5
	addps	%xmm3, %xmm5
#endif

#ifdef RT
	movaps	-28 * SIZE(BB), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps	%xmm0, %xmm5
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm5
	addps	%xmm3, %xmm5

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm2
	subps	%xmm3, %xmm1
	subps	%xmm2, %xmm1

	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1
	addps	%xmm3, %xmm1
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm2, -32 * SIZE(BB)

	movlps	%xmm2, 0 * SIZE(CO1)
	movhps	%xmm2, 0 * SIZE(CO1, LDC)
#else
	movlps	%xmm1, -32 * SIZE(AA)
	movlps	%xmm5, -30 * SIZE(AA)

	movlps	%xmm1, 0 * SIZE(CO1)
	movlps	%xmm5, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

.L99:
#ifdef LN
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4
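
/* Annotation: common epilogue -- restore the callee-saved           */
/* registers, release the scratch area and return.                   */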
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE