
ztrsm_kernel_LN_2x2_penryn.S 34 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
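
/* 2x2 complex TRSM micro-kernel for 32-bit x86 SSE (Penryn-class
   cores). The LN/LT/RN/RT preprocessor symbols select which
   side/transpose variant of the solver is compiled from this one
   source; CONJ selects the conjugated forms. */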
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define A 24 + STACK + ARGS(%esp)
#define ARG_B 28 + STACK + ARGS(%esp)
#define C 32 + STACK + ARGS(%esp)
#define ARG_LDC 36 + STACK + ARGS(%esp)
#define OFFSET 40 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef ATOM
#define PREFETCH prefetcht0
#define PREFETCHSIZE 84
#endif

#ifdef NANO
#define PREFETCH prefetcht0
#define PREFETCHSIZE (16 * 2)
#endif

#define B %edi
#define LDC %ebp
#define AA %edx
#define BB %ecx
#define CO1 %esi

#define ADD1 addps
#define ADD2 addps
PROLOGUE

	subl $ARGS, %esp

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	movl ARG_B, B
	movl ARG_LDC, LDC
	movl OFFSET, %eax
#ifdef RN
	negl %eax
#endif
	movl %eax, KK

	movl M, %ebx
	testl %ebx, %ebx
	jle .L999

	subl $-32 * SIZE, A
	subl $-32 * SIZE, B

	sall $ZBASE_SHIFT, LDC

#ifdef LN
	movl M, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, C
	imull K, %eax
	addl %eax, A
#endif

#ifdef RT
	movl N, %eax
	sall $ZBASE_SHIFT, %eax
	imull K, %eax
	addl %eax, B

	movl N, %eax
	imull LDC, %eax
	addl %eax, C
#endif

#ifdef RN
	negl KK
#endif

#ifdef RT
	movl N, %eax
	subl OFFSET, %eax
	movl %eax, KK
#endif

	movl N, %eax
	movl %eax, J
	sarl $1, J
	jle .L100
	ALIGN_4
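
/* Outer loop over the columns of C, two at a time (J = N / 2). */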
.L01:
#if defined(LT) || defined(RN)
	movl A, %eax
	movl %eax, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif

#ifdef RT
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	subl %eax, B
#endif

	leal (, LDC, 2), %eax

#ifdef RT
	subl %eax, C
#endif
	movl C, CO1
#ifndef RT
	addl %eax, C
#endif

#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif

#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl M, %ebx
	andl $1, %ebx
	jle .L30

#ifdef LN
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	sall $ZBASE_SHIFT, %eax
	addl %eax, AA
#endif

	movl B, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, BB
#endif

	movsd -32 * SIZE(AA), %xmm0
	pxor %xmm2, %xmm2
	movaps -32 * SIZE(BB), %xmm1
	pxor %xmm3, %xmm3

	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L42
	ALIGN_4
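
/* 1x2 micro-kernel (odd M remainder), K unrolled by eight: one complex
   element of A against two columns of B per step. pshufd broadcasts
   the real and imaginary parts of each B entry and the partial
   products accumulate in xmm4-xmm7. */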
.L41:
	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3

	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -28 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -30 * SIZE(AA), %xmm0

	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -24 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -28 * SIZE(AA), %xmm0

	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -20 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -26 * SIZE(AA), %xmm0

	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -16 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -24 * SIZE(AA), %xmm0

	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -12 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -22 * SIZE(AA), %xmm0

	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -8 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -20 * SIZE(AA), %xmm0

	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -4 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -18 * SIZE(AA), %xmm0

	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps 0 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -16 * SIZE(AA), %xmm0

	subl $-16 * SIZE, AA
	subl $-32 * SIZE, BB
	decl %eax
	jne .L41
	ALIGN_4

.L42:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # remainder: k & 7
	BRANCH
	je .L44
	ALIGN_4
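
/* Handle the K % 8 leftover iterations one at a time. */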
.L43:
	addps %xmm2, %xmm6
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm7
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -28 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -30 * SIZE(AA), %xmm0

	addl $2 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L43
	ALIGN_4

.L44:
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $1, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 2), BB
#endif
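
/* Combine the partial products into complex accumulators. pcmpeqb plus
   psllq $63 builds a mask with the sign bit set in the high float of
   each 64-bit pair; the pxor/shufps below negate either the real or
   the imaginary parts, depending on the CONJ variant. */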
	addps %xmm2, %xmm6
	addps %xmm3, %xmm7

	pshufd $0xb1, %xmm5, %xmm5
	pcmpeqb %xmm0, %xmm0
	pshufd $0xb1, %xmm7, %xmm7
	psllq $63, %xmm0

#ifndef CONJ
	shufps $0xb1, %xmm0, %xmm0

	pxor %xmm0, %xmm5
	pxor %xmm0, %xmm7
#else
#if defined(LN) || defined(LT)
	pxor %xmm0, %xmm4
	pxor %xmm0, %xmm6
#else
	pxor %xmm0, %xmm5
	pxor %xmm0, %xmm7
#endif
#endif

	addps %xmm5, %xmm4
	addps %xmm7, %xmm6

#if defined(LN) || defined(LT)
	unpcklpd %xmm6, %xmm4

	movaps -32 * SIZE(BB), %xmm2

	subps %xmm4, %xmm2
#else
	movsd -32 * SIZE(AA), %xmm1
	movsd -30 * SIZE(AA), %xmm5

	subps %xmm4, %xmm1
	subps %xmm6, %xmm5
#endif
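
/* Substitution step. Each pshufd/xorps/mulps/addps group below is one
   complex multiply by an entry of the triangular factor; the diagonal
   entries are evidently stored pre-inverted in the packed panel, since
   the solve only ever multiplies and never divides. */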
#if defined(LN) || defined(LT)
	movaps -32 * SIZE(AA), %xmm5

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif

#ifdef RN
	movaps -32 * SIZE(BB), %xmm4

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1

	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm5
	subps %xmm2, %xmm5

	movaps -28 * SIZE(BB), %xmm4

	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7

	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5
#endif

#ifdef RT
	movaps -28 * SIZE(BB), %xmm4

	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7

	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm1
	subps %xmm2, %xmm1

	movaps -32 * SIZE(BB), %xmm4

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif

#ifdef LN
	subl $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movaps %xmm2, -32 * SIZE(BB)

	movlps %xmm2, 0 * SIZE(CO1)
	movhps %xmm2, 0 * SIZE(CO1, LDC)
#else
	movlps %xmm1, -32 * SIZE(AA)
	movlps %xmm5, -30 * SIZE(AA)

	movlps %xmm1, 0 * SIZE(CO1)
	movlps %xmm5, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 2), BB
#endif

#ifdef LN
	subl $1, KK
#endif

#ifdef LT
	addl $1, KK
#endif

#ifdef RT
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4

.L30:
	movl M, %ebx
	sarl $1, %ebx
	jle .L99
	ALIGN_4
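
/* Main 2x2 block loop over pairs of rows (M / 2 iterations). */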
.L10:
#ifdef LN
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AA
#endif

	movl B, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, BB
#endif

	movaps -32 * SIZE(AA), %xmm0
	pxor %xmm2, %xmm2
	movaps -32 * SIZE(BB), %xmm1
	pxor %xmm3, %xmm3

#ifdef LN
	pxor %xmm4, %xmm4
	prefetcht0 -4 * SIZE(CO1)
	pxor %xmm5, %xmm5
	prefetcht0 -4 * SIZE(CO1, LDC)
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7
#else
	pxor %xmm4, %xmm4
	prefetcht0 3 * SIZE(CO1)
	pxor %xmm5, %xmm5
	prefetcht0 3 * SIZE(CO1, LDC)
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L15
	ALIGN_4
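
/* 2x2 micro-kernel, K unrolled by eight: two complex elements of A
   against two of B per step. The 0xb1/0x1b shuffles generate the
   operand orderings a complex multiply needs; ADD1/ADD2 accumulate
   into xmm4-xmm7. */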
.L11:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -28 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -28 * SIZE(AA), %xmm0

	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -24 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -24 * SIZE(AA), %xmm0

	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -20 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -20 * SIZE(AA), %xmm0

	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -16 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)

	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -12 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0

	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -8 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0

	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -4 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -4 * SIZE(AA), %xmm0

	ADD2 %xmm2, %xmm7
	subl $-32 * SIZE, BB
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	subl $-32 * SIZE, AA
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -32 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -32 * SIZE(AA), %xmm0

	decl %eax
	jne .L11
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # remainder: k & 7
	BRANCH
	je .L14
	ALIGN_4
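
/* K % 8 remainder for the 2x2 block. */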
.L13:
	ADD2 %xmm2, %xmm7
	pshufd $0xb1, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	ADD1 %xmm3, %xmm6
	pshufd $0x1b, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	ADD2 %xmm2, %xmm5
	pshufd $0xb1, %xmm3, %xmm2
	mulps %xmm0, %xmm3
	ADD1 %xmm1, %xmm4
	movaps -28 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm2
	movaps -28 * SIZE(AA), %xmm0

	addl $4 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L13
	ALIGN_4

.L14:
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 2), BB
#endif

	ADD2 %xmm2, %xmm7
	pcmpeqb %xmm0, %xmm0
	ADD1 %xmm3, %xmm6
	psllq $63, %xmm0

#ifndef CONJ
	pxor %xmm0, %xmm4
	pxor %xmm0, %xmm6
	shufps $0xb1, %xmm0, %xmm0
#else
#if defined(LN) || defined(LT)
	pxor %xmm0, %xmm5
	pxor %xmm0, %xmm7
#else
	pshufd $0xb1, %xmm0, %xmm1
	pxor %xmm1, %xmm5
	pxor %xmm1, %xmm7
#endif
#endif
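
/* Reduce the partial sums: haddps folds the four accumulators into the
   2x2 complex result, which the shufps sequence puts back into row
   order before the substitution. */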
	haddps %xmm5, %xmm4
	haddps %xmm7, %xmm6

	shufps $0xd8, %xmm4, %xmm4
	shufps $0xd8, %xmm6, %xmm6

	movaps %xmm4, %xmm5
	shufps $0xe4, %xmm6, %xmm4
	shufps $0xe4, %xmm5, %xmm6

#if defined(LN) || defined(LT)
	movaps %xmm4, %xmm5
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5

	movaps -32 * SIZE(BB), %xmm2
	movaps -28 * SIZE(BB), %xmm3

	subps %xmm4, %xmm2
	subps %xmm5, %xmm3
#else
	movaps -32 * SIZE(AA), %xmm1
	movaps -28 * SIZE(AA), %xmm5

	subps %xmm4, %xmm1
	subps %xmm6, %xmm5
#endif

#ifdef LN
	movaps -28 * SIZE(AA), %xmm5

	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7

	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm2
	subps %xmm1, %xmm2

	movaps -32 * SIZE(AA), %xmm5

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif

#ifdef LT
	movaps -32 * SIZE(AA), %xmm5

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2

	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm3
	subps %xmm1, %xmm3

	movaps -28 * SIZE(AA), %xmm5

	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7

	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3
#endif

#ifdef RN
	movaps -32 * SIZE(BB), %xmm4

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1

	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm5
	subps %xmm2, %xmm5

	movaps -28 * SIZE(BB), %xmm4

	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7

	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5
#endif

#ifdef RT
	movaps -28 * SIZE(BB), %xmm4

	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7

	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5

#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm1
	subps %xmm2, %xmm1

	movaps -32 * SIZE(BB), %xmm4

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif

#ifdef LN
	subl $4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movaps %xmm2, -32 * SIZE(BB)
	movaps %xmm3, -28 * SIZE(BB)

	movlps %xmm2, 0 * SIZE(CO1)
	movlps %xmm3, 2 * SIZE(CO1)
	movhps %xmm2, 0 * SIZE(CO1, LDC)
	movhps %xmm3, 2 * SIZE(CO1, LDC)
#else
	movaps %xmm1, -32 * SIZE(AA)
	movaps %xmm5, -28 * SIZE(AA)

	movlps %xmm1, 0 * SIZE(CO1)
	movhps %xmm1, 2 * SIZE(CO1)
	movlps %xmm5, 0 * SIZE(CO1, LDC)
	movhps %xmm5, 2 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl $4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

#ifdef LN
	subl $2, KK
#endif

#ifdef LT
	addl $2, KK
#endif

#ifdef RT
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif

	decl %ebx
	jg .L10
	ALIGN_4

.L99:
#ifdef LN
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, B
#endif

#if defined(LT) || defined(RN)
	movl BB, B
#endif

#ifdef RN
	addl $2, KK
#endif

#ifdef RT
	subl $2, KK
#endif

	decl J # j --
	jg .L01
	ALIGN_4
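
/* Tail case: N is odd, so a single column of C remains. */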
.L100:
	movl N, %eax
	andl $1, %eax
	jle .L999

#if defined(LT) || defined(RN)
	movl A, %eax
	movl %eax, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif

#ifdef RT
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	subl %eax, B
#endif

#ifdef RT
	subl LDC, C
#endif
	movl C, CO1
#ifndef RT
	addl LDC, C
#endif

#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif

#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl M, %ebx
	andl $1, %ebx
	jle .L130

#ifdef LN
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	sall $ZBASE_SHIFT, %eax
	addl %eax, AA
#endif

	movl B, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, BB
#endif

	movsd -32 * SIZE(AA), %xmm0
	pxor %xmm2, %xmm2
	movsd -32 * SIZE(BB), %xmm1
	pxor %xmm3, %xmm3

	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L142
	ALIGN_4
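
/* 1x1 micro-kernel (odd M against the single column), K unrolled by
   eight. */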
.L141:
	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -30 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -30 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -28 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -28 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -26 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -26 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -24 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -24 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -22 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -22 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -20 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -20 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -18 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -18 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -16 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -16 * SIZE(AA), %xmm0

	subl $-16 * SIZE, AA
	subl $-16 * SIZE, BB
	decl %eax
	jne .L141
	ALIGN_4

.L142:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # remainder: k & 7
	BRANCH
	je .L144
	ALIGN_4

.L143:
	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -30 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movsd -30 * SIZE(AA), %xmm0

	addl $2 * SIZE, AA
	addl $2 * SIZE, BB
	decl %eax
	jg .L143
	ALIGN_4

.L144:
#if defined(LN) || defined(RT)
	movl KK, %eax
	subl $1, %eax
	movl AORIG, AA
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 1), BB
#endif

	addps %xmm2, %xmm4
	addps %xmm3, %xmm5

	pshufd $0xb1, %xmm5, %xmm5
	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

#ifndef CONJ
	shufps $0xb1, %xmm0, %xmm0

	pxor %xmm0, %xmm5
#else
#if defined(LN) || defined(LT)
	pxor %xmm0, %xmm4
#else
	pxor %xmm0, %xmm5
#endif
#endif

	addps %xmm5, %xmm4

#if defined(LN) || defined(LT)
	movsd -32 * SIZE(BB), %xmm2
	subps %xmm4, %xmm2
#else
	movsd -32 * SIZE(AA), %xmm1
	subps %xmm4, %xmm1
#endif

#if defined(LN) || defined(LT)
	movaps -32 * SIZE(AA), %xmm5

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif

#if defined(RN) || defined(RT)
	movaps -32 * SIZE(BB), %xmm4

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif

#ifdef LN
	subl $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlps %xmm2, -32 * SIZE(BB)

	movlps %xmm2, 0 * SIZE(CO1)
#else
	movlps %xmm1, -32 * SIZE(AA)

	movlps %xmm1, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 1), BB
#endif

#ifdef LN
	subl $1, KK
#endif

#ifdef LT
	addl $1, KK
#endif

#ifdef RT
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4

.L130:
	movl M, %ebx
	sarl $1, %ebx
	jle .L149
	ALIGN_4
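
/* 2x1 micro-kernel: pairs of rows against the single remaining
   column. */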
.L110:
#ifdef LN
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AA
#endif

	movl B, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, BB
#endif

	movaps -32 * SIZE(AA), %xmm0
	pxor %xmm2, %xmm2
	movsd -32 * SIZE(BB), %xmm1
	pxor %xmm3, %xmm3
	movhps -30 * SIZE(BB), %xmm1
	pxor %xmm4, %xmm4

#ifdef LN
	prefetcht0 -4 * SIZE(CO1)
#else
	prefetcht0 3 * SIZE(CO1)
#endif
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L112
	ALIGN_4

.L111:
	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2

	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	movaps -28 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -28 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movaps -24 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	movaps -20 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -24 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movaps -16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	movaps -12 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -20 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movaps -8 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	mulps %xmm0, %xmm3
	movaps -4 * SIZE(AA), %xmm0

	addps %xmm2, %xmm4
	pshufd $0xaa, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0xff, %xmm1, %xmm3
	movaps -16 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movaps 0 * SIZE(AA), %xmm0

	subl $-32 * SIZE, AA
	subl $-16 * SIZE, BB
	decl %eax
	jne .L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # remainder: k & 7
	BRANCH
	je .L114
	ALIGN_4

.L113:
	addps %xmm2, %xmm4
	pshufd $0x00, %xmm1, %xmm2
	mulps %xmm0, %xmm2
	addps %xmm3, %xmm5
	pshufd $0x55, %xmm1, %xmm3
	movsd -30 * SIZE(BB), %xmm1
	mulps %xmm0, %xmm3
	movaps -28 * SIZE(AA), %xmm0

	addl $4 * SIZE, AA
	addl $2 * SIZE, BB
	decl %eax
	jg .L113
	ALIGN_4

.L114:
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $1, %eax
#endif
	movl AORIG, AA
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 1), BB
#endif

	addps %xmm2, %xmm4
	addps %xmm3, %xmm5

	pshufd $0xb1, %xmm5, %xmm5
	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

#ifndef CONJ
	shufps $0xb1, %xmm0, %xmm0

	pxor %xmm0, %xmm5
#else
#if defined(LN) || defined(LT)
	pxor %xmm0, %xmm4
#else
	pxor %xmm0, %xmm5
#endif
#endif

	addps %xmm5, %xmm4

#if defined(LN) || defined(LT)
	movaps %xmm4, %xmm5
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5

	movsd -32 * SIZE(BB), %xmm2
	movsd -30 * SIZE(BB), %xmm3

	subps %xmm4, %xmm2
	subps %xmm5, %xmm3
#else
	movaps -32 * SIZE(AA), %xmm1
	subps %xmm4, %xmm1
#endif

#ifdef LN
	movaps -28 * SIZE(AA), %xmm5

	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7

	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm2
	subps %xmm1, %xmm2

	movaps -32 * SIZE(AA), %xmm5

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif

#ifdef LT
	movaps -32 * SIZE(AA), %xmm5

	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2

	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7

	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm3
	subps %xmm1, %xmm3

	movaps -28 * SIZE(AA), %xmm5

	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7

	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif

	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3
#endif

#if defined(RN) || defined(RT)
	movaps -32 * SIZE(BB), %xmm4

	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7

	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif

	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif

#ifdef LN
	subl $4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlps %xmm2, -32 * SIZE(BB)
	movlps %xmm3, -30 * SIZE(BB)

	movlps %xmm2, 0 * SIZE(CO1)
	movlps %xmm3, 2 * SIZE(CO1)
#else
	movaps %xmm1, -32 * SIZE(AA)

	movlps %xmm1, 0 * SIZE(CO1)
	movhps %xmm1, 2 * SIZE(CO1)
#endif

#ifndef LN
	addl $4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 1), BB
#endif

#ifdef LN
	subl $2, KK
#endif

#ifdef LT
	addl $2, KK
#endif

#ifdef RT
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif

	decl %ebx # i --
	jg .L110
	ALIGN_4

.L149:
#ifdef LN
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, B
#endif

#if defined(LT) || defined(RN)
	movl BB, B
#endif

#ifdef RN
	addl $1, KK
#endif

#ifdef RT
	subl $1, KK
#endif
	ALIGN_4
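
/* Common exit: restore the callee-saved registers, release the scratch
   area and return. */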
.L999:
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp

	addl $ARGS, %esp
	ret

	EPILOGUE