
ztrsm_kernel_RT_2x2_sse2.S 43 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
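/* ztrsm kernel for double-complex operands: 2x2 register blocking, SSE2.    */
/* The LN/LT/RN/RT conditionals below cover all side/transpose cases from    */
/* one source file; as the file name suggests, this build is intended for    */
/* the RT (right-hand side, transposed) case.                                */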
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define M %r13
  43. #define N %r14
  44. #define K %rdx
  45. #define A %rcx
  46. #define B %r8
  47. #define C %r9
  48. #define LDC %r10
  49. #define I %r11
  50. #define J %r12
  51. #define AO %rdi
  52. #define BO %rsi
  53. #define CO1 %r15
  54. #define CO2 %rbp
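/* Register roles: M, N, K hold the problem sizes, A/B/C/LDC the matrix      */
/* arguments, I/J the loop counters, AO/BO the current panel pointers, and   */
/* CO1/CO2 the two output column pointers into C.                            */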
  55. #ifndef WINDOWS_ABI
  56. #define STACKSIZE 64
  57. #define OLD_LDC 8 + STACKSIZE(%rsp)
  58. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  59. #else
  60. #define STACKSIZE 256
  61. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  62. #define OLD_A 48 + STACKSIZE(%rsp)
  63. #define OLD_B 56 + STACKSIZE(%rsp)
  64. #define OLD_C 64 + STACKSIZE(%rsp)
  65. #define OLD_LDC 72 + STACKSIZE(%rsp)
  66. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  67. #endif
  68. #define POSINV 0(%rsp)
  69. #define ALPHA_R 16(%rsp)
  70. #define ALPHA_I 32(%rsp)
  71. #define OFFSET 40(%rsp)
  72. #define KK 48(%rsp)
  73. #define KKK 56(%rsp)
  74. #define AORIG 64(%rsp)
  75. #define BORIG 72(%rsp)
  76. #define BUFFER 128(%rsp)
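/* Local stack frame: POSINV holds the sign mask used to flip imaginary      */
/* parts, OFFSET/KK/KKK carry the TRSM offset bookkeeping, AORIG/BORIG save  */
/* the panel origins, and BUFFER is the scratch area into which the B panel  */
/* is expanded with every scalar duplicated.                                 */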
  77. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
  78. #define PREFETCH prefetch
  79. #define PREFETCHW prefetchw
  80. #define PREFETCHNTA prefetchnta
  81. #define PREFETCHSIZE (8 * 6 + 4)
  82. #endif
  83. #ifdef GENERIC
  84. #define PREFETCH prefetcht0
  85. #define PREFETCHW prefetcht0
  86. #define PREFETCHNTA prefetchnta
  87. #define PREFETCHSIZE (8 * 6 + 4)
  88. #endif
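/* KERNEL1..KERNEL8 form the software-pipelined body of the main 2x2 loop:   */
/* each macro multiplies a packed pair loaded from AO by duplicated values   */
/* from BO, accumulates into %xmm0-%xmm7, and preloads operands for the      */
/* following step; the (xx) argument advances the addressing as the loop is  */
/* unrolled in .L1X below.                                                   */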
  89. #define KERNEL1(xx) \
  90. mulpd %xmm8, %xmm9 ;\
  91. addpd %xmm9, %xmm0 ;\
  92. movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  93. mulpd %xmm8, %xmm11 ;\
  94. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
  95. addpd %xmm11, %xmm1 ;\
  96. movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  97. mulpd %xmm8, %xmm13 ;\
  98. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
  99. addpd %xmm13, %xmm2 ;\
  100. movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  101. addpd %xmm8, %xmm3 ;\
  102. movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
  103. #define KERNEL2(xx) \
  104. mulpd %xmm10, %xmm9 ;\
  105. addpd %xmm9, %xmm4 ;\
  106. movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  107. mulpd %xmm10, %xmm11 ;\
  108. addpd %xmm11, %xmm5 ;\
  109. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  110. mulpd %xmm10, %xmm13 ;\
  111. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
  112. addpd %xmm13, %xmm6 ;\
  113. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  114. addpd %xmm10, %xmm7 ;\
  115. movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
  116. #define KERNEL3(xx) \
  117. mulpd %xmm12, %xmm15 ;\
  118. addpd %xmm15, %xmm0 ;\
  119. movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  120. mulpd %xmm12, %xmm11 ;\
  121. addpd %xmm11, %xmm1 ;\
  122. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  123. mulpd %xmm12, %xmm13 ;\
  124. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
  125. addpd %xmm13, %xmm2 ;\
  126. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  127. addpd %xmm12, %xmm3 ;\
  128. movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
  129. #define KERNEL4(xx) \
  130. mulpd %xmm14, %xmm15 ;\
  131. addpd %xmm15, %xmm4 ;\
  132. movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  133. mulpd %xmm14, %xmm11 ;\
  134. addpd %xmm11, %xmm5 ;\
  135. movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  136. mulpd %xmm14, %xmm13 ;\
  137. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
  138. addpd %xmm13, %xmm6 ;\
  139. movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  140. addpd %xmm14, %xmm7 ;\
  141. movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
  142. #define KERNEL5(xx) \
  143. mulpd %xmm8, %xmm9 ;\
  144. addpd %xmm9, %xmm0 ;\
  145. movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  146. mulpd %xmm8, %xmm11 ;\
  147. PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
  148. addpd %xmm11, %xmm1 ;\
  149. movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  150. mulpd %xmm8, %xmm13 ;\
  151. mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
  152. addpd %xmm13, %xmm2 ;\
  153. movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  154. addpd %xmm8, %xmm3 ;\
  155. movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
  156. #define KERNEL6(xx) \
  157. mulpd %xmm10, %xmm9 ;\
  158. addpd %xmm9, %xmm4 ;\
  159. movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  160. mulpd %xmm10, %xmm11 ;\
  161. addpd %xmm11, %xmm5 ;\
  162. movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  163. mulpd %xmm10, %xmm13 ;\
  164. mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
  165. addpd %xmm13, %xmm6 ;\
  166. movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  167. addpd %xmm10, %xmm7 ;\
  168. movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
  169. #define KERNEL7(xx) \
  170. mulpd %xmm12, %xmm15 ;\
  171. addpd %xmm15, %xmm0 ;\
  172. movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  173. mulpd %xmm12, %xmm11 ;\
  174. addpd %xmm11, %xmm1 ;\
  175. movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  176. mulpd %xmm12, %xmm13 ;\
  177. mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
  178. addpd %xmm13, %xmm2 ;\
  179. movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  180. addpd %xmm12, %xmm3 ;\
  181. movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
  182. #define KERNEL8(xx) \
  183. mulpd %xmm14, %xmm15 ;\
  184. addpd %xmm15, %xmm4 ;\
  185. movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  186. mulpd %xmm14, %xmm11 ;\
  187. addpd %xmm11, %xmm5 ;\
  188. movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  189. mulpd %xmm14, %xmm13 ;\
  190. mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
  191. addpd %xmm13, %xmm6 ;\
  192. movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  193. addpd %xmm14, %xmm7 ;\
  194. movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
  195. #ifndef CONJ
  196. #define NN
  197. #else
  198. #if defined(LN) || defined(LT)
  199. #define CN
  200. #else
  201. #define NC
  202. #endif
  203. #endif
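/* Map the CONJ flag onto the NN/CN/NC symbols tested by the sign-selection  */
/* #ifs further down (plain, conjugate on the left, conjugate on the right). */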
  204. PROLOGUE
  205. PROFCODE
  206. subq $STACKSIZE, %rsp
  207. movq %rbx, 0(%rsp)
  208. movq %rbp, 8(%rsp)
  209. movq %r12, 16(%rsp)
  210. movq %r13, 24(%rsp)
  211. movq %r14, 32(%rsp)
  212. movq %r15, 40(%rsp)
  213. #ifdef WINDOWS_ABI
  214. movq %rdi, 48(%rsp)
  215. movq %rsi, 56(%rsp)
  216. movups %xmm6, 64(%rsp)
  217. movups %xmm7, 80(%rsp)
  218. movups %xmm8, 96(%rsp)
  219. movups %xmm9, 112(%rsp)
  220. movups %xmm10, 128(%rsp)
  221. movups %xmm11, 144(%rsp)
  222. movups %xmm12, 160(%rsp)
  223. movups %xmm13, 176(%rsp)
  224. movups %xmm14, 192(%rsp)
  225. movups %xmm15, 208(%rsp)
  226. movq ARG1, OLD_M
  227. movq ARG2, OLD_N
  228. movq ARG3, K
  229. movq OLD_A, A
  230. movq OLD_B, B
  231. movq OLD_C, C
  232. movq OLD_LDC, LDC
  233. movsd OLD_OFFSET, %xmm4
  234. movaps %xmm3, %xmm0
  235. #else
  236. movq OLD_LDC, LDC
  237. movsd OLD_OFFSET, %xmm4
  238. #endif
  239. movq %rsp, %rbx # save old stack
  240. subq $128 + LOCAL_BUFFER_SIZE, %rsp
  241. andq $-4096, %rsp # align stack
  242. STACK_TOUCHING
  243. movq OLD_M, M
  244. movq OLD_N, N
  245. pcmpeqb %xmm15, %xmm15
  246. psllq $63, %xmm15 # Generate mask
  247. pxor %xmm2, %xmm2
  248. movlpd %xmm2, 0 + POSINV
  249. movlpd %xmm15, 8 + POSINV
  250. movlpd %xmm4, OFFSET
  251. movlpd %xmm4, KK
  252. salq $ZBASE_SHIFT, LDC
  253. #ifdef LN
  254. movq M, %rax
  255. salq $ZBASE_SHIFT, %rax
  256. addq %rax, C
  257. imulq K, %rax
  258. addq %rax, A
  259. #endif
  260. #ifdef RT
  261. movq N, %rax
  262. salq $ZBASE_SHIFT, %rax
  263. imulq K, %rax
  264. addq %rax, B
  265. movq N, %rax
  266. imulq LDC, %rax
  267. addq %rax, C
  268. #endif
  269. #ifdef RN
  270. negq KK
  271. #endif
  272. #ifdef RT
  273. movq N, %rax
  274. subq OFFSET, %rax
  275. movq %rax, KK
  276. #endif
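/* An odd trailing column of B (N & 1) is handled first by .L101; the        */
/* remaining columns are then processed two at a time starting at .L100.     */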
  277. testq $1, N
  278. jle .L100
  279. .L101:
  280. #ifdef LN
  281. movq OFFSET, %rax
  282. addq M, %rax
  283. movq %rax, KK
  284. #endif
  285. /* Copying to Sub Buffer */
  286. leaq BUFFER, BO
  287. #ifdef RT
  288. movq K, %rax
  289. salq $0 + ZBASE_SHIFT, %rax
  290. subq %rax, B
  291. #endif
  292. #if defined(LN) || defined(RT)
  293. movq KK, %rax
  294. movq B, BORIG
  295. salq $ZBASE_SHIFT, %rax
  296. leaq (B, %rax, 1), B
  297. leaq (BO, %rax, 2), BO
  298. #endif
  299. #if defined(LT)
  300. movq OFFSET, %rax
  301. movq %rax, KK
  302. #endif
  303. #if defined(LT) || defined(RN)
  304. movq KK, %rax
  305. #else
  306. movq K, %rax
  307. subq KK, %rax
  308. #endif
  309. sarq $2, %rax
  310. jle .L103
  311. ALIGN_4
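/* .L102/.L104: expand the current B column into BUFFER, storing each        */
/* scalar twice so it can be used directly as a packed mulpd operand.        */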
  312. .L102:
  313. movlpd 0 * SIZE(B), %xmm0
  314. movlpd 1 * SIZE(B), %xmm1
  315. movlpd 2 * SIZE(B), %xmm2
  316. movlpd 3 * SIZE(B), %xmm3
  317. movlpd 4 * SIZE(B), %xmm4
  318. movlpd 5 * SIZE(B), %xmm5
  319. movlpd 6 * SIZE(B), %xmm6
  320. movlpd 7 * SIZE(B), %xmm7
  321. movlpd %xmm0, 0 * SIZE(BO)
  322. movlpd %xmm0, 1 * SIZE(BO)
  323. movlpd %xmm1, 2 * SIZE(BO)
  324. movlpd %xmm1, 3 * SIZE(BO)
  325. movlpd %xmm2, 4 * SIZE(BO)
  326. movlpd %xmm2, 5 * SIZE(BO)
  327. movlpd %xmm3, 6 * SIZE(BO)
  328. movlpd %xmm3, 7 * SIZE(BO)
  329. movlpd %xmm4, 8 * SIZE(BO)
  330. movlpd %xmm4, 9 * SIZE(BO)
  331. movlpd %xmm5, 10 * SIZE(BO)
  332. movlpd %xmm5, 11 * SIZE(BO)
  333. movlpd %xmm6, 12 * SIZE(BO)
  334. movlpd %xmm6, 13 * SIZE(BO)
  335. movlpd %xmm7, 14 * SIZE(BO)
  336. movlpd %xmm7, 15 * SIZE(BO)
  337. subq $-16 * SIZE, BO
  338. addq $ 8 * SIZE, B
  339. decq %rax
  340. jne .L102
  341. ALIGN_4
  342. .L103:
  343. #if defined(LT) || defined(RN)
  344. movq KK, %rax
  345. #else
  346. movq K, %rax
  347. subq KK, %rax
  348. #endif
  349. andq $3, %rax
  350. BRANCH
  351. jle .L105
  352. ALIGN_4
  353. .L104:
  354. movlpd 0 * SIZE(B), %xmm0
  355. movlpd 1 * SIZE(B), %xmm1
  356. movlpd %xmm0, 0 * SIZE(BO)
  357. movlpd %xmm0, 1 * SIZE(BO)
  358. movlpd %xmm1, 2 * SIZE(BO)
  359. movlpd %xmm1, 3 * SIZE(BO)
  360. addq $4 * SIZE, BO
  361. addq $2 * SIZE, B
  362. decq %rax
  363. jne .L104
  364. ALIGN_4
  365. .L105:
  366. #if defined(LT) || defined(RN)
  367. movq A, AO
  368. #else
  369. movq A, AORIG
  370. #endif
  371. #ifdef RT
  372. subq LDC, C
  373. #endif
  374. movq C, CO1
  375. #ifndef RT
  376. addq LDC, C
  377. #endif
  378. movq M, I
  379. sarq $1, I # i = (m >> 1)
  380. jle .L130
  381. ALIGN_4
  382. .L110:
  383. #ifdef LN
  384. movq K, %rax
  385. salq $1 + ZBASE_SHIFT, %rax
  386. subq %rax, AORIG
  387. #endif
  388. #if defined(LN) || defined(RT)
  389. movq KK, %rax
  390. movq AORIG, AO
  391. salq $ZBASE_SHIFT, %rax
  392. leaq (AO, %rax, 2), AO
  393. #endif
  394. leaq BUFFER, BO
  395. #if defined(LN) || defined(RT)
  396. movq KK, %rax
  397. salq $0 + ZBASE_SHIFT, %rax
  398. leaq (BO, %rax, 2), BO
  399. #endif
  400. pxor %xmm0, %xmm0
  401. pxor %xmm1, %xmm1
  402. pxor %xmm4, %xmm4
  403. pxor %xmm5, %xmm5
  404. PREFETCHW 4 * SIZE(CO1)
  405. #if defined(LT) || defined(RN)
  406. movq KK, %rax
  407. #else
  408. movq K, %rax
  409. subq KK, %rax
  410. #endif
  411. sarq $2, %rax
  412. je .L112
  413. .L111:
  414. movapd 0 * SIZE(AO), %xmm8
  415. movapd 0 * SIZE(BO), %xmm9
  416. mulpd %xmm8, %xmm9
  417. addpd %xmm9, %xmm0
  418. mulpd 2 * SIZE(BO), %xmm8
  419. addpd %xmm8, %xmm1
  420. movapd 2 * SIZE(AO), %xmm8
  421. movapd 0 * SIZE(BO), %xmm9
  422. mulpd %xmm8, %xmm9
  423. addpd %xmm9, %xmm4
  424. mulpd 2 * SIZE(BO), %xmm8
  425. addpd %xmm8, %xmm5
  426. movapd 4 * SIZE(AO), %xmm8
  427. movapd 4 * SIZE(BO), %xmm9
  428. mulpd %xmm8, %xmm9
  429. addpd %xmm9, %xmm0
  430. mulpd 6 * SIZE(BO), %xmm8
  431. addpd %xmm8, %xmm1
  432. movapd 6 * SIZE(AO), %xmm8
  433. movapd 4 * SIZE(BO), %xmm9
  434. mulpd %xmm8, %xmm9
  435. addpd %xmm9, %xmm4
  436. mulpd 6 * SIZE(BO), %xmm8
  437. addpd %xmm8, %xmm5
  438. movapd 8 * SIZE(AO), %xmm8
  439. movapd 8 * SIZE(BO), %xmm9
  440. mulpd %xmm8, %xmm9
  441. addpd %xmm9, %xmm0
  442. mulpd 10 * SIZE(BO), %xmm8
  443. addpd %xmm8, %xmm1
  444. movapd 10 * SIZE(AO), %xmm8
  445. movapd 8 * SIZE(BO), %xmm9
  446. mulpd %xmm8, %xmm9
  447. addpd %xmm9, %xmm4
  448. mulpd 10 * SIZE(BO), %xmm8
  449. addpd %xmm8, %xmm5
  450. movapd 12 * SIZE(AO), %xmm8
  451. movapd 12 * SIZE(BO), %xmm9
  452. mulpd %xmm8, %xmm9
  453. addpd %xmm9, %xmm0
  454. mulpd 14 * SIZE(BO), %xmm8
  455. addpd %xmm8, %xmm1
  456. movapd 14 * SIZE(AO), %xmm8
  457. movapd 12 * SIZE(BO), %xmm9
  458. mulpd %xmm8, %xmm9
  459. addpd %xmm9, %xmm4
  460. mulpd 14 * SIZE(BO), %xmm8
  461. addpd %xmm8, %xmm5
  462. addq $16 * SIZE, AO
  463. addq $16 * SIZE, BO
  464. decq %rax
  465. jne .L111
  466. ALIGN_4
  467. .L112:
  468. #if defined(LT) || defined(RN)
  469. movq KK, %rax
  470. #else
  471. movq K, %rax
  472. subq KK, %rax
  473. #endif
  474. movapd POSINV, %xmm15
  475. andq $3, %rax # if (k & 3)
  476. BRANCH
  477. jle .L114
  478. .L113:
  479. movapd 0 * SIZE(AO), %xmm8
  480. movapd 0 * SIZE(BO), %xmm9
  481. mulpd %xmm8, %xmm9
  482. addpd %xmm9, %xmm0
  483. mulpd 2 * SIZE(BO), %xmm8
  484. addpd %xmm8, %xmm1
  485. movapd 2 * SIZE(AO), %xmm8
  486. movapd 0 * SIZE(BO), %xmm9
  487. mulpd %xmm8, %xmm9
  488. addpd %xmm9, %xmm4
  489. mulpd 2 * SIZE(BO), %xmm8
  490. addpd %xmm8, %xmm5
  491. addq $4 * SIZE, AO # aoffset += 4
  492. addq $4 * SIZE, BO # boffset1 += 4
  493. decq %rax
  494. jg .L113
  495. ALIGN_4
  496. .L114:
  497. #if defined(LN) || defined(RT)
  498. movq KK, %rax
  499. #ifdef LN
  500. subq $2, %rax
  501. #else
  502. subq $1, %rax
  503. #endif
  504. movq AORIG, AO
  505. movq BORIG, B
  506. leaq BUFFER, BO
  507. salq $ZBASE_SHIFT, %rax
  508. leaq (AO, %rax, 2), AO
  509. leaq (B, %rax, 1), B
  510. leaq (BO, %rax, 2), BO
  511. #endif
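/* Combine the real/imaginary partial sums according to the conjugation      */
/* case, subtract from the packed right-hand side, apply the small           */
/* triangular solve for this tile (LN/LT/RN/RT blocks), and write the        */
/* result back to C and to the packed panels.                                */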
  512. SHUFPD_1 %xmm1, %xmm1
  513. SHUFPD_1 %xmm5, %xmm5
  514. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  515. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  516. xorpd %xmm15, %xmm1
  517. xorpd %xmm15, %xmm5
  518. #else
  519. xorpd %xmm15, %xmm0
  520. xorpd %xmm15, %xmm4
  521. #endif
  522. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  523. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  524. subpd %xmm1, %xmm0
  525. subpd %xmm5, %xmm4
  526. #else
  527. addpd %xmm1, %xmm0
  528. addpd %xmm5, %xmm4
  529. #endif
  530. #if defined(LN) || defined(LT)
  531. movapd 0 * SIZE(B), %xmm1
  532. movapd 2 * SIZE(B), %xmm5
  533. subpd %xmm0, %xmm1
  534. subpd %xmm4, %xmm5
  535. #else
  536. movapd 0 * SIZE(AO), %xmm1
  537. movapd 2 * SIZE(AO), %xmm5
  538. subpd %xmm0, %xmm1
  539. subpd %xmm4, %xmm5
  540. #endif
  541. #ifndef CONJ
  542. SHUFPD_1 %xmm15, %xmm15
  543. #endif
  544. #ifdef LN
  545. movlpd 6 * SIZE(AO), %xmm8
  546. movhpd 6 * SIZE(AO), %xmm8
  547. movlpd 7 * SIZE(AO), %xmm9
  548. movhpd 7 * SIZE(AO), %xmm9
  549. movlpd 4 * SIZE(AO), %xmm10
  550. movhpd 4 * SIZE(AO), %xmm10
  551. movlpd 5 * SIZE(AO), %xmm11
  552. movhpd 5 * SIZE(AO), %xmm11
  553. movlpd 0 * SIZE(AO), %xmm12
  554. movhpd 0 * SIZE(AO), %xmm12
  555. movlpd 1 * SIZE(AO), %xmm13
  556. movhpd 1 * SIZE(AO), %xmm13
  557. pshufd $0x4e, %xmm5, %xmm4
  558. xorpd %xmm15, %xmm4
  559. mulpd %xmm8, %xmm5
  560. mulpd %xmm9, %xmm4
  561. addpd %xmm4, %xmm5
  562. movapd %xmm5, %xmm0
  563. pshufd $0x4e, %xmm5, %xmm4
  564. xorpd %xmm15, %xmm4
  565. mulpd %xmm10, %xmm0
  566. mulpd %xmm11, %xmm4
  567. subpd %xmm0, %xmm1
  568. subpd %xmm4, %xmm1
  569. pshufd $0x4e, %xmm1, %xmm0
  570. xorpd %xmm15, %xmm0
  571. mulpd %xmm12, %xmm1
  572. mulpd %xmm13, %xmm0
  573. addpd %xmm0, %xmm1
  574. #endif
  575. #ifdef LT
  576. movlpd 0 * SIZE(AO), %xmm8
  577. movhpd 0 * SIZE(AO), %xmm8
  578. movlpd 1 * SIZE(AO), %xmm9
  579. movhpd 1 * SIZE(AO), %xmm9
  580. movlpd 2 * SIZE(AO), %xmm10
  581. movhpd 2 * SIZE(AO), %xmm10
  582. movlpd 3 * SIZE(AO), %xmm11
  583. movhpd 3 * SIZE(AO), %xmm11
  584. movlpd 6 * SIZE(AO), %xmm12
  585. movhpd 6 * SIZE(AO), %xmm12
  586. movlpd 7 * SIZE(AO), %xmm13
  587. movhpd 7 * SIZE(AO), %xmm13
  588. pshufd $0x4e, %xmm1, %xmm0
  589. xorpd %xmm15, %xmm0
  590. mulpd %xmm8, %xmm1
  591. mulpd %xmm9, %xmm0
  592. addpd %xmm0, %xmm1
  593. movapd %xmm1, %xmm0
  594. pshufd $0x4e, %xmm1, %xmm4
  595. xorpd %xmm15, %xmm4
  596. mulpd %xmm10, %xmm0
  597. mulpd %xmm11, %xmm4
  598. subpd %xmm0, %xmm5
  599. subpd %xmm4, %xmm5
  600. pshufd $0x4e, %xmm5, %xmm4
  601. xorpd %xmm15, %xmm4
  602. mulpd %xmm12, %xmm5
  603. mulpd %xmm13, %xmm4
  604. addpd %xmm4, %xmm5
  605. #endif
  606. #ifdef RN
  607. movlpd 0 * SIZE(B), %xmm8
  608. movhpd 0 * SIZE(B), %xmm8
  609. movlpd 1 * SIZE(B), %xmm9
  610. movhpd 1 * SIZE(B), %xmm9
  611. pshufd $0x4e, %xmm1, %xmm0
  612. pshufd $0x4e, %xmm5, %xmm4
  613. xorpd %xmm15, %xmm0
  614. xorpd %xmm15, %xmm4
  615. mulpd %xmm8, %xmm1
  616. mulpd %xmm9, %xmm0
  617. mulpd %xmm8, %xmm5
  618. mulpd %xmm9, %xmm4
  619. addpd %xmm0, %xmm1
  620. addpd %xmm4, %xmm5
  621. #endif
  622. #ifdef RT
  623. movlpd 0 * SIZE(B), %xmm8
  624. movhpd 0 * SIZE(B), %xmm8
  625. movlpd 1 * SIZE(B), %xmm9
  626. movhpd 1 * SIZE(B), %xmm9
  627. pshufd $0x4e, %xmm1, %xmm0
  628. pshufd $0x4e, %xmm5, %xmm4
  629. xorpd %xmm15, %xmm0
  630. xorpd %xmm15, %xmm4
  631. mulpd %xmm8, %xmm1
  632. mulpd %xmm9, %xmm0
  633. mulpd %xmm8, %xmm5
  634. mulpd %xmm9, %xmm4
  635. addpd %xmm0, %xmm1
  636. addpd %xmm4, %xmm5
  637. #endif
  638. #ifdef LN
  639. subq $4 * SIZE, CO1
  640. #endif
  641. movsd %xmm1, 0 * SIZE(CO1)
  642. movhpd %xmm1, 1 * SIZE(CO1)
  643. movsd %xmm5, 2 * SIZE(CO1)
  644. movhpd %xmm5, 3 * SIZE(CO1)
  645. #if defined(LN) || defined(LT)
  646. movapd %xmm1, 0 * SIZE(B)
  647. movapd %xmm5, 2 * SIZE(B)
  648. movlpd %xmm1, 0 * SIZE(BO)
  649. movlpd %xmm1, 1 * SIZE(BO)
  650. movhpd %xmm1, 2 * SIZE(BO)
  651. movhpd %xmm1, 3 * SIZE(BO)
  652. movlpd %xmm5, 4 * SIZE(BO)
  653. movlpd %xmm5, 5 * SIZE(BO)
  654. movhpd %xmm5, 6 * SIZE(BO)
  655. movhpd %xmm5, 7 * SIZE(BO)
  656. #else
  657. movapd %xmm1, 0 * SIZE(AO)
  658. movapd %xmm5, 2 * SIZE(AO)
  659. #endif
  660. #ifndef LN
  661. addq $4 * SIZE, CO1
  662. #endif
  663. #if defined(LT) || defined(RN)
  664. movq K, %rax
  665. subq KK, %rax
  666. salq $ZBASE_SHIFT, %rax
  667. leaq (AO, %rax, 2), AO
  668. #ifdef LT
  669. addq $4 * SIZE, B
  670. #endif
  671. #endif
  672. #ifdef LN
  673. subq $2, KK
  674. movq BORIG, B
  675. #endif
  676. #ifdef LT
  677. addq $2, KK
  678. #endif
  679. #ifdef RT
  680. movq K, %rax
  681. movq BORIG, B
  682. salq $1 + ZBASE_SHIFT, %rax
  683. addq %rax, AORIG
  684. #endif
  685. decq I # i --
  686. jg .L110
  687. ALIGN_4
  688. .L130:
  689. testq $1, M
  690. jle .L199
  691. ALIGN_4
  692. .L140:
  693. #ifdef LN
  694. movq K, %rax
  695. salq $0 + ZBASE_SHIFT, %rax
  696. subq %rax, AORIG
  697. #endif
  698. #if defined(LN) || defined(RT)
  699. movq KK, %rax
  700. movq AORIG, AO
  701. salq $ZBASE_SHIFT, %rax
  702. leaq (AO, %rax, 1), AO
  703. #endif
  704. leaq BUFFER, BO
  705. #if defined(LN) || defined(RT)
  706. movq KK, %rax
  707. salq $0 + ZBASE_SHIFT, %rax
  708. leaq (BO, %rax, 2), BO
  709. #endif
  710. pxor %xmm0, %xmm0
  711. pxor %xmm1, %xmm1
  712. pxor %xmm2, %xmm2
  713. pxor %xmm3, %xmm3
  714. #if defined(LT) || defined(RN)
  715. movq KK, %rax
  716. #else
  717. movq K, %rax
  718. subq KK, %rax
  719. #endif
  720. sarq $2, %rax
  721. je .L142
  722. .L141:
  723. movapd 0 * SIZE(AO), %xmm8
  724. movapd 0 * SIZE(BO), %xmm9
  725. mulpd %xmm8, %xmm9
  726. addpd %xmm9, %xmm0
  727. mulpd 2 * SIZE(BO), %xmm8
  728. addpd %xmm8, %xmm1
  729. movapd 2 * SIZE(AO), %xmm8
  730. movapd 4 * SIZE(BO), %xmm9
  731. mulpd %xmm8, %xmm9
  732. addpd %xmm9, %xmm2
  733. mulpd 6 * SIZE(BO), %xmm8
  734. addpd %xmm8, %xmm3
  735. movapd 4 * SIZE(AO), %xmm8
  736. movapd 8 * SIZE(BO), %xmm9
  737. mulpd %xmm8, %xmm9
  738. addpd %xmm9, %xmm0
  739. mulpd 10 * SIZE(BO), %xmm8
  740. addpd %xmm8, %xmm1
  741. movapd 6 * SIZE(AO), %xmm8
  742. movapd 12 * SIZE(BO), %xmm9
  743. mulpd %xmm8, %xmm9
  744. addpd %xmm9, %xmm2
  745. mulpd 14 * SIZE(BO), %xmm8
  746. addpd %xmm8, %xmm3
  747. addq $8 * SIZE, AO
  748. addq $16 * SIZE, BO
  749. decq %rax
  750. jne .L141
  751. .L142:
  752. addpd %xmm2, %xmm0
  753. addpd %xmm3, %xmm1
  754. movapd POSINV, %xmm15
  755. #if defined(LT) || defined(RN)
  756. movq KK, %rax
  757. #else
  758. movq K, %rax
  759. subq KK, %rax
  760. #endif
  761. andq $3, %rax # if (k & 3)
  762. BRANCH
  763. jle .L144
  764. .L143:
  765. movapd 0 * SIZE(AO), %xmm8
  766. movapd 0 * SIZE(BO), %xmm9
  767. mulpd %xmm8, %xmm9
  768. addpd %xmm9, %xmm0
  769. mulpd 2 * SIZE(BO), %xmm8
  770. addpd %xmm8, %xmm1
  771. addq $2 * SIZE, AO # aoffset += 2
  772. addq $4 * SIZE, BO # boffset1 += 4
  773. decq %rax
  774. jg .L143
  775. ALIGN_4
  776. .L144:
  777. #if defined(LN) || defined(RT)
  778. movq KK, %rax
  779. #ifdef LN
  780. subq $1, %rax
  781. #else
  782. subq $1, %rax
  783. #endif
  784. movq AORIG, AO
  785. movq BORIG, B
  786. leaq BUFFER, BO
  787. salq $ZBASE_SHIFT, %rax
  788. leaq (AO, %rax, 1), AO
  789. leaq (B, %rax, 1), B
  790. leaq (BO, %rax, 2), BO
  791. #endif
  792. SHUFPD_1 %xmm1, %xmm1
  793. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  794. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  795. xorpd %xmm15, %xmm1
  796. #else
  797. xorpd %xmm15, %xmm0
  798. #endif
  799. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  800. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  801. subpd %xmm1, %xmm0
  802. #else
  803. addpd %xmm1, %xmm0
  804. #endif
  805. #if defined(LN) || defined(LT)
  806. movapd 0 * SIZE(B), %xmm1
  807. subpd %xmm0, %xmm1
  808. #else
  809. movapd 0 * SIZE(AO), %xmm1
  810. subpd %xmm0, %xmm1
  811. #endif
  812. #ifndef CONJ
  813. SHUFPD_1 %xmm15, %xmm15
  814. #endif
  815. #ifdef LN
  816. movlpd 0 * SIZE(AO), %xmm8
  817. movhpd 0 * SIZE(AO), %xmm8
  818. movlpd 1 * SIZE(AO), %xmm9
  819. movhpd 1 * SIZE(AO), %xmm9
  820. pshufd $0x4e, %xmm1, %xmm0
  821. xorpd %xmm15, %xmm0
  822. mulpd %xmm8, %xmm1
  823. mulpd %xmm9, %xmm0
  824. addpd %xmm0, %xmm1
  825. #endif
  826. #ifdef LT
  827. movlpd 0 * SIZE(AO), %xmm8
  828. movhpd 0 * SIZE(AO), %xmm8
  829. movlpd 1 * SIZE(AO), %xmm9
  830. movhpd 1 * SIZE(AO), %xmm9
  831. pshufd $0x4e, %xmm1, %xmm0
  832. xorpd %xmm15, %xmm0
  833. mulpd %xmm8, %xmm1
  834. mulpd %xmm9, %xmm0
  835. addpd %xmm0, %xmm1
  836. #endif
  837. #ifdef RN
  838. movlpd 0 * SIZE(B), %xmm8
  839. movhpd 0 * SIZE(B), %xmm8
  840. movlpd 1 * SIZE(B), %xmm9
  841. movhpd 1 * SIZE(B), %xmm9
  842. pshufd $0x4e, %xmm1, %xmm0
  843. xorpd %xmm15, %xmm0
  844. mulpd %xmm8, %xmm1
  845. mulpd %xmm9, %xmm0
  846. addpd %xmm0, %xmm1
  847. #endif
  848. #ifdef RT
  849. movlpd 0 * SIZE(B), %xmm8
  850. movhpd 0 * SIZE(B), %xmm8
  851. movlpd 1 * SIZE(B), %xmm9
  852. movhpd 1 * SIZE(B), %xmm9
  853. pshufd $0x4e, %xmm1, %xmm0
  854. xorpd %xmm15, %xmm0
  855. mulpd %xmm8, %xmm1
  856. mulpd %xmm9, %xmm0
  857. addpd %xmm0, %xmm1
  858. #endif
  859. #ifdef LN
  860. subq $2 * SIZE, CO1
  861. #endif
  862. movsd %xmm1, 0 * SIZE(CO1)
  863. movhpd %xmm1, 1 * SIZE(CO1)
  864. #if defined(LN) || defined(LT)
  865. movapd %xmm1, 0 * SIZE(B)
  866. movlpd %xmm1, 0 * SIZE(BO)
  867. movlpd %xmm1, 1 * SIZE(BO)
  868. movhpd %xmm1, 2 * SIZE(BO)
  869. movhpd %xmm1, 3 * SIZE(BO)
  870. #else
  871. movapd %xmm1, 0 * SIZE(AO)
  872. #endif
  873. #ifndef LN
  874. addq $2 * SIZE, CO1
  875. #endif
  876. #if defined(LT) || defined(RN)
  877. movq K, %rax
  878. subq KK, %rax
  879. salq $ZBASE_SHIFT, %rax
  880. leaq (AO, %rax, 1), AO
  881. #ifdef LT
  882. addq $2 * SIZE, B
  883. #endif
  884. #endif
  885. #ifdef LN
  886. subq $1, KK
  887. movq BORIG, B
  888. #endif
  889. #ifdef LT
  890. addq $1, KK
  891. #endif
  892. #ifdef RT
  893. movq K, %rax
  894. movq BORIG, B
  895. salq $0 + ZBASE_SHIFT, %rax
  896. addq %rax, AORIG
  897. #endif
  898. ALIGN_4
  899. .L199:
  900. #ifdef LN
  901. leaq (, K, SIZE), %rax
  902. leaq (B, %rax, 2), B
  903. #endif
  904. #if defined(LT) || defined(RN)
  905. movq K, %rax
  906. subq KK, %rax
  907. leaq (,%rax, SIZE), %rax
  908. leaq (B, %rax, 1 * COMPSIZE), B
  909. #endif
  910. #ifdef RN
  911. addq $1, KK
  912. #endif
  913. #ifdef RT
  914. subq $1, KK
  915. #endif
  916. ALIGN_4
  917. .L100:
  918. movq N, J
  919. sarq $1, J # j = (n >> 1)
  920. jle .L999
  921. ALIGN_4
  922. .L01:
  923. #ifdef LN
  924. movq OFFSET, %rax
  925. addq M, %rax
  926. movq %rax, KK
  927. #endif
  928. /* Copying to Sub Buffer */
  929. leaq BUFFER, BO
  930. #ifdef RT
  931. movq K, %rax
  932. salq $1 + ZBASE_SHIFT, %rax
  933. subq %rax, B
  934. #endif
  935. #if defined(LN) || defined(RT)
  936. movq KK, %rax
  937. movq B, BORIG
  938. salq $ZBASE_SHIFT, %rax
  939. leaq (B, %rax, 2), B
  940. leaq (BO, %rax, 4), BO
  941. #endif
  942. #if defined(LT)
  943. movq OFFSET, %rax
  944. movq %rax, KK
  945. #endif
  946. #if defined(LT) || defined(RN)
  947. movq KK, %rax
  948. #else
  949. movq K, %rax
  950. subq KK, %rax
  951. #endif
  952. sarq $2, %rax
  953. jle .L03
  954. addq %rax, %rax
  955. ALIGN_4
  956. .L02:
  957. PREFETCHNTA 56 * SIZE(B)
  958. movlpd 0 * SIZE(B), %xmm0
  959. movlpd 1 * SIZE(B), %xmm1
  960. movlpd 2 * SIZE(B), %xmm2
  961. movlpd 3 * SIZE(B), %xmm3
  962. movlpd 4 * SIZE(B), %xmm4
  963. movlpd 5 * SIZE(B), %xmm5
  964. movlpd 6 * SIZE(B), %xmm6
  965. movlpd 7 * SIZE(B), %xmm7
  966. movlpd %xmm0, 0 * SIZE(BO)
  967. movlpd %xmm0, 1 * SIZE(BO)
  968. movlpd %xmm1, 2 * SIZE(BO)
  969. movlpd %xmm1, 3 * SIZE(BO)
  970. movlpd %xmm2, 4 * SIZE(BO)
  971. movlpd %xmm2, 5 * SIZE(BO)
  972. movlpd %xmm3, 6 * SIZE(BO)
  973. movlpd %xmm3, 7 * SIZE(BO)
  974. movlpd %xmm4, 8 * SIZE(BO)
  975. movlpd %xmm4, 9 * SIZE(BO)
  976. movlpd %xmm5, 10 * SIZE(BO)
  977. movlpd %xmm5, 11 * SIZE(BO)
  978. movlpd %xmm6, 12 * SIZE(BO)
  979. movlpd %xmm6, 13 * SIZE(BO)
  980. movlpd %xmm7, 14 * SIZE(BO)
  981. movlpd %xmm7, 15 * SIZE(BO)
  982. subq $-16 * SIZE, BO
  983. addq $ 8 * SIZE, B
  984. decq %rax
  985. jne .L02
  986. ALIGN_4
  987. .L03:
  988. #if defined(LT) || defined(RN)
  989. movq KK, %rax
  990. #else
  991. movq K, %rax
  992. subq KK, %rax
  993. #endif
  994. andq $3, %rax
  995. BRANCH
  996. jle .L05
  997. ALIGN_4
  998. .L04:
  999. movlpd 0 * SIZE(B), %xmm0
  1000. movlpd 1 * SIZE(B), %xmm1
  1001. movlpd 2 * SIZE(B), %xmm2
  1002. movlpd 3 * SIZE(B), %xmm3
  1003. movlpd %xmm0, 0 * SIZE(BO)
  1004. movlpd %xmm0, 1 * SIZE(BO)
  1005. movlpd %xmm1, 2 * SIZE(BO)
  1006. movlpd %xmm1, 3 * SIZE(BO)
  1007. movlpd %xmm2, 4 * SIZE(BO)
  1008. movlpd %xmm2, 5 * SIZE(BO)
  1009. movlpd %xmm3, 6 * SIZE(BO)
  1010. movlpd %xmm3, 7 * SIZE(BO)
  1011. addq $ 4 * SIZE, B
  1012. addq $ 8 * SIZE, BO
  1013. decq %rax
  1014. jne .L04
  1015. ALIGN_4
  1016. .L05:
  1017. #if defined(LT) || defined(RN)
  1018. movq A, AO
  1019. #else
  1020. movq A, AORIG
  1021. #endif
  1022. #ifdef RT
  1023. leaq (, LDC, 2), %rax
  1024. subq %rax, C
  1025. #endif
  1026. movq C, CO1
  1027. leaq (C, LDC, 1), CO2
  1028. #ifndef RT
  1029. leaq (C, LDC, 2), C
  1030. #endif
  1031. movq M, I
  1032. sarq $1, I # i = (m >> 1)
  1033. jle .L30
  1034. ALIGN_4
  1035. .L10:
  1036. #ifdef LN
  1037. movq K, %rax
  1038. salq $1 + ZBASE_SHIFT, %rax
  1039. subq %rax, AORIG
  1040. #endif
  1041. #if defined(LN) || defined(RT)
  1042. movq KK, %rax
  1043. movq AORIG, AO
  1044. salq $ZBASE_SHIFT, %rax
  1045. leaq (AO, %rax, 2), AO
  1046. #endif
  1047. leaq BUFFER, BO
  1048. #if defined(LN) || defined(RT)
  1049. movq KK, %rax
  1050. salq $1 + ZBASE_SHIFT, %rax
  1051. leaq (BO, %rax, 2), BO
  1052. #endif
  1053. movapd 0 * SIZE(AO), %xmm8
  1054. pxor %xmm0, %xmm0
  1055. movapd 2 * SIZE(AO), %xmm10
  1056. pxor %xmm1, %xmm1
  1057. movapd 4 * SIZE(AO), %xmm12
  1058. pxor %xmm2, %xmm2
  1059. movapd 6 * SIZE(AO), %xmm14
  1060. pxor %xmm3, %xmm3
  1061. movapd 0 * SIZE(BO), %xmm9
  1062. pxor %xmm4, %xmm4
  1063. movapd 2 * SIZE(BO), %xmm11
  1064. pxor %xmm5, %xmm5
  1065. movapd 4 * SIZE(BO), %xmm13
  1066. movapd 8 * SIZE(BO), %xmm15
  1067. PREFETCHW 4 * SIZE(CO1)
  1068. pxor %xmm6, %xmm6
  1069. PREFETCHW 4 * SIZE(CO2)
  1070. pxor %xmm7, %xmm7
  1071. #if defined(LT) || defined(RN)
  1072. movq KK, %rax
  1073. #else
  1074. movq K, %rax
  1075. subq KK, %rax
  1076. #endif
  1077. andq $-8, %rax
  1078. salq $4, %rax
  1079. je .L15
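/* .L1X is the fully unrolled main loop built from the KERNEL1..KERNEL8      */
/* macros; it peels up to eight blocks per pass before adjusting AO/BO,      */
/* and the remaining k iterations are finished by the .L16 loop below.       */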
  1080. .L1X:
  1081. KERNEL1(16 * 0)
  1082. KERNEL2(16 * 0)
  1083. KERNEL3(16 * 0)
  1084. KERNEL4(16 * 0)
  1085. KERNEL5(16 * 0)
  1086. KERNEL6(16 * 0)
  1087. KERNEL7(16 * 0)
  1088. KERNEL8(16 * 0)
  1089. KERNEL1(16 * 1)
  1090. KERNEL2(16 * 1)
  1091. KERNEL3(16 * 1)
  1092. KERNEL4(16 * 1)
  1093. KERNEL5(16 * 1)
  1094. KERNEL6(16 * 1)
  1095. KERNEL7(16 * 1)
  1096. KERNEL8(16 * 1)
  1097. cmpq $64 * 2, %rax
  1098. jle .L12
  1099. KERNEL1(16 * 2)
  1100. KERNEL2(16 * 2)
  1101. KERNEL3(16 * 2)
  1102. KERNEL4(16 * 2)
  1103. KERNEL5(16 * 2)
  1104. KERNEL6(16 * 2)
  1105. KERNEL7(16 * 2)
  1106. KERNEL8(16 * 2)
  1107. KERNEL1(16 * 3)
  1108. KERNEL2(16 * 3)
  1109. KERNEL3(16 * 3)
  1110. KERNEL4(16 * 3)
  1111. KERNEL5(16 * 3)
  1112. KERNEL6(16 * 3)
  1113. KERNEL7(16 * 3)
  1114. KERNEL8(16 * 3)
  1115. cmpq $64 * 4, %rax
  1116. jle .L12
  1117. KERNEL1(16 * 4)
  1118. KERNEL2(16 * 4)
  1119. KERNEL3(16 * 4)
  1120. KERNEL4(16 * 4)
  1121. KERNEL5(16 * 4)
  1122. KERNEL6(16 * 4)
  1123. KERNEL7(16 * 4)
  1124. KERNEL8(16 * 4)
  1125. KERNEL1(16 * 5)
  1126. KERNEL2(16 * 5)
  1127. KERNEL3(16 * 5)
  1128. KERNEL4(16 * 5)
  1129. KERNEL5(16 * 5)
  1130. KERNEL6(16 * 5)
  1131. KERNEL7(16 * 5)
  1132. KERNEL8(16 * 5)
  1133. cmpq $64 * 6, %rax
  1134. jle .L12
  1135. KERNEL1(16 * 6)
  1136. KERNEL2(16 * 6)
  1137. KERNEL3(16 * 6)
  1138. KERNEL4(16 * 6)
  1139. KERNEL5(16 * 6)
  1140. KERNEL6(16 * 6)
  1141. KERNEL7(16 * 6)
  1142. KERNEL8(16 * 6)
  1143. KERNEL1(16 * 7)
  1144. KERNEL2(16 * 7)
  1145. KERNEL3(16 * 7)
  1146. KERNEL4(16 * 7)
  1147. KERNEL5(16 * 7)
  1148. KERNEL6(16 * 7)
  1149. KERNEL7(16 * 7)
  1150. KERNEL8(16 * 7)
  1151. addq $16 * 8 * SIZE, AO
  1152. addq $32 * 8 * SIZE, BO
  1153. subq $64 * 8, %rax
  1154. jg .L1X
  1155. .L12:
  1156. leaq (AO, %rax, 2), AO # * 16
  1157. leaq (BO, %rax, 4), BO # * 64
  1158. ALIGN_4
  1159. .L15:
  1160. #if defined(LT) || defined(RN)
  1161. movq KK, %rax
  1162. #else
  1163. movq K, %rax
  1164. subq KK, %rax
  1165. #endif
  1166. movapd POSINV, %xmm15
  1167. andq $7, %rax # if (k & 7)
  1168. BRANCH
  1169. je .L19
  1170. ALIGN_4
  1171. .L16:
  1172. mulpd %xmm8, %xmm9
  1173. addpd %xmm9, %xmm0
  1174. movapd 2 * SIZE(BO), %xmm9
  1175. mulpd %xmm8, %xmm9
  1176. addpd %xmm9, %xmm1
  1177. movapd 4 * SIZE(BO), %xmm9
  1178. mulpd %xmm8, %xmm9
  1179. mulpd 6 * SIZE(BO), %xmm8
  1180. addpd %xmm9, %xmm2
  1181. movapd 0 * SIZE(BO), %xmm9
  1182. addpd %xmm8, %xmm3
  1183. movapd 4 * SIZE(AO), %xmm8
  1184. mulpd %xmm10, %xmm9
  1185. addpd %xmm9, %xmm4
  1186. movapd 2 * SIZE(BO), %xmm9
  1187. mulpd %xmm10, %xmm9
  1188. addpd %xmm9, %xmm5
  1189. movapd 4 * SIZE(BO), %xmm9
  1190. mulpd %xmm10, %xmm9
  1191. mulpd 6 * SIZE(BO), %xmm10
  1192. addpd %xmm9, %xmm6
  1193. movapd 8 * SIZE(BO), %xmm9
  1194. addpd %xmm10, %xmm7
  1195. movapd 6 * SIZE(AO), %xmm10
  1196. addq $4 * SIZE, AO # aoffset += 4
  1197. addq $8 * SIZE, BO # boffset1 += 8
  1198. decq %rax
  1199. jg .L16
  1200. ALIGN_4
  1201. .L19:
  1202. #if defined(LN) || defined(RT)
  1203. movq KK, %rax
  1204. #ifdef LN
  1205. subq $2, %rax
  1206. #else
  1207. subq $2, %rax
  1208. #endif
  1209. movq AORIG, AO
  1210. movq BORIG, B
  1211. leaq BUFFER, BO
  1212. salq $ZBASE_SHIFT, %rax
  1213. leaq (AO, %rax, 2), AO
  1214. leaq (B, %rax, 2), B
  1215. leaq (BO, %rax, 4), BO
  1216. #endif
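/* Same tail as the single-column case: fold the imaginary partial products  */
/* per the conjugation case, subtract from the packed right-hand side, run   */
/* the complex triangular solve for this 2x2 tile, and store to CO1/CO2 and  */
/* back into the packed buffers.                                             */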
  1217. SHUFPD_1 %xmm1, %xmm1
  1218. SHUFPD_1 %xmm3, %xmm3
  1219. SHUFPD_1 %xmm5, %xmm5
  1220. SHUFPD_1 %xmm7, %xmm7
  1221. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1222. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1223. xorpd %xmm15, %xmm1
  1224. xorpd %xmm15, %xmm3
  1225. xorpd %xmm15, %xmm5
  1226. xorpd %xmm15, %xmm7
  1227. #else
  1228. xorpd %xmm15, %xmm0
  1229. xorpd %xmm15, %xmm2
  1230. xorpd %xmm15, %xmm4
  1231. xorpd %xmm15, %xmm6
  1232. #endif
  1233. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1234. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1235. subpd %xmm1, %xmm0
  1236. subpd %xmm3, %xmm2
  1237. subpd %xmm5, %xmm4
  1238. subpd %xmm7, %xmm6
  1239. #else
  1240. addpd %xmm1, %xmm0
  1241. addpd %xmm3, %xmm2
  1242. addpd %xmm5, %xmm4
  1243. addpd %xmm7, %xmm6
  1244. #endif
  1245. #if defined(LN) || defined(LT)
  1246. movapd 0 * SIZE(B), %xmm1
  1247. movapd 2 * SIZE(B), %xmm3
  1248. movapd 4 * SIZE(B), %xmm5
  1249. movapd 6 * SIZE(B), %xmm7
  1250. subpd %xmm0, %xmm1
  1251. subpd %xmm2, %xmm3
  1252. subpd %xmm4, %xmm5
  1253. subpd %xmm6, %xmm7
  1254. #else
  1255. movapd 0 * SIZE(AO), %xmm1
  1256. movapd 2 * SIZE(AO), %xmm5
  1257. movapd 4 * SIZE(AO), %xmm3
  1258. movapd 6 * SIZE(AO), %xmm7
  1259. subpd %xmm0, %xmm1
  1260. subpd %xmm2, %xmm3
  1261. subpd %xmm4, %xmm5
  1262. subpd %xmm6, %xmm7
  1263. #endif
  1264. #ifndef CONJ
  1265. SHUFPD_1 %xmm15, %xmm15
  1266. #endif
  1267. #ifdef LN
  1268. movlpd 6 * SIZE(AO), %xmm8
  1269. movhpd 6 * SIZE(AO), %xmm8
  1270. movlpd 7 * SIZE(AO), %xmm9
  1271. movhpd 7 * SIZE(AO), %xmm9
  1272. movlpd 4 * SIZE(AO), %xmm10
  1273. movhpd 4 * SIZE(AO), %xmm10
  1274. movlpd 5 * SIZE(AO), %xmm11
  1275. movhpd 5 * SIZE(AO), %xmm11
  1276. movlpd 0 * SIZE(AO), %xmm12
  1277. movhpd 0 * SIZE(AO), %xmm12
  1278. movlpd 1 * SIZE(AO), %xmm13
  1279. movhpd 1 * SIZE(AO), %xmm13
  1280. pshufd $0x4e, %xmm5, %xmm4
  1281. pshufd $0x4e, %xmm7, %xmm6
  1282. xorpd %xmm15, %xmm4
  1283. xorpd %xmm15, %xmm6
  1284. mulpd %xmm8, %xmm5
  1285. mulpd %xmm9, %xmm4
  1286. mulpd %xmm8, %xmm7
  1287. mulpd %xmm9, %xmm6
  1288. addpd %xmm4, %xmm5
  1289. addpd %xmm6, %xmm7
  1290. movapd %xmm5, %xmm0
  1291. movapd %xmm7, %xmm2
  1292. pshufd $0x4e, %xmm5, %xmm4
  1293. pshufd $0x4e, %xmm7, %xmm6
  1294. xorpd %xmm15, %xmm4
  1295. xorpd %xmm15, %xmm6
  1296. mulpd %xmm10, %xmm0
  1297. mulpd %xmm10, %xmm2
  1298. mulpd %xmm11, %xmm4
  1299. mulpd %xmm11, %xmm6
  1300. subpd %xmm0, %xmm1
  1301. subpd %xmm2, %xmm3
  1302. subpd %xmm4, %xmm1
  1303. subpd %xmm6, %xmm3
  1304. pshufd $0x4e, %xmm1, %xmm0
  1305. pshufd $0x4e, %xmm3, %xmm2
  1306. xorpd %xmm15, %xmm0
  1307. xorpd %xmm15, %xmm2
  1308. mulpd %xmm12, %xmm1
  1309. mulpd %xmm13, %xmm0
  1310. mulpd %xmm12, %xmm3
  1311. mulpd %xmm13, %xmm2
  1312. addpd %xmm0, %xmm1
  1313. addpd %xmm2, %xmm3
  1314. #endif
  1315. #ifdef LT
  1316. movlpd 0 * SIZE(AO), %xmm8
  1317. movhpd 0 * SIZE(AO), %xmm8
  1318. movlpd 1 * SIZE(AO), %xmm9
  1319. movhpd 1 * SIZE(AO), %xmm9
  1320. movlpd 2 * SIZE(AO), %xmm10
  1321. movhpd 2 * SIZE(AO), %xmm10
  1322. movlpd 3 * SIZE(AO), %xmm11
  1323. movhpd 3 * SIZE(AO), %xmm11
  1324. movlpd 6 * SIZE(AO), %xmm12
  1325. movhpd 6 * SIZE(AO), %xmm12
  1326. movlpd 7 * SIZE(AO), %xmm13
  1327. movhpd 7 * SIZE(AO), %xmm13
  1328. pshufd $0x4e, %xmm1, %xmm0
  1329. pshufd $0x4e, %xmm3, %xmm2
  1330. xorpd %xmm15, %xmm0
  1331. xorpd %xmm15, %xmm2
  1332. mulpd %xmm8, %xmm1
  1333. mulpd %xmm9, %xmm0
  1334. mulpd %xmm8, %xmm3
  1335. mulpd %xmm9, %xmm2
  1336. addpd %xmm0, %xmm1
  1337. addpd %xmm2, %xmm3
  1338. movapd %xmm1, %xmm0
  1339. movapd %xmm3, %xmm2
  1340. pshufd $0x4e, %xmm1, %xmm4
  1341. pshufd $0x4e, %xmm3, %xmm6
  1342. xorpd %xmm15, %xmm4
  1343. xorpd %xmm15, %xmm6
  1344. mulpd %xmm10, %xmm0
  1345. mulpd %xmm10, %xmm2
  1346. mulpd %xmm11, %xmm4
  1347. mulpd %xmm11, %xmm6
  1348. subpd %xmm0, %xmm5
  1349. subpd %xmm2, %xmm7
  1350. subpd %xmm4, %xmm5
  1351. subpd %xmm6, %xmm7
  1352. pshufd $0x4e, %xmm5, %xmm4
  1353. pshufd $0x4e, %xmm7, %xmm6
  1354. xorpd %xmm15, %xmm4
  1355. xorpd %xmm15, %xmm6
  1356. mulpd %xmm12, %xmm5
  1357. mulpd %xmm13, %xmm4
  1358. mulpd %xmm12, %xmm7
  1359. mulpd %xmm13, %xmm6
  1360. addpd %xmm4, %xmm5
  1361. addpd %xmm6, %xmm7
  1362. #endif
  1363. #ifdef RN
  1364. movlpd 0 * SIZE(B), %xmm8
  1365. movhpd 0 * SIZE(B), %xmm8
  1366. movlpd 1 * SIZE(B), %xmm9
  1367. movhpd 1 * SIZE(B), %xmm9
  1368. movlpd 2 * SIZE(B), %xmm10
  1369. movhpd 2 * SIZE(B), %xmm10
  1370. movlpd 3 * SIZE(B), %xmm11
  1371. movhpd 3 * SIZE(B), %xmm11
  1372. movlpd 6 * SIZE(B), %xmm12
  1373. movhpd 6 * SIZE(B), %xmm12
  1374. movlpd 7 * SIZE(B), %xmm13
  1375. movhpd 7 * SIZE(B), %xmm13
  1376. pshufd $0x4e, %xmm1, %xmm0
  1377. pshufd $0x4e, %xmm5, %xmm4
  1378. xorpd %xmm15, %xmm0
  1379. xorpd %xmm15, %xmm4
  1380. mulpd %xmm8, %xmm1
  1381. mulpd %xmm9, %xmm0
  1382. mulpd %xmm8, %xmm5
  1383. mulpd %xmm9, %xmm4
  1384. addpd %xmm0, %xmm1
  1385. addpd %xmm4, %xmm5
  1386. movapd %xmm1, %xmm0
  1387. movapd %xmm5, %xmm2
  1388. pshufd $0x4e, %xmm1, %xmm4
  1389. pshufd $0x4e, %xmm5, %xmm6
  1390. xorpd %xmm15, %xmm4
  1391. xorpd %xmm15, %xmm6
  1392. mulpd %xmm10, %xmm0
  1393. mulpd %xmm10, %xmm2
  1394. mulpd %xmm11, %xmm4
  1395. mulpd %xmm11, %xmm6
  1396. subpd %xmm0, %xmm3
  1397. subpd %xmm2, %xmm7
  1398. subpd %xmm4, %xmm3
  1399. subpd %xmm6, %xmm7
  1400. pshufd $0x4e, %xmm3, %xmm2
  1401. pshufd $0x4e, %xmm7, %xmm6
  1402. xorpd %xmm15, %xmm2
  1403. xorpd %xmm15, %xmm6
  1404. mulpd %xmm12, %xmm3
  1405. mulpd %xmm13, %xmm2
  1406. mulpd %xmm12, %xmm7
  1407. mulpd %xmm13, %xmm6
  1408. addpd %xmm2, %xmm3
  1409. addpd %xmm6, %xmm7
  1410. #endif
  1411. #ifdef RT
  1412. movlpd 6 * SIZE(B), %xmm8
  1413. movhpd 6 * SIZE(B), %xmm8
  1414. movlpd 7 * SIZE(B), %xmm9
  1415. movhpd 7 * SIZE(B), %xmm9
  1416. movlpd 4 * SIZE(B), %xmm10
  1417. movhpd 4 * SIZE(B), %xmm10
  1418. movlpd 5 * SIZE(B), %xmm11
  1419. movhpd 5 * SIZE(B), %xmm11
  1420. movlpd 0 * SIZE(B), %xmm12
  1421. movhpd 0 * SIZE(B), %xmm12
  1422. movlpd 1 * SIZE(B), %xmm13
  1423. movhpd 1 * SIZE(B), %xmm13
  1424. pshufd $0x4e, %xmm3, %xmm2
  1425. pshufd $0x4e, %xmm7, %xmm6
  1426. xorpd %xmm15, %xmm2
  1427. xorpd %xmm15, %xmm6
  1428. mulpd %xmm8, %xmm3
  1429. mulpd %xmm9, %xmm2
  1430. mulpd %xmm8, %xmm7
  1431. mulpd %xmm9, %xmm6
  1432. addpd %xmm2, %xmm3
  1433. addpd %xmm6, %xmm7
  1434. movapd %xmm3, %xmm0
  1435. movapd %xmm7, %xmm2
  1436. pshufd $0x4e, %xmm3, %xmm4
  1437. pshufd $0x4e, %xmm7, %xmm6
  1438. xorpd %xmm15, %xmm4
  1439. xorpd %xmm15, %xmm6
  1440. mulpd %xmm10, %xmm0
  1441. mulpd %xmm10, %xmm2
  1442. mulpd %xmm11, %xmm4
  1443. mulpd %xmm11, %xmm6
  1444. subpd %xmm0, %xmm1
  1445. subpd %xmm2, %xmm5
  1446. subpd %xmm4, %xmm1
  1447. subpd %xmm6, %xmm5
  1448. pshufd $0x4e, %xmm1, %xmm0
  1449. pshufd $0x4e, %xmm5, %xmm4
  1450. xorpd %xmm15, %xmm0
  1451. xorpd %xmm15, %xmm4
  1452. mulpd %xmm12, %xmm1
  1453. mulpd %xmm13, %xmm0
  1454. mulpd %xmm12, %xmm5
  1455. mulpd %xmm13, %xmm4
  1456. addpd %xmm0, %xmm1
  1457. addpd %xmm4, %xmm5
  1458. #endif
  1459. #ifdef LN
  1460. subq $4 * SIZE, CO1
  1461. subq $4 * SIZE, CO2
  1462. #endif
  1463. movsd %xmm1, 0 * SIZE(CO1)
  1464. movhpd %xmm1, 1 * SIZE(CO1)
  1465. movsd %xmm5, 2 * SIZE(CO1)
  1466. movhpd %xmm5, 3 * SIZE(CO1)
  1467. movsd %xmm3, 0 * SIZE(CO2)
  1468. movhpd %xmm3, 1 * SIZE(CO2)
  1469. movsd %xmm7, 2 * SIZE(CO2)
  1470. movhpd %xmm7, 3 * SIZE(CO2)
  1471. #if defined(LN) || defined(LT)
  1472. movapd %xmm1, 0 * SIZE(B)
  1473. movapd %xmm3, 2 * SIZE(B)
  1474. movapd %xmm5, 4 * SIZE(B)
  1475. movapd %xmm7, 6 * SIZE(B)
  1476. movlpd %xmm1, 0 * SIZE(BO)
  1477. movlpd %xmm1, 1 * SIZE(BO)
  1478. movhpd %xmm1, 2 * SIZE(BO)
  1479. movhpd %xmm1, 3 * SIZE(BO)
  1480. movlpd %xmm3, 4 * SIZE(BO)
  1481. movlpd %xmm3, 5 * SIZE(BO)
  1482. movhpd %xmm3, 6 * SIZE(BO)
  1483. movhpd %xmm3, 7 * SIZE(BO)
  1484. movlpd %xmm5, 8 * SIZE(BO)
  1485. movlpd %xmm5, 9 * SIZE(BO)
  1486. movhpd %xmm5, 10 * SIZE(BO)
  1487. movhpd %xmm5, 11 * SIZE(BO)
  1488. movlpd %xmm7, 12 * SIZE(BO)
  1489. movlpd %xmm7, 13 * SIZE(BO)
  1490. movhpd %xmm7, 14 * SIZE(BO)
  1491. movhpd %xmm7, 15 * SIZE(BO)
  1492. #else
  1493. movapd %xmm1, 0 * SIZE(AO)
  1494. movapd %xmm5, 2 * SIZE(AO)
  1495. movapd %xmm3, 4 * SIZE(AO)
  1496. movapd %xmm7, 6 * SIZE(AO)
  1497. #endif
  1498. #ifndef LN
  1499. addq $4 * SIZE, CO1
  1500. addq $4 * SIZE, CO2
  1501. #endif
  1502. #if defined(LT) || defined(RN)
  1503. movq K, %rax
  1504. subq KK, %rax
  1505. salq $ZBASE_SHIFT, %rax
  1506. leaq (AO, %rax, 2), AO
  1507. #ifdef LT
  1508. addq $8 * SIZE, B
  1509. #endif
  1510. #endif
  1511. #ifdef LN
  1512. subq $2, KK
  1513. movq BORIG, B
  1514. #endif
  1515. #ifdef LT
  1516. addq $2, KK
  1517. #endif
  1518. #ifdef RT
  1519. movq K, %rax
  1520. movq BORIG, B
  1521. salq $1 + ZBASE_SHIFT, %rax
  1522. addq %rax, AORIG
  1523. #endif
  1524. decq I # i --
  1525. jg .L10
  1526. ALIGN_4
  1527. .L30:
  1528. testq $1, M
  1529. jle .L99
  1530. #ifdef LN
  1531. movq K, %rax
  1532. salq $0 + ZBASE_SHIFT, %rax
  1533. subq %rax, AORIG
  1534. #endif
  1535. #if defined(LN) || defined(RT)
  1536. movq KK, %rax
  1537. movq AORIG, AO
  1538. salq $ZBASE_SHIFT, %rax
  1539. addq %rax, AO
  1540. #endif
  1541. leaq BUFFER, BO
  1542. #if defined(LN) || defined(RT)
  1543. movq KK, %rax
  1544. salq $1 + ZBASE_SHIFT, %rax
  1545. leaq (BO, %rax, 2), BO
  1546. #endif
  1547. pxor %xmm0, %xmm0
  1548. pxor %xmm1, %xmm1
  1549. pxor %xmm2, %xmm2
  1550. pxor %xmm3, %xmm3
  1551. #if defined(LT) || defined(RN)
  1552. movq KK, %rax
  1553. #else
  1554. movq K, %rax
  1555. subq KK, %rax
  1556. #endif
  1557. sarq $2, %rax
  1558. je .L42
  1559. .L41:
  1560. movapd 0 * SIZE(AO), %xmm8
  1561. movapd 0 * SIZE(BO), %xmm9
  1562. mulpd %xmm8, %xmm9
  1563. addpd %xmm9, %xmm0
  1564. movapd 2 * SIZE(BO), %xmm9
  1565. mulpd %xmm8, %xmm9
  1566. addpd %xmm9, %xmm1
  1567. movapd 4 * SIZE(BO), %xmm9
  1568. mulpd %xmm8, %xmm9
  1569. addpd %xmm9, %xmm2
  1570. movapd 6 * SIZE(BO), %xmm9
  1571. mulpd %xmm8, %xmm9
  1572. addpd %xmm9, %xmm3
  1573. movapd 2 * SIZE(AO), %xmm8
  1574. movapd 8 * SIZE(BO), %xmm9
  1575. mulpd %xmm8, %xmm9
  1576. addpd %xmm9, %xmm0
  1577. movapd 10 * SIZE(BO), %xmm9
  1578. mulpd %xmm8, %xmm9
  1579. addpd %xmm9, %xmm1
  1580. movapd 12 * SIZE(BO), %xmm9
  1581. mulpd %xmm8, %xmm9
  1582. addpd %xmm9, %xmm2
  1583. movapd 14 * SIZE(BO), %xmm9
  1584. mulpd %xmm8, %xmm9
  1585. addpd %xmm9, %xmm3
  1586. movapd 4 * SIZE(AO), %xmm8
  1587. movapd 16 * SIZE(BO), %xmm9
  1588. mulpd %xmm8, %xmm9
  1589. addpd %xmm9, %xmm0
  1590. movapd 18 * SIZE(BO), %xmm9
  1591. mulpd %xmm8, %xmm9
  1592. addpd %xmm9, %xmm1
  1593. movapd 20 * SIZE(BO), %xmm9
  1594. mulpd %xmm8, %xmm9
  1595. addpd %xmm9, %xmm2
  1596. movapd 22 * SIZE(BO), %xmm9
  1597. mulpd %xmm8, %xmm9
  1598. addpd %xmm9, %xmm3
  1599. movapd 6 * SIZE(AO), %xmm8
  1600. movapd 24 * SIZE(BO), %xmm9
  1601. mulpd %xmm8, %xmm9
  1602. addpd %xmm9, %xmm0
  1603. movapd 26 * SIZE(BO), %xmm9
  1604. mulpd %xmm8, %xmm9
  1605. addpd %xmm9, %xmm1
  1606. movapd 28 * SIZE(BO), %xmm9
  1607. mulpd %xmm8, %xmm9
  1608. addpd %xmm9, %xmm2
  1609. movapd 30 * SIZE(BO), %xmm9
  1610. mulpd %xmm8, %xmm9
  1611. addpd %xmm9, %xmm3
  1612. addq $ 8 * SIZE, AO
  1613. addq $32 * SIZE, BO
  1614. decq %rax
  1615. jne .L41
  1616. .L42:
  1617. #if defined(LT) || defined(RN)
  1618. movq KK, %rax
  1619. #else
  1620. movq K, %rax
  1621. subq KK, %rax
  1622. #endif
  1623. movapd POSINV, %xmm15
  1624. andq $3, %rax # if (k & 3)
  1625. BRANCH
  1626. jle .L44
  1627. .L43:
  1628. movapd 0 * SIZE(AO), %xmm8
  1629. movapd 0 * SIZE(BO), %xmm9
  1630. mulpd %xmm8, %xmm9
  1631. addpd %xmm9, %xmm0
  1632. movapd 2 * SIZE(BO), %xmm9
  1633. mulpd %xmm8, %xmm9
  1634. addpd %xmm9, %xmm1
  1635. movapd 4 * SIZE(BO), %xmm9
  1636. mulpd %xmm8, %xmm9
  1637. addpd %xmm9, %xmm2
  1638. movapd 6 * SIZE(BO), %xmm9
  1639. mulpd %xmm8, %xmm9
  1640. addpd %xmm9, %xmm3
  1641. addq $2 * SIZE, AO # aoffset += 2
  1642. addq $8 * SIZE, BO # boffset1 += 8
  1643. decq %rax
  1644. jg .L43
  1645. ALIGN_4
  1646. .L44:
  1647. #if defined(LN) || defined(RT)
  1648. movq KK, %rax
  1649. #ifdef LN
  1650. subq $1, %rax
  1651. #else
  1652. subq $2, %rax
  1653. #endif
  1654. movq AORIG, AO
  1655. movq BORIG, B
  1656. leaq BUFFER, BO
  1657. salq $ZBASE_SHIFT, %rax
  1658. leaq (AO, %rax, 1), AO
  1659. leaq (B, %rax, 2), B
  1660. leaq (BO, %rax, 4), BO
  1661. #endif
  1662. SHUFPD_1 %xmm1, %xmm1
  1663. SHUFPD_1 %xmm3, %xmm3
  1664. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1665. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1666. xorpd %xmm15, %xmm1
  1667. xorpd %xmm15, %xmm3
  1668. #else
  1669. xorpd %xmm15, %xmm0
  1670. xorpd %xmm15, %xmm2
  1671. #endif
  1672. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1673. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1674. subpd %xmm1, %xmm0
  1675. subpd %xmm3, %xmm2
  1676. #else
  1677. addpd %xmm1, %xmm0
  1678. addpd %xmm3, %xmm2
  1679. #endif
  1680. #if defined(LN) || defined(LT)
  1681. movapd 0 * SIZE(B), %xmm1
  1682. movapd 2 * SIZE(B), %xmm3
  1683. subpd %xmm0, %xmm1
  1684. subpd %xmm2, %xmm3
  1685. #else
  1686. movapd 0 * SIZE(AO), %xmm1
  1687. movapd 2 * SIZE(AO), %xmm3
  1688. subpd %xmm0, %xmm1
  1689. subpd %xmm2, %xmm3
  1690. #endif
  1691. #ifndef CONJ
  1692. SHUFPD_1 %xmm15, %xmm15
  1693. #endif
  1694. #if defined(LN) || defined(LT)
  1695. movlpd 0 * SIZE(AO), %xmm8
  1696. movhpd 0 * SIZE(AO), %xmm8
  1697. movlpd 1 * SIZE(AO), %xmm9
  1698. movhpd 1 * SIZE(AO), %xmm9
  1699. pshufd $0x4e, %xmm1, %xmm0
  1700. pshufd $0x4e, %xmm3, %xmm2
  1701. xorpd %xmm15, %xmm0
  1702. xorpd %xmm15, %xmm2
  1703. mulpd %xmm8, %xmm1
  1704. mulpd %xmm9, %xmm0
  1705. mulpd %xmm8, %xmm3
  1706. mulpd %xmm9, %xmm2
  1707. addpd %xmm0, %xmm1
  1708. addpd %xmm2, %xmm3
  1709. #endif
  1710. #ifdef RN
  1711. movlpd 0 * SIZE(B), %xmm8
  1712. movhpd 0 * SIZE(B), %xmm8
  1713. movlpd 1 * SIZE(B), %xmm9
  1714. movhpd 1 * SIZE(B), %xmm9
  1715. movlpd 2 * SIZE(B), %xmm10
  1716. movhpd 2 * SIZE(B), %xmm10
  1717. movlpd 3 * SIZE(B), %xmm11
  1718. movhpd 3 * SIZE(B), %xmm11
  1719. movlpd 6 * SIZE(B), %xmm12
  1720. movhpd 6 * SIZE(B), %xmm12
  1721. movlpd 7 * SIZE(B), %xmm13
  1722. movhpd 7 * SIZE(B), %xmm13
  1723. pshufd $0x4e, %xmm1, %xmm0
  1724. xorpd %xmm15, %xmm0
  1725. mulpd %xmm8, %xmm1
  1726. mulpd %xmm9, %xmm0
  1727. addpd %xmm0, %xmm1
  1728. movapd %xmm1, %xmm0
  1729. pshufd $0x4e, %xmm1, %xmm4
  1730. xorpd %xmm15, %xmm4
  1731. mulpd %xmm10, %xmm0
  1732. mulpd %xmm11, %xmm4
  1733. subpd %xmm0, %xmm3
  1734. subpd %xmm4, %xmm3
  1735. pshufd $0x4e, %xmm3, %xmm2
  1736. xorpd %xmm15, %xmm2
  1737. mulpd %xmm12, %xmm3
  1738. mulpd %xmm13, %xmm2
  1739. addpd %xmm2, %xmm3
  1740. #endif
  1741. #ifdef RT
  1742. movlpd 6 * SIZE(B), %xmm8
  1743. movhpd 6 * SIZE(B), %xmm8
  1744. movlpd 7 * SIZE(B), %xmm9
  1745. movhpd 7 * SIZE(B), %xmm9
  1746. movlpd 4 * SIZE(B), %xmm10
  1747. movhpd 4 * SIZE(B), %xmm10
  1748. movlpd 5 * SIZE(B), %xmm11
  1749. movhpd 5 * SIZE(B), %xmm11
  1750. movlpd 0 * SIZE(B), %xmm12
  1751. movhpd 0 * SIZE(B), %xmm12
  1752. movlpd 1 * SIZE(B), %xmm13
  1753. movhpd 1 * SIZE(B), %xmm13
  1754. pshufd $0x4e, %xmm3, %xmm2
  1755. xorpd %xmm15, %xmm2
  1756. mulpd %xmm8, %xmm3
  1757. mulpd %xmm9, %xmm2
  1758. addpd %xmm2, %xmm3
  1759. movapd %xmm3, %xmm0
  1760. pshufd $0x4e, %xmm3, %xmm4
  1761. xorpd %xmm15, %xmm4
  1762. mulpd %xmm10, %xmm0
  1763. mulpd %xmm11, %xmm4
  1764. subpd %xmm0, %xmm1
  1765. subpd %xmm4, %xmm1
  1766. pshufd $0x4e, %xmm1, %xmm0
  1767. xorpd %xmm15, %xmm0
  1768. mulpd %xmm12, %xmm1
  1769. mulpd %xmm13, %xmm0
  1770. addpd %xmm0, %xmm1
  1771. #endif
  1772. #ifdef LN
  1773. subq $2 * SIZE, CO1
  1774. subq $2 * SIZE, CO2
  1775. #endif
  1776. movsd %xmm1, 0 * SIZE(CO1)
  1777. movhpd %xmm1, 1 * SIZE(CO1)
  1778. movsd %xmm3, 0 * SIZE(CO2)
  1779. movhpd %xmm3, 1 * SIZE(CO2)
  1780. #if defined(LN) || defined(LT)
  1781. movapd %xmm1, 0 * SIZE(B)
  1782. movapd %xmm3, 2 * SIZE(B)
  1783. movlpd %xmm1, 0 * SIZE(BO)
  1784. movlpd %xmm1, 1 * SIZE(BO)
  1785. movhpd %xmm1, 2 * SIZE(BO)
  1786. movhpd %xmm1, 3 * SIZE(BO)
  1787. movlpd %xmm3, 4 * SIZE(BO)
  1788. movlpd %xmm3, 5 * SIZE(BO)
  1789. movhpd %xmm3, 6 * SIZE(BO)
  1790. movhpd %xmm3, 7 * SIZE(BO)
  1791. #else
  1792. movapd %xmm1, 0 * SIZE(AO)
  1793. movapd %xmm3, 2 * SIZE(AO)
  1794. #endif
  1795. #ifndef LN
  1796. addq $2 * SIZE, CO1
  1797. addq $2 * SIZE, CO2
  1798. #endif
  1799. #if defined(LT) || defined(RN)
  1800. movq K, %rax
  1801. subq KK, %rax
  1802. salq $ZBASE_SHIFT, %rax
  1803. leaq (AO, %rax, 1), AO
  1804. #ifdef LT
  1805. addq $4 * SIZE, B
  1806. #endif
  1807. #endif
  1808. #ifdef LN
  1809. subq $1, KK
  1810. movq BORIG, B
  1811. #endif
  1812. #ifdef LT
  1813. addq $1, KK
  1814. #endif
  1815. #ifdef RT
  1816. movq K, %rax
  1817. movq BORIG, B
  1818. salq $0 + ZBASE_SHIFT, %rax
  1819. addq %rax, AORIG
  1820. #endif
  1821. ALIGN_4
  1822. .L99:
  1823. #ifdef LN
  1824. leaq (, K, SIZE), %rax
  1825. leaq (B, %rax, 4), B
  1826. #endif
  1827. #if defined(LT) || defined(RN)
  1828. movq K, %rax
  1829. subq KK, %rax
  1830. leaq (,%rax, SIZE), %rax
  1831. leaq (B, %rax, 2 * COMPSIZE), B
  1832. #endif
  1833. #ifdef RN
  1834. addq $2, KK
  1835. #endif
  1836. #ifdef RT
  1837. subq $2, KK
  1838. #endif
  1839. decq J # j --
  1840. jg .L01
  1841. ALIGN_3
  1842. .L999:
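/* Restore the callee-saved registers (plus the SSE registers and rdi/rsi    */
/* under WINDOWS_ABI) and the original stack pointer, then return.           */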
  1843. movq %rbx, %rsp
  1844. movq 0(%rsp), %rbx
  1845. movq 8(%rsp), %rbp
  1846. movq 16(%rsp), %r12
  1847. movq 24(%rsp), %r13
  1848. movq 32(%rsp), %r14
  1849. movq 40(%rsp), %r15
  1850. #ifdef WINDOWS_ABI
  1851. movq 48(%rsp), %rdi
  1852. movq 56(%rsp), %rsi
  1853. movups 64(%rsp), %xmm6
  1854. movups 80(%rsp), %xmm7
  1855. movups 96(%rsp), %xmm8
  1856. movups 112(%rsp), %xmm9
  1857. movups 128(%rsp), %xmm10
  1858. movups 144(%rsp), %xmm11
  1859. movups 160(%rsp), %xmm12
  1860. movups 176(%rsp), %xmm13
  1861. movups 192(%rsp), %xmm14
  1862. movups 208(%rsp), %xmm15
  1863. #endif
  1864. addq $STACKSIZE, %rsp
  1865. ret
  1866. EPILOGUE