
ztrsm_kernel_LT_2x2_sse2.S 43 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define M %r13
  43. #define N %r14
  44. #define K %rdx
  45. #define A %rcx
  46. #define B %r8
  47. #define C %r9
  48. #define LDC %r10
  49. #define I %r11
  50. #define J %r12
  51. #define AO %rdi
  52. #define BO %rsi
  53. #define CO1 %r15
  54. #define CO2 %rbp
  55. #ifndef WINDOWS_ABI
  56. #define STACKSIZE 64
  57. #define OLD_LDC 8 + STACKSIZE(%rsp)
  58. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  59. #else
  60. #define STACKSIZE 256
  61. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  62. #define OLD_A 48 + STACKSIZE(%rsp)
  63. #define OLD_B 56 + STACKSIZE(%rsp)
  64. #define OLD_C 64 + STACKSIZE(%rsp)
  65. #define OLD_LDC 72 + STACKSIZE(%rsp)
  66. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  67. #endif
  68. #define POSINV 0(%rsp)
  69. #define ALPHA_R 16(%rsp)
  70. #define ALPHA_I 32(%rsp)
  71. #define OFFSET 40(%rsp)
  72. #define KK 48(%rsp)
  73. #define KKK 56(%rsp)
  74. #define AORIG 64(%rsp)
  75. #define BORIG 72(%rsp)
  76. #define BUFFER 128(%rsp)
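# Local stack workspace (a sketch of the layout, read from the defines above):
# POSINV holds the sign mask used to negate imaginary parts, OFFSET/KK hold the
# solve offset, AORIG/BORIG save the original A/B pointers, and BUFFER receives
# the expanded copy of the packed B panel.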
  77. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
  78. #define PREFETCH prefetch
  79. #define PREFETCHW prefetchw
  80. #define PREFETCHNTA prefetchnta
  81. #define PREFETCHSIZE (8 * 6 + 4)
  82. #endif
  83. #ifdef GENERIC
  84. #define PREFETCH prefetcht0
  85. #define PREFETCHW prefetcht0
  86. #define PREFETCHNTA prefetchnta
  87. #define PREFETCHSIZE (8 * 6 + 4)
  88. #endif
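# KERNEL1-KERNEL8 each perform one unrolled step of the 2x2 inner product:
# A values in %xmm8/%xmm10/%xmm12/%xmm14 are multiplied by duplicated B values
# in %xmm9/%xmm11/%xmm13/%xmm15 and accumulated into %xmm0-%xmm7; KERNEL1 and
# KERNEL5 additionally prefetch ahead in A.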
  89. #define KERNEL1(xx) \
  90. mulpd %xmm8, %xmm9 ;\
  91. addpd %xmm9, %xmm0 ;\
  92. movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  93. mulpd %xmm8, %xmm11 ;\
  94. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
  95. addpd %xmm11, %xmm1 ;\
  96. movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  97. mulpd %xmm8, %xmm13 ;\
  98. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
  99. addpd %xmm13, %xmm2 ;\
  100. movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  101. addpd %xmm8, %xmm3 ;\
  102. movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
  103. #define KERNEL2(xx) \
  104. mulpd %xmm10, %xmm9 ;\
  105. addpd %xmm9, %xmm4 ;\
  106. movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  107. mulpd %xmm10, %xmm11 ;\
  108. addpd %xmm11, %xmm5 ;\
  109. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  110. mulpd %xmm10, %xmm13 ;\
  111. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
  112. addpd %xmm13, %xmm6 ;\
  113. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  114. addpd %xmm10, %xmm7 ;\
  115. movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
  116. #define KERNEL3(xx) \
  117. mulpd %xmm12, %xmm15 ;\
  118. addpd %xmm15, %xmm0 ;\
  119. movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  120. mulpd %xmm12, %xmm11 ;\
  121. addpd %xmm11, %xmm1 ;\
  122. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  123. mulpd %xmm12, %xmm13 ;\
  124. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
  125. addpd %xmm13, %xmm2 ;\
  126. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  127. addpd %xmm12, %xmm3 ;\
  128. movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
  129. #define KERNEL4(xx) \
  130. mulpd %xmm14, %xmm15 ;\
  131. addpd %xmm15, %xmm4 ;\
  132. movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  133. mulpd %xmm14, %xmm11 ;\
  134. addpd %xmm11, %xmm5 ;\
  135. movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  136. mulpd %xmm14, %xmm13 ;\
  137. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
  138. addpd %xmm13, %xmm6 ;\
  139. movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  140. addpd %xmm14, %xmm7 ;\
  141. movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
  142. #define KERNEL5(xx) \
  143. mulpd %xmm8, %xmm9 ;\
  144. addpd %xmm9, %xmm0 ;\
  145. movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  146. mulpd %xmm8, %xmm11 ;\
  147. PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
  148. addpd %xmm11, %xmm1 ;\
  149. movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  150. mulpd %xmm8, %xmm13 ;\
  151. mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
  152. addpd %xmm13, %xmm2 ;\
  153. movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  154. addpd %xmm8, %xmm3 ;\
  155. movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
  156. #define KERNEL6(xx) \
  157. mulpd %xmm10, %xmm9 ;\
  158. addpd %xmm9, %xmm4 ;\
  159. movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  160. mulpd %xmm10, %xmm11 ;\
  161. addpd %xmm11, %xmm5 ;\
  162. movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  163. mulpd %xmm10, %xmm13 ;\
  164. mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
  165. addpd %xmm13, %xmm6 ;\
  166. movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  167. addpd %xmm10, %xmm7 ;\
  168. movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
  169. #define KERNEL7(xx) \
  170. mulpd %xmm12, %xmm15 ;\
  171. addpd %xmm15, %xmm0 ;\
  172. movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  173. mulpd %xmm12, %xmm11 ;\
  174. addpd %xmm11, %xmm1 ;\
  175. movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  176. mulpd %xmm12, %xmm13 ;\
  177. mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
  178. addpd %xmm13, %xmm2 ;\
  179. movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  180. addpd %xmm12, %xmm3 ;\
  181. movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
  182. #define KERNEL8(xx) \
  183. mulpd %xmm14, %xmm15 ;\
  184. addpd %xmm15, %xmm4 ;\
  185. movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  186. mulpd %xmm14, %xmm11 ;\
  187. addpd %xmm11, %xmm5 ;\
  188. movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  189. mulpd %xmm14, %xmm13 ;\
  190. mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
  191. addpd %xmm13, %xmm6 ;\
  192. movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  193. addpd %xmm14, %xmm7 ;\
  194. movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
  195. #ifndef CONJ
  196. #define NN
  197. #else
  198. #if defined(LN) || defined(LT)
  199. #define CN
  200. #else
  201. #define NC
  202. #endif
  203. #endif
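# Select the conjugation variant: NN when CONJ is not defined, otherwise CN for
# the left-side cases (LN/LT) and NC for the right-side cases. These defines
# choose which accumulators are sign-flipped in the fix-up code below.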
  204. PROLOGUE
  205. PROFCODE
  206. subq $STACKSIZE, %rsp
  207. movq %rbx, 0(%rsp)
  208. movq %rbp, 8(%rsp)
  209. movq %r12, 16(%rsp)
  210. movq %r13, 24(%rsp)
  211. movq %r14, 32(%rsp)
  212. movq %r15, 40(%rsp)
  213. #ifdef WINDOWS_ABI
  214. movq %rdi, 48(%rsp)
  215. movq %rsi, 56(%rsp)
  216. movups %xmm6, 64(%rsp)
  217. movups %xmm7, 80(%rsp)
  218. movups %xmm8, 96(%rsp)
  219. movups %xmm9, 112(%rsp)
  220. movups %xmm10, 128(%rsp)
  221. movups %xmm11, 144(%rsp)
  222. movups %xmm12, 160(%rsp)
  223. movups %xmm13, 176(%rsp)
  224. movups %xmm14, 192(%rsp)
  225. movups %xmm15, 208(%rsp)
  226. movq ARG1, OLD_M
  227. movq ARG2, OLD_N
  228. movq ARG3, K
  229. movq OLD_A, A
  230. movq OLD_B, B
  231. movq OLD_C, C
  232. movq OLD_LDC, LDC
  233. movsd OLD_OFFSET, %xmm4
  234. movaps %xmm3, %xmm0
  235. #else
  236. movq OLD_LDC, LDC
  237. movsd OLD_OFFSET, %xmm4
  238. #endif
  239. movq %rsp, %rbx # save old stack
  240. subq $128 + LOCAL_BUFFER_SIZE, %rsp
  241. andq $-4096, %rsp # align stack
  242. STACK_TOUCHING
  243. movq OLD_M, M
  244. movq OLD_N, N
  245. pcmpeqb %xmm15, %xmm15
  246. psllq $63, %xmm15 # Generate mask
  247. pxor %xmm2, %xmm2
  248. movlpd %xmm2, 0 + POSINV
  249. movlpd %xmm15, 8 + POSINV
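# POSINV now holds { +0.0, -0.0 }; xorpd with it flips the sign of the high
# (imaginary) element only.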
  250. movlpd %xmm4, OFFSET
  251. movlpd %xmm4, KK
  252. salq $ZBASE_SHIFT, LDC
  253. #ifdef LN
  254. movq M, %rax
  255. salq $ZBASE_SHIFT, %rax
  256. addq %rax, C
  257. imulq K, %rax
  258. addq %rax, A
  259. #endif
  260. #ifdef RT
  261. movq N, %rax
  262. salq $ZBASE_SHIFT, %rax
  263. imulq K, %rax
  264. addq %rax, B
  265. movq N, %rax
  266. imulq LDC, %rax
  267. addq %rax, C
  268. #endif
  269. #ifdef RN
  270. negq KK
  271. #endif
  272. #ifdef RT
  273. movq N, %rax
  274. subq OFFSET, %rax
  275. movq %rax, KK
  276. #endif
  277. movq N, J
  278. sarq $1, J # j = (n >> 1)
  279. jle .L100
  280. ALIGN_4
  281. .L01:
  282. #ifdef LN
  283. movq OFFSET, %rax
  284. addq M, %rax
  285. movq %rax, KK
  286. #endif
  287. leaq BUFFER, BO
  288. #ifdef RT
  289. movq K, %rax
  290. salq $1 + ZBASE_SHIFT, %rax
  291. subq %rax, B
  292. #endif
  293. #if defined(LN) || defined(RT)
  294. movq KK, %rax
  295. movq B, BORIG
  296. salq $ZBASE_SHIFT, %rax
  297. leaq (B, %rax, 2), B
  298. leaq (BO, %rax, 4), BO
  299. #endif
  300. #if defined(LT)
  301. movq OFFSET, %rax
  302. movq %rax, KK
  303. #endif
  304. #if defined(LT) || defined(RN)
  305. movq KK, %rax
  306. #else
  307. movq K, %rax
  308. subq KK, %rax
  309. #endif
  310. sarq $2, %rax
  311. jle .L03
  312. addq %rax, %rax
  313. ALIGN_4
  314. .L02:
  315. PREFETCHNTA 56 * SIZE(B)
  316. movlpd 0 * SIZE(B), %xmm0
  317. movlpd 1 * SIZE(B), %xmm1
  318. movlpd 2 * SIZE(B), %xmm2
  319. movlpd 3 * SIZE(B), %xmm3
  320. movlpd 4 * SIZE(B), %xmm4
  321. movlpd 5 * SIZE(B), %xmm5
  322. movlpd 6 * SIZE(B), %xmm6
  323. movlpd 7 * SIZE(B), %xmm7
  324. movlpd %xmm0, 0 * SIZE(BO)
  325. movlpd %xmm0, 1 * SIZE(BO)
  326. movlpd %xmm1, 2 * SIZE(BO)
  327. movlpd %xmm1, 3 * SIZE(BO)
  328. movlpd %xmm2, 4 * SIZE(BO)
  329. movlpd %xmm2, 5 * SIZE(BO)
  330. movlpd %xmm3, 6 * SIZE(BO)
  331. movlpd %xmm3, 7 * SIZE(BO)
  332. movlpd %xmm4, 8 * SIZE(BO)
  333. movlpd %xmm4, 9 * SIZE(BO)
  334. movlpd %xmm5, 10 * SIZE(BO)
  335. movlpd %xmm5, 11 * SIZE(BO)
  336. movlpd %xmm6, 12 * SIZE(BO)
  337. movlpd %xmm6, 13 * SIZE(BO)
  338. movlpd %xmm7, 14 * SIZE(BO)
  339. movlpd %xmm7, 15 * SIZE(BO)
  340. subq $-16 * SIZE, BO
  341. addq $ 8 * SIZE, B
  342. decq %rax
  343. jne .L02
  344. ALIGN_4
  345. .L03:
  346. #if defined(LT) || defined(RN)
  347. movq KK, %rax
  348. #else
  349. movq K, %rax
  350. subq KK, %rax
  351. #endif
  352. andq $3, %rax
  353. BRANCH
  354. jle .L05
  355. ALIGN_4
  356. .L04:
  357. movlpd 0 * SIZE(B), %xmm0
  358. movlpd 1 * SIZE(B), %xmm1
  359. movlpd 2 * SIZE(B), %xmm2
  360. movlpd 3 * SIZE(B), %xmm3
  361. movlpd %xmm0, 0 * SIZE(BO)
  362. movlpd %xmm0, 1 * SIZE(BO)
  363. movlpd %xmm1, 2 * SIZE(BO)
  364. movlpd %xmm1, 3 * SIZE(BO)
  365. movlpd %xmm2, 4 * SIZE(BO)
  366. movlpd %xmm2, 5 * SIZE(BO)
  367. movlpd %xmm3, 6 * SIZE(BO)
  368. movlpd %xmm3, 7 * SIZE(BO)
  369. addq $ 4 * SIZE, B
  370. addq $ 8 * SIZE, BO
  371. decq %rax
  372. jne .L04
  373. ALIGN_4
  374. .L05:
  375. #if defined(LT) || defined(RN)
  376. movq A, AO
  377. #else
  378. movq A, AORIG
  379. #endif
  380. #ifdef RT
  381. leaq (, LDC, 2), %rax
  382. subq %rax, C
  383. #endif
  384. movq C, CO1
  385. leaq (C, LDC, 1), CO2
  386. #ifndef RT
  387. leaq (C, LDC, 2), C
  388. #endif
  389. movq M, I
  390. sarq $1, I # i = (m >> 1)
  391. jle .L30
  392. ALIGN_4
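# Main loop over row pairs (i = m >> 1): accumulate a 2x2 block of complex
# results in %xmm0-%xmm7, then solve against the diagonal block and store.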
  393. .L10:
  394. #ifdef LN
  395. movq K, %rax
  396. salq $1 + ZBASE_SHIFT, %rax
  397. subq %rax, AORIG
  398. #endif
  399. #if defined(LN) || defined(RT)
  400. movq KK, %rax
  401. movq AORIG, AO
  402. salq $ZBASE_SHIFT, %rax
  403. leaq (AO, %rax, 2), AO
  404. #endif
  405. leaq BUFFER, BO
  406. #if defined(LN) || defined(RT)
  407. movq KK, %rax
  408. salq $1 + ZBASE_SHIFT, %rax
  409. leaq (BO, %rax, 2), BO
  410. #endif
  411. movapd 0 * SIZE(AO), %xmm8
  412. pxor %xmm0, %xmm0
  413. movapd 2 * SIZE(AO), %xmm10
  414. pxor %xmm1, %xmm1
  415. movapd 4 * SIZE(AO), %xmm12
  416. pxor %xmm2, %xmm2
  417. movapd 6 * SIZE(AO), %xmm14
  418. pxor %xmm3, %xmm3
  419. movapd 0 * SIZE(BO), %xmm9
  420. pxor %xmm4, %xmm4
  421. movapd 2 * SIZE(BO), %xmm11
  422. pxor %xmm5, %xmm5
  423. movapd 4 * SIZE(BO), %xmm13
  424. movapd 8 * SIZE(BO), %xmm15
  425. PREFETCHW 4 * SIZE(CO1)
  426. pxor %xmm6, %xmm6
  427. PREFETCHW 4 * SIZE(CO2)
  428. pxor %xmm7, %xmm7
  429. #if defined(LT) || defined(RN)
  430. movq KK, %rax
  431. #else
  432. movq K, %rax
  433. subq KK, %rax
  434. #endif
  435. andq $-8, %rax
  436. salq $4, %rax
  437. je .L15
  438. .L1X:
  439. KERNEL1(16 * 0)
  440. KERNEL2(16 * 0)
  441. KERNEL3(16 * 0)
  442. KERNEL4(16 * 0)
  443. KERNEL5(16 * 0)
  444. KERNEL6(16 * 0)
  445. KERNEL7(16 * 0)
  446. KERNEL8(16 * 0)
  447. KERNEL1(16 * 1)
  448. KERNEL2(16 * 1)
  449. KERNEL3(16 * 1)
  450. KERNEL4(16 * 1)
  451. KERNEL5(16 * 1)
  452. KERNEL6(16 * 1)
  453. KERNEL7(16 * 1)
  454. KERNEL8(16 * 1)
  455. cmpq $64 * 2, %rax
  456. jle .L12
  457. KERNEL1(16 * 2)
  458. KERNEL2(16 * 2)
  459. KERNEL3(16 * 2)
  460. KERNEL4(16 * 2)
  461. KERNEL5(16 * 2)
  462. KERNEL6(16 * 2)
  463. KERNEL7(16 * 2)
  464. KERNEL8(16 * 2)
  465. KERNEL1(16 * 3)
  466. KERNEL2(16 * 3)
  467. KERNEL3(16 * 3)
  468. KERNEL4(16 * 3)
  469. KERNEL5(16 * 3)
  470. KERNEL6(16 * 3)
  471. KERNEL7(16 * 3)
  472. KERNEL8(16 * 3)
  473. cmpq $64 * 4, %rax
  474. jle .L12
  475. KERNEL1(16 * 4)
  476. KERNEL2(16 * 4)
  477. KERNEL3(16 * 4)
  478. KERNEL4(16 * 4)
  479. KERNEL5(16 * 4)
  480. KERNEL6(16 * 4)
  481. KERNEL7(16 * 4)
  482. KERNEL8(16 * 4)
  483. KERNEL1(16 * 5)
  484. KERNEL2(16 * 5)
  485. KERNEL3(16 * 5)
  486. KERNEL4(16 * 5)
  487. KERNEL5(16 * 5)
  488. KERNEL6(16 * 5)
  489. KERNEL7(16 * 5)
  490. KERNEL8(16 * 5)
  491. cmpq $64 * 6, %rax
  492. jle .L12
  493. KERNEL1(16 * 6)
  494. KERNEL2(16 * 6)
  495. KERNEL3(16 * 6)
  496. KERNEL4(16 * 6)
  497. KERNEL5(16 * 6)
  498. KERNEL6(16 * 6)
  499. KERNEL7(16 * 6)
  500. KERNEL8(16 * 6)
  501. KERNEL1(16 * 7)
  502. KERNEL2(16 * 7)
  503. KERNEL3(16 * 7)
  504. KERNEL4(16 * 7)
  505. KERNEL5(16 * 7)
  506. KERNEL6(16 * 7)
  507. KERNEL7(16 * 7)
  508. KERNEL8(16 * 7)
  509. addq $16 * 8 * SIZE, AO
  510. addq $32 * 8 * SIZE, BO
  511. subq $64 * 8, %rax
  512. jg .L1X
  513. .L12:
  514. leaq (AO, %rax, 2), AO # * 16
  515. leaq (BO, %rax, 4), BO # * 64
  516. ALIGN_4
  517. .L15:
  518. #if defined(LT) || defined(RN)
  519. movq KK, %rax
  520. #else
  521. movq K, %rax
  522. subq KK, %rax
  523. #endif
  524. movapd POSINV, %xmm15
  525. andq $7, %rax # if (k & 7)
  526. BRANCH
  527. je .L19
  528. ALIGN_4
  529. .L16:
  530. mulpd %xmm8, %xmm9
  531. addpd %xmm9, %xmm0
  532. movapd 2 * SIZE(BO), %xmm9
  533. mulpd %xmm8, %xmm9
  534. addpd %xmm9, %xmm1
  535. movapd 4 * SIZE(BO), %xmm9
  536. mulpd %xmm8, %xmm9
  537. mulpd 6 * SIZE(BO), %xmm8
  538. addpd %xmm9, %xmm2
  539. movapd 0 * SIZE(BO), %xmm9
  540. addpd %xmm8, %xmm3
  541. movapd 4 * SIZE(AO), %xmm8
  542. mulpd %xmm10, %xmm9
  543. addpd %xmm9, %xmm4
  544. movapd 2 * SIZE(BO), %xmm9
  545. mulpd %xmm10, %xmm9
  546. addpd %xmm9, %xmm5
  547. movapd 4 * SIZE(BO), %xmm9
  548. mulpd %xmm10, %xmm9
  549. mulpd 6 * SIZE(BO), %xmm10
  550. addpd %xmm9, %xmm6
  551. movapd 8 * SIZE(BO), %xmm9
  552. addpd %xmm10, %xmm7
  553. movapd 6 * SIZE(AO), %xmm10
  554. addq $4 * SIZE, AO # aoffset += 4
  555. addq $8 * SIZE, BO # boffset1 += 8
  556. decq %rax
  557. jg .L16
  558. ALIGN_4
  559. .L19:
  560. #if defined(LN) || defined(RT)
  561. movq KK, %rax
  562. #ifdef LN
  563. subq $2, %rax
  564. #else
  565. subq $2, %rax
  566. #endif
  567. movq AORIG, AO
  568. movq BORIG, B
  569. leaq BUFFER, BO
  570. salq $ZBASE_SHIFT, %rax
  571. leaq (AO, %rax, 2), AO
  572. leaq (B, %rax, 2), B
  573. leaq (BO, %rax, 4), BO
  574. #endif
  575. SHUFPD_1 %xmm1, %xmm1
  576. SHUFPD_1 %xmm3, %xmm3
  577. SHUFPD_1 %xmm5, %xmm5
  578. SHUFPD_1 %xmm7, %xmm7
  579. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  580. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  581. xorpd %xmm15, %xmm1
  582. xorpd %xmm15, %xmm3
  583. xorpd %xmm15, %xmm5
  584. xorpd %xmm15, %xmm7
  585. #else
  586. xorpd %xmm15, %xmm0
  587. xorpd %xmm15, %xmm2
  588. xorpd %xmm15, %xmm4
  589. xorpd %xmm15, %xmm6
  590. #endif
  591. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  592. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  593. subpd %xmm1, %xmm0
  594. subpd %xmm3, %xmm2
  595. subpd %xmm5, %xmm4
  596. subpd %xmm7, %xmm6
  597. #else
  598. addpd %xmm1, %xmm0
  599. addpd %xmm3, %xmm2
  600. addpd %xmm5, %xmm4
  601. addpd %xmm7, %xmm6
  602. #endif
  603. #if defined(LN) || defined(LT)
  604. movapd 0 * SIZE(B), %xmm1
  605. movapd 2 * SIZE(B), %xmm3
  606. movapd 4 * SIZE(B), %xmm5
  607. movapd 6 * SIZE(B), %xmm7
  608. subpd %xmm0, %xmm1
  609. subpd %xmm2, %xmm3
  610. subpd %xmm4, %xmm5
  611. subpd %xmm6, %xmm7
  612. #else
  613. movapd 0 * SIZE(AO), %xmm1
  614. movapd 2 * SIZE(AO), %xmm5
  615. movapd 4 * SIZE(AO), %xmm3
  616. movapd 6 * SIZE(AO), %xmm7
  617. subpd %xmm0, %xmm1
  618. subpd %xmm2, %xmm3
  619. subpd %xmm4, %xmm5
  620. subpd %xmm6, %xmm7
  621. #endif
  622. #ifndef CONJ
  623. SHUFPD_1 %xmm15, %xmm15
  624. #endif
  625. #ifdef LN
  626. movlpd 6 * SIZE(AO), %xmm8
  627. movhpd 6 * SIZE(AO), %xmm8
  628. movlpd 7 * SIZE(AO), %xmm9
  629. movhpd 7 * SIZE(AO), %xmm9
  630. movlpd 4 * SIZE(AO), %xmm10
  631. movhpd 4 * SIZE(AO), %xmm10
  632. movlpd 5 * SIZE(AO), %xmm11
  633. movhpd 5 * SIZE(AO), %xmm11
  634. movlpd 0 * SIZE(AO), %xmm12
  635. movhpd 0 * SIZE(AO), %xmm12
  636. movlpd 1 * SIZE(AO), %xmm13
  637. movhpd 1 * SIZE(AO), %xmm13
  638. pshufd $0x4e, %xmm5, %xmm4
  639. pshufd $0x4e, %xmm7, %xmm6
  640. xorpd %xmm15, %xmm4
  641. xorpd %xmm15, %xmm6
  642. mulpd %xmm8, %xmm5
  643. mulpd %xmm9, %xmm4
  644. mulpd %xmm8, %xmm7
  645. mulpd %xmm9, %xmm6
  646. addpd %xmm4, %xmm5
  647. addpd %xmm6, %xmm7
  648. movapd %xmm5, %xmm0
  649. movapd %xmm7, %xmm2
  650. pshufd $0x4e, %xmm5, %xmm4
  651. pshufd $0x4e, %xmm7, %xmm6
  652. xorpd %xmm15, %xmm4
  653. xorpd %xmm15, %xmm6
  654. mulpd %xmm10, %xmm0
  655. mulpd %xmm10, %xmm2
  656. mulpd %xmm11, %xmm4
  657. mulpd %xmm11, %xmm6
  658. subpd %xmm0, %xmm1
  659. subpd %xmm2, %xmm3
  660. subpd %xmm4, %xmm1
  661. subpd %xmm6, %xmm3
  662. pshufd $0x4e, %xmm1, %xmm0
  663. pshufd $0x4e, %xmm3, %xmm2
  664. xorpd %xmm15, %xmm0
  665. xorpd %xmm15, %xmm2
  666. mulpd %xmm12, %xmm1
  667. mulpd %xmm13, %xmm0
  668. mulpd %xmm12, %xmm3
  669. mulpd %xmm13, %xmm2
  670. addpd %xmm0, %xmm1
  671. addpd %xmm2, %xmm3
  672. #endif
  673. #ifdef LT
  674. movlpd 0 * SIZE(AO), %xmm8
  675. movhpd 0 * SIZE(AO), %xmm8
  676. movlpd 1 * SIZE(AO), %xmm9
  677. movhpd 1 * SIZE(AO), %xmm9
  678. movlpd 2 * SIZE(AO), %xmm10
  679. movhpd 2 * SIZE(AO), %xmm10
  680. movlpd 3 * SIZE(AO), %xmm11
  681. movhpd 3 * SIZE(AO), %xmm11
  682. movlpd 6 * SIZE(AO), %xmm12
  683. movhpd 6 * SIZE(AO), %xmm12
  684. movlpd 7 * SIZE(AO), %xmm13
  685. movhpd 7 * SIZE(AO), %xmm13
  686. pshufd $0x4e, %xmm1, %xmm0
  687. pshufd $0x4e, %xmm3, %xmm2
  688. xorpd %xmm15, %xmm0
  689. xorpd %xmm15, %xmm2
  690. mulpd %xmm8, %xmm1
  691. mulpd %xmm9, %xmm0
  692. mulpd %xmm8, %xmm3
  693. mulpd %xmm9, %xmm2
  694. addpd %xmm0, %xmm1
  695. addpd %xmm2, %xmm3
  696. movapd %xmm1, %xmm0
  697. movapd %xmm3, %xmm2
  698. pshufd $0x4e, %xmm1, %xmm4
  699. pshufd $0x4e, %xmm3, %xmm6
  700. xorpd %xmm15, %xmm4
  701. xorpd %xmm15, %xmm6
  702. mulpd %xmm10, %xmm0
  703. mulpd %xmm10, %xmm2
  704. mulpd %xmm11, %xmm4
  705. mulpd %xmm11, %xmm6
  706. subpd %xmm0, %xmm5
  707. subpd %xmm2, %xmm7
  708. subpd %xmm4, %xmm5
  709. subpd %xmm6, %xmm7
  710. pshufd $0x4e, %xmm5, %xmm4
  711. pshufd $0x4e, %xmm7, %xmm6
  712. xorpd %xmm15, %xmm4
  713. xorpd %xmm15, %xmm6
  714. mulpd %xmm12, %xmm5
  715. mulpd %xmm13, %xmm4
  716. mulpd %xmm12, %xmm7
  717. mulpd %xmm13, %xmm6
  718. addpd %xmm4, %xmm5
  719. addpd %xmm6, %xmm7
  720. #endif
  721. #ifdef RN
  722. movlpd 0 * SIZE(B), %xmm8
  723. movhpd 0 * SIZE(B), %xmm8
  724. movlpd 1 * SIZE(B), %xmm9
  725. movhpd 1 * SIZE(B), %xmm9
  726. movlpd 2 * SIZE(B), %xmm10
  727. movhpd 2 * SIZE(B), %xmm10
  728. movlpd 3 * SIZE(B), %xmm11
  729. movhpd 3 * SIZE(B), %xmm11
  730. movlpd 6 * SIZE(B), %xmm12
  731. movhpd 6 * SIZE(B), %xmm12
  732. movlpd 7 * SIZE(B), %xmm13
  733. movhpd 7 * SIZE(B), %xmm13
  734. pshufd $0x4e, %xmm1, %xmm0
  735. pshufd $0x4e, %xmm5, %xmm4
  736. xorpd %xmm15, %xmm0
  737. xorpd %xmm15, %xmm4
  738. mulpd %xmm8, %xmm1
  739. mulpd %xmm9, %xmm0
  740. mulpd %xmm8, %xmm5
  741. mulpd %xmm9, %xmm4
  742. addpd %xmm0, %xmm1
  743. addpd %xmm4, %xmm5
  744. movapd %xmm1, %xmm0
  745. movapd %xmm5, %xmm2
  746. pshufd $0x4e, %xmm1, %xmm4
  747. pshufd $0x4e, %xmm5, %xmm6
  748. xorpd %xmm15, %xmm4
  749. xorpd %xmm15, %xmm6
  750. mulpd %xmm10, %xmm0
  751. mulpd %xmm10, %xmm2
  752. mulpd %xmm11, %xmm4
  753. mulpd %xmm11, %xmm6
  754. subpd %xmm0, %xmm3
  755. subpd %xmm2, %xmm7
  756. subpd %xmm4, %xmm3
  757. subpd %xmm6, %xmm7
  758. pshufd $0x4e, %xmm3, %xmm2
  759. pshufd $0x4e, %xmm7, %xmm6
  760. xorpd %xmm15, %xmm2
  761. xorpd %xmm15, %xmm6
  762. mulpd %xmm12, %xmm3
  763. mulpd %xmm13, %xmm2
  764. mulpd %xmm12, %xmm7
  765. mulpd %xmm13, %xmm6
  766. addpd %xmm2, %xmm3
  767. addpd %xmm6, %xmm7
  768. #endif
  769. #ifdef RT
  770. movlpd 6 * SIZE(B), %xmm8
  771. movhpd 6 * SIZE(B), %xmm8
  772. movlpd 7 * SIZE(B), %xmm9
  773. movhpd 7 * SIZE(B), %xmm9
  774. movlpd 4 * SIZE(B), %xmm10
  775. movhpd 4 * SIZE(B), %xmm10
  776. movlpd 5 * SIZE(B), %xmm11
  777. movhpd 5 * SIZE(B), %xmm11
  778. movlpd 0 * SIZE(B), %xmm12
  779. movhpd 0 * SIZE(B), %xmm12
  780. movlpd 1 * SIZE(B), %xmm13
  781. movhpd 1 * SIZE(B), %xmm13
  782. pshufd $0x4e, %xmm3, %xmm2
  783. pshufd $0x4e, %xmm7, %xmm6
  784. xorpd %xmm15, %xmm2
  785. xorpd %xmm15, %xmm6
  786. mulpd %xmm8, %xmm3
  787. mulpd %xmm9, %xmm2
  788. mulpd %xmm8, %xmm7
  789. mulpd %xmm9, %xmm6
  790. addpd %xmm2, %xmm3
  791. addpd %xmm6, %xmm7
  792. movapd %xmm3, %xmm0
  793. movapd %xmm7, %xmm2
  794. pshufd $0x4e, %xmm3, %xmm4
  795. pshufd $0x4e, %xmm7, %xmm6
  796. xorpd %xmm15, %xmm4
  797. xorpd %xmm15, %xmm6
  798. mulpd %xmm10, %xmm0
  799. mulpd %xmm10, %xmm2
  800. mulpd %xmm11, %xmm4
  801. mulpd %xmm11, %xmm6
  802. subpd %xmm0, %xmm1
  803. subpd %xmm2, %xmm5
  804. subpd %xmm4, %xmm1
  805. subpd %xmm6, %xmm5
  806. pshufd $0x4e, %xmm1, %xmm0
  807. pshufd $0x4e, %xmm5, %xmm4
  808. xorpd %xmm15, %xmm0
  809. xorpd %xmm15, %xmm4
  810. mulpd %xmm12, %xmm1
  811. mulpd %xmm13, %xmm0
  812. mulpd %xmm12, %xmm5
  813. mulpd %xmm13, %xmm4
  814. addpd %xmm0, %xmm1
  815. addpd %xmm4, %xmm5
  816. #endif
  817. #ifdef LN
  818. subq $4 * SIZE, CO1
  819. subq $4 * SIZE, CO2
  820. #endif
  821. movsd %xmm1, 0 * SIZE(CO1)
  822. movhpd %xmm1, 1 * SIZE(CO1)
  823. movsd %xmm5, 2 * SIZE(CO1)
  824. movhpd %xmm5, 3 * SIZE(CO1)
  825. movsd %xmm3, 0 * SIZE(CO2)
  826. movhpd %xmm3, 1 * SIZE(CO2)
  827. movsd %xmm7, 2 * SIZE(CO2)
  828. movhpd %xmm7, 3 * SIZE(CO2)
  829. #if defined(LN) || defined(LT)
  830. movapd %xmm1, 0 * SIZE(B)
  831. movapd %xmm3, 2 * SIZE(B)
  832. movapd %xmm5, 4 * SIZE(B)
  833. movapd %xmm7, 6 * SIZE(B)
  834. movlpd %xmm1, 0 * SIZE(BO)
  835. movlpd %xmm1, 1 * SIZE(BO)
  836. movhpd %xmm1, 2 * SIZE(BO)
  837. movhpd %xmm1, 3 * SIZE(BO)
  838. movlpd %xmm3, 4 * SIZE(BO)
  839. movlpd %xmm3, 5 * SIZE(BO)
  840. movhpd %xmm3, 6 * SIZE(BO)
  841. movhpd %xmm3, 7 * SIZE(BO)
  842. movlpd %xmm5, 8 * SIZE(BO)
  843. movlpd %xmm5, 9 * SIZE(BO)
  844. movhpd %xmm5, 10 * SIZE(BO)
  845. movhpd %xmm5, 11 * SIZE(BO)
  846. movlpd %xmm7, 12 * SIZE(BO)
  847. movlpd %xmm7, 13 * SIZE(BO)
  848. movhpd %xmm7, 14 * SIZE(BO)
  849. movhpd %xmm7, 15 * SIZE(BO)
  850. #else
  851. movapd %xmm1, 0 * SIZE(AO)
  852. movapd %xmm5, 2 * SIZE(AO)
  853. movapd %xmm3, 4 * SIZE(AO)
  854. movapd %xmm7, 6 * SIZE(AO)
  855. #endif
  856. #ifndef LN
  857. addq $4 * SIZE, CO1
  858. addq $4 * SIZE, CO2
  859. #endif
  860. #if defined(LT) || defined(RN)
  861. movq K, %rax
  862. subq KK, %rax
  863. salq $ZBASE_SHIFT, %rax
  864. leaq (AO, %rax, 2), AO
  865. #ifdef LT
  866. addq $8 * SIZE, B
  867. #endif
  868. #endif
  869. #ifdef LN
  870. subq $2, KK
  871. movq BORIG, B
  872. #endif
  873. #ifdef LT
  874. addq $2, KK
  875. #endif
  876. #ifdef RT
  877. movq K, %rax
  878. movq BORIG, B
  879. salq $1 + ZBASE_SHIFT, %rax
  880. addq %rax, AORIG
  881. #endif
  882. decq I # i --
  883. jg .L10
  884. ALIGN_4
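# Remainder: a single leftover row when M is odd, still for the current two columns.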
  885. .L30:
  886. testq $1, M
  887. jle .L99
  888. #ifdef LN
  889. movq K, %rax
  890. salq $0 + ZBASE_SHIFT, %rax
  891. subq %rax, AORIG
  892. #endif
  893. #if defined(LN) || defined(RT)
  894. movq KK, %rax
  895. movq AORIG, AO
  896. salq $ZBASE_SHIFT, %rax
  897. addq %rax, AO
  898. #endif
  899. leaq BUFFER, BO
  900. #if defined(LN) || defined(RT)
  901. movq KK, %rax
  902. salq $1 + ZBASE_SHIFT, %rax
  903. leaq (BO, %rax, 2), BO
  904. #endif
  905. pxor %xmm0, %xmm0
  906. pxor %xmm1, %xmm1
  907. pxor %xmm2, %xmm2
  908. pxor %xmm3, %xmm3
  909. #if defined(LT) || defined(RN)
  910. movq KK, %rax
  911. #else
  912. movq K, %rax
  913. subq KK, %rax
  914. #endif
  915. sarq $2, %rax
  916. je .L42
  917. .L41:
  918. movapd 0 * SIZE(AO), %xmm8
  919. movapd 0 * SIZE(BO), %xmm9
  920. mulpd %xmm8, %xmm9
  921. addpd %xmm9, %xmm0
  922. movapd 2 * SIZE(BO), %xmm9
  923. mulpd %xmm8, %xmm9
  924. addpd %xmm9, %xmm1
  925. movapd 4 * SIZE(BO), %xmm9
  926. mulpd %xmm8, %xmm9
  927. addpd %xmm9, %xmm2
  928. movapd 6 * SIZE(BO), %xmm9
  929. mulpd %xmm8, %xmm9
  930. addpd %xmm9, %xmm3
  931. movapd 2 * SIZE(AO), %xmm8
  932. movapd 8 * SIZE(BO), %xmm9
  933. mulpd %xmm8, %xmm9
  934. addpd %xmm9, %xmm0
  935. movapd 10 * SIZE(BO), %xmm9
  936. mulpd %xmm8, %xmm9
  937. addpd %xmm9, %xmm1
  938. movapd 12 * SIZE(BO), %xmm9
  939. mulpd %xmm8, %xmm9
  940. addpd %xmm9, %xmm2
  941. movapd 14 * SIZE(BO), %xmm9
  942. mulpd %xmm8, %xmm9
  943. addpd %xmm9, %xmm3
  944. movapd 4 * SIZE(AO), %xmm8
  945. movapd 16 * SIZE(BO), %xmm9
  946. mulpd %xmm8, %xmm9
  947. addpd %xmm9, %xmm0
  948. movapd 18 * SIZE(BO), %xmm9
  949. mulpd %xmm8, %xmm9
  950. addpd %xmm9, %xmm1
  951. movapd 20 * SIZE(BO), %xmm9
  952. mulpd %xmm8, %xmm9
  953. addpd %xmm9, %xmm2
  954. movapd 22 * SIZE(BO), %xmm9
  955. mulpd %xmm8, %xmm9
  956. addpd %xmm9, %xmm3
  957. movapd 6 * SIZE(AO), %xmm8
  958. movapd 24 * SIZE(BO), %xmm9
  959. mulpd %xmm8, %xmm9
  960. addpd %xmm9, %xmm0
  961. movapd 26 * SIZE(BO), %xmm9
  962. mulpd %xmm8, %xmm9
  963. addpd %xmm9, %xmm1
  964. movapd 28 * SIZE(BO), %xmm9
  965. mulpd %xmm8, %xmm9
  966. addpd %xmm9, %xmm2
  967. movapd 30 * SIZE(BO), %xmm9
  968. mulpd %xmm8, %xmm9
  969. addpd %xmm9, %xmm3
  970. addq $ 8 * SIZE, AO
  971. addq $32 * SIZE, BO
  972. decq %rax
  973. jne .L41
  974. .L42:
  975. #if defined(LT) || defined(RN)
  976. movq KK, %rax
  977. #else
  978. movq K, %rax
  979. subq KK, %rax
  980. #endif
  981. movapd POSINV, %xmm15
  982. andq $3, %rax # if (k & 3)
  983. BRANCH
  984. jle .L44
  985. .L43:
  986. movapd 0 * SIZE(AO), %xmm8
  987. movapd 0 * SIZE(BO), %xmm9
  988. mulpd %xmm8, %xmm9
  989. addpd %xmm9, %xmm0
  990. movapd 2 * SIZE(BO), %xmm9
  991. mulpd %xmm8, %xmm9
  992. addpd %xmm9, %xmm1
  993. movapd 4 * SIZE(BO), %xmm9
  994. mulpd %xmm8, %xmm9
  995. addpd %xmm9, %xmm2
  996. movapd 6 * SIZE(BO), %xmm9
  997. mulpd %xmm8, %xmm9
  998. addpd %xmm9, %xmm3
  999. addq $2 * SIZE, AO # aoffset += 2
  1000. addq $8 * SIZE, BO # boffset1 += 8
  1001. decq %rax
  1002. jg .L43
  1003. ALIGN_4
  1004. .L44:
  1005. #if defined(LN) || defined(RT)
  1006. movq KK, %rax
  1007. #ifdef LN
  1008. subq $1, %rax
  1009. #else
  1010. subq $2, %rax
  1011. #endif
  1012. movq AORIG, AO
  1013. movq BORIG, B
  1014. leaq BUFFER, BO
  1015. salq $ZBASE_SHIFT, %rax
  1016. leaq (AO, %rax, 1), AO
  1017. leaq (B, %rax, 2), B
  1018. leaq (BO, %rax, 4), BO
  1019. #endif
  1020. SHUFPD_1 %xmm1, %xmm1
  1021. SHUFPD_1 %xmm3, %xmm3
  1022. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1023. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1024. xorpd %xmm15, %xmm1
  1025. xorpd %xmm15, %xmm3
  1026. #else
  1027. xorpd %xmm15, %xmm0
  1028. xorpd %xmm15, %xmm2
  1029. #endif
  1030. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1031. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1032. subpd %xmm1, %xmm0
  1033. subpd %xmm3, %xmm2
  1034. #else
  1035. addpd %xmm1, %xmm0
  1036. addpd %xmm3, %xmm2
  1037. #endif
  1038. #if defined(LN) || defined(LT)
  1039. movapd 0 * SIZE(B), %xmm1
  1040. movapd 2 * SIZE(B), %xmm3
  1041. subpd %xmm0, %xmm1
  1042. subpd %xmm2, %xmm3
  1043. #else
  1044. movapd 0 * SIZE(AO), %xmm1
  1045. movapd 2 * SIZE(AO), %xmm3
  1046. subpd %xmm0, %xmm1
  1047. subpd %xmm2, %xmm3
  1048. #endif
  1049. #ifndef CONJ
  1050. SHUFPD_1 %xmm15, %xmm15
  1051. #endif
  1052. #if defined(LN) || defined(LT)
  1053. movlpd 0 * SIZE(AO), %xmm8
  1054. movhpd 0 * SIZE(AO), %xmm8
  1055. movlpd 1 * SIZE(AO), %xmm9
  1056. movhpd 1 * SIZE(AO), %xmm9
  1057. pshufd $0x4e, %xmm1, %xmm0
  1058. pshufd $0x4e, %xmm3, %xmm2
  1059. xorpd %xmm15, %xmm0
  1060. xorpd %xmm15, %xmm2
  1061. mulpd %xmm8, %xmm1
  1062. mulpd %xmm9, %xmm0
  1063. mulpd %xmm8, %xmm3
  1064. mulpd %xmm9, %xmm2
  1065. addpd %xmm0, %xmm1
  1066. addpd %xmm2, %xmm3
  1067. #endif
  1068. #ifdef RN
  1069. movlpd 0 * SIZE(B), %xmm8
  1070. movhpd 0 * SIZE(B), %xmm8
  1071. movlpd 1 * SIZE(B), %xmm9
  1072. movhpd 1 * SIZE(B), %xmm9
  1073. movlpd 2 * SIZE(B), %xmm10
  1074. movhpd 2 * SIZE(B), %xmm10
  1075. movlpd 3 * SIZE(B), %xmm11
  1076. movhpd 3 * SIZE(B), %xmm11
  1077. movlpd 6 * SIZE(B), %xmm12
  1078. movhpd 6 * SIZE(B), %xmm12
  1079. movlpd 7 * SIZE(B), %xmm13
  1080. movhpd 7 * SIZE(B), %xmm13
  1081. pshufd $0x4e, %xmm1, %xmm0
  1082. xorpd %xmm15, %xmm0
  1083. mulpd %xmm8, %xmm1
  1084. mulpd %xmm9, %xmm0
  1085. addpd %xmm0, %xmm1
  1086. movapd %xmm1, %xmm0
  1087. pshufd $0x4e, %xmm1, %xmm4
  1088. xorpd %xmm15, %xmm4
  1089. mulpd %xmm10, %xmm0
  1090. mulpd %xmm11, %xmm4
  1091. subpd %xmm0, %xmm3
  1092. subpd %xmm4, %xmm3
  1093. pshufd $0x4e, %xmm3, %xmm2
  1094. xorpd %xmm15, %xmm2
  1095. mulpd %xmm12, %xmm3
  1096. mulpd %xmm13, %xmm2
  1097. addpd %xmm2, %xmm3
  1098. #endif
  1099. #ifdef RT
  1100. movlpd 6 * SIZE(B), %xmm8
  1101. movhpd 6 * SIZE(B), %xmm8
  1102. movlpd 7 * SIZE(B), %xmm9
  1103. movhpd 7 * SIZE(B), %xmm9
  1104. movlpd 4 * SIZE(B), %xmm10
  1105. movhpd 4 * SIZE(B), %xmm10
  1106. movlpd 5 * SIZE(B), %xmm11
  1107. movhpd 5 * SIZE(B), %xmm11
  1108. movlpd 0 * SIZE(B), %xmm12
  1109. movhpd 0 * SIZE(B), %xmm12
  1110. movlpd 1 * SIZE(B), %xmm13
  1111. movhpd 1 * SIZE(B), %xmm13
  1112. pshufd $0x4e, %xmm3, %xmm2
  1113. xorpd %xmm15, %xmm2
  1114. mulpd %xmm8, %xmm3
  1115. mulpd %xmm9, %xmm2
  1116. addpd %xmm2, %xmm3
  1117. movapd %xmm3, %xmm0
  1118. pshufd $0x4e, %xmm3, %xmm4
  1119. xorpd %xmm15, %xmm4
  1120. mulpd %xmm10, %xmm0
  1121. mulpd %xmm11, %xmm4
  1122. subpd %xmm0, %xmm1
  1123. subpd %xmm4, %xmm1
  1124. pshufd $0x4e, %xmm1, %xmm0
  1125. xorpd %xmm15, %xmm0
  1126. mulpd %xmm12, %xmm1
  1127. mulpd %xmm13, %xmm0
  1128. addpd %xmm0, %xmm1
  1129. #endif
  1130. #ifdef LN
  1131. subq $2 * SIZE, CO1
  1132. subq $2 * SIZE, CO2
  1133. #endif
  1134. movsd %xmm1, 0 * SIZE(CO1)
  1135. movhpd %xmm1, 1 * SIZE(CO1)
  1136. movsd %xmm3, 0 * SIZE(CO2)
  1137. movhpd %xmm3, 1 * SIZE(CO2)
  1138. #if defined(LN) || defined(LT)
  1139. movapd %xmm1, 0 * SIZE(B)
  1140. movapd %xmm3, 2 * SIZE(B)
  1141. movlpd %xmm1, 0 * SIZE(BO)
  1142. movlpd %xmm1, 1 * SIZE(BO)
  1143. movhpd %xmm1, 2 * SIZE(BO)
  1144. movhpd %xmm1, 3 * SIZE(BO)
  1145. movlpd %xmm3, 4 * SIZE(BO)
  1146. movlpd %xmm3, 5 * SIZE(BO)
  1147. movhpd %xmm3, 6 * SIZE(BO)
  1148. movhpd %xmm3, 7 * SIZE(BO)
  1149. #else
  1150. movapd %xmm1, 0 * SIZE(AO)
  1151. movapd %xmm3, 2 * SIZE(AO)
  1152. #endif
  1153. #ifndef LN
  1154. addq $2 * SIZE, CO1
  1155. addq $2 * SIZE, CO2
  1156. #endif
  1157. #if defined(LT) || defined(RN)
  1158. movq K, %rax
  1159. subq KK, %rax
  1160. salq $ZBASE_SHIFT, %rax
  1161. leaq (AO, %rax, 1), AO
  1162. #ifdef LT
  1163. addq $4 * SIZE, B
  1164. #endif
  1165. #endif
  1166. #ifdef LN
  1167. subq $1, KK
  1168. movq BORIG, B
  1169. #endif
  1170. #ifdef LT
  1171. addq $1, KK
  1172. #endif
  1173. #ifdef RT
  1174. movq K, %rax
  1175. movq BORIG, B
  1176. salq $0 + ZBASE_SHIFT, %rax
  1177. addq %rax, AORIG
  1178. #endif
  1179. ALIGN_4
  1180. .L99:
  1181. #ifdef LN
  1182. leaq (, K, SIZE), %rax
  1183. leaq (B, %rax, 4), B
  1184. #endif
  1185. #if defined(LT) || defined(RN)
  1186. movq K, %rax
  1187. subq KK, %rax
  1188. leaq (,%rax, SIZE), %rax
  1189. leaq (B, %rax, 2 * COMPSIZE), B
  1190. #endif
  1191. #ifdef RN
  1192. addq $2, KK
  1193. #endif
  1194. #ifdef RT
  1195. subq $2, KK
  1196. #endif
  1197. decq J # j --
  1198. jg .L01
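# N is odd: handle the one remaining column with the same structure as above.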
  1199. .L100:
  1200. testq $1, N
  1201. jle .L999
  1202. .L101:
  1203. #ifdef LN
  1204. movq OFFSET, %rax
  1205. addq M, %rax
  1206. movq %rax, KK
  1207. #endif
  1208. /* Copying to Sub Buffer */
  1209. leaq BUFFER, BO
  1210. #ifdef RT
  1211. movq K, %rax
  1212. salq $0 + ZBASE_SHIFT, %rax
  1213. subq %rax, B
  1214. #endif
  1215. #if defined(LN) || defined(RT)
  1216. movq KK, %rax
  1217. movq B, BORIG
  1218. salq $ZBASE_SHIFT, %rax
  1219. leaq (B, %rax, 1), B
  1220. leaq (BO, %rax, 2), BO
  1221. #endif
  1222. #if defined(LT)
  1223. movq OFFSET, %rax
  1224. movq %rax, KK
  1225. #endif
  1226. #if defined(LT) || defined(RN)
  1227. movq KK, %rax
  1228. #else
  1229. movq K, %rax
  1230. subq KK, %rax
  1231. #endif
  1232. sarq $2, %rax
  1233. jle .L103
  1234. ALIGN_4
  1235. .L102:
  1236. movlpd 0 * SIZE(B), %xmm0
  1237. movlpd 1 * SIZE(B), %xmm1
  1238. movlpd 2 * SIZE(B), %xmm2
  1239. movlpd 3 * SIZE(B), %xmm3
  1240. movlpd 4 * SIZE(B), %xmm4
  1241. movlpd 5 * SIZE(B), %xmm5
  1242. movlpd 6 * SIZE(B), %xmm6
  1243. movlpd 7 * SIZE(B), %xmm7
  1244. movlpd %xmm0, 0 * SIZE(BO)
  1245. movlpd %xmm0, 1 * SIZE(BO)
  1246. movlpd %xmm1, 2 * SIZE(BO)
  1247. movlpd %xmm1, 3 * SIZE(BO)
  1248. movlpd %xmm2, 4 * SIZE(BO)
  1249. movlpd %xmm2, 5 * SIZE(BO)
  1250. movlpd %xmm3, 6 * SIZE(BO)
  1251. movlpd %xmm3, 7 * SIZE(BO)
  1252. movlpd %xmm4, 8 * SIZE(BO)
  1253. movlpd %xmm4, 9 * SIZE(BO)
  1254. movlpd %xmm5, 10 * SIZE(BO)
  1255. movlpd %xmm5, 11 * SIZE(BO)
  1256. movlpd %xmm6, 12 * SIZE(BO)
  1257. movlpd %xmm6, 13 * SIZE(BO)
  1258. movlpd %xmm7, 14 * SIZE(BO)
  1259. movlpd %xmm7, 15 * SIZE(BO)
  1260. subq $-16 * SIZE, BO
  1261. addq $ 8 * SIZE, B
  1262. decq %rax
  1263. jne .L102
  1264. ALIGN_4
  1265. .L103:
  1266. #if defined(LT) || defined(RN)
  1267. movq KK, %rax
  1268. #else
  1269. movq K, %rax
  1270. subq KK, %rax
  1271. #endif
  1272. andq $3, %rax
  1273. BRANCH
  1274. jle .L105
  1275. ALIGN_4
  1276. .L104:
  1277. movlpd 0 * SIZE(B), %xmm0
  1278. movlpd 1 * SIZE(B), %xmm1
  1279. movlpd %xmm0, 0 * SIZE(BO)
  1280. movlpd %xmm0, 1 * SIZE(BO)
  1281. movlpd %xmm1, 2 * SIZE(BO)
  1282. movlpd %xmm1, 3 * SIZE(BO)
  1283. addq $4 * SIZE, BO
  1284. addq $2 * SIZE, B
  1285. decq %rax
  1286. jne .L104
  1287. ALIGN_4
  1288. .L105:
  1289. #if defined(LT) || defined(RN)
  1290. movq A, AO
  1291. #else
  1292. movq A, AORIG
  1293. #endif
  1294. #ifdef RT
  1295. subq LDC, C
  1296. #endif
  1297. movq C, CO1
  1298. #ifndef RT
  1299. addq LDC, C
  1300. #endif
  1301. movq M, I
  1302. sarq $1, I # i = (m >> 1)
  1303. jle .L130
  1304. ALIGN_4
  1305. .L110:
  1306. #ifdef LN
  1307. movq K, %rax
  1308. salq $1 + ZBASE_SHIFT, %rax
  1309. subq %rax, AORIG
  1310. #endif
  1311. #if defined(LN) || defined(RT)
  1312. movq KK, %rax
  1313. movq AORIG, AO
  1314. salq $ZBASE_SHIFT, %rax
  1315. leaq (AO, %rax, 2), AO
  1316. #endif
  1317. leaq BUFFER, BO
  1318. #if defined(LN) || defined(RT)
  1319. movq KK, %rax
  1320. salq $0 + ZBASE_SHIFT, %rax
  1321. leaq (BO, %rax, 2), BO
  1322. #endif
  1323. pxor %xmm0, %xmm0
  1324. pxor %xmm1, %xmm1
  1325. pxor %xmm4, %xmm4
  1326. pxor %xmm5, %xmm5
  1327. PREFETCHW 4 * SIZE(CO1)
  1328. #if defined(LT) || defined(RN)
  1329. movq KK, %rax
  1330. #else
  1331. movq K, %rax
  1332. subq KK, %rax
  1333. #endif
  1334. sarq $2, %rax
  1335. je .L112
  1336. .L111:
  1337. movapd 0 * SIZE(AO), %xmm8
  1338. movapd 0 * SIZE(BO), %xmm9
  1339. mulpd %xmm8, %xmm9
  1340. addpd %xmm9, %xmm0
  1341. mulpd 2 * SIZE(BO), %xmm8
  1342. addpd %xmm8, %xmm1
  1343. movapd 2 * SIZE(AO), %xmm8
  1344. movapd 0 * SIZE(BO), %xmm9
  1345. mulpd %xmm8, %xmm9
  1346. addpd %xmm9, %xmm4
  1347. mulpd 2 * SIZE(BO), %xmm8
  1348. addpd %xmm8, %xmm5
  1349. movapd 4 * SIZE(AO), %xmm8
  1350. movapd 4 * SIZE(BO), %xmm9
  1351. mulpd %xmm8, %xmm9
  1352. addpd %xmm9, %xmm0
  1353. mulpd 6 * SIZE(BO), %xmm8
  1354. addpd %xmm8, %xmm1
  1355. movapd 6 * SIZE(AO), %xmm8
  1356. movapd 4 * SIZE(BO), %xmm9
  1357. mulpd %xmm8, %xmm9
  1358. addpd %xmm9, %xmm4
  1359. mulpd 6 * SIZE(BO), %xmm8
  1360. addpd %xmm8, %xmm5
  1361. movapd 8 * SIZE(AO), %xmm8
  1362. movapd 8 * SIZE(BO), %xmm9
  1363. mulpd %xmm8, %xmm9
  1364. addpd %xmm9, %xmm0
  1365. mulpd 10 * SIZE(BO), %xmm8
  1366. addpd %xmm8, %xmm1
  1367. movapd 10 * SIZE(AO), %xmm8
  1368. movapd 8 * SIZE(BO), %xmm9
  1369. mulpd %xmm8, %xmm9
  1370. addpd %xmm9, %xmm4
  1371. mulpd 10 * SIZE(BO), %xmm8
  1372. addpd %xmm8, %xmm5
  1373. movapd 12 * SIZE(AO), %xmm8
  1374. movapd 12 * SIZE(BO), %xmm9
  1375. mulpd %xmm8, %xmm9
  1376. addpd %xmm9, %xmm0
  1377. mulpd 14 * SIZE(BO), %xmm8
  1378. addpd %xmm8, %xmm1
  1379. movapd 14 * SIZE(AO), %xmm8
  1380. movapd 12 * SIZE(BO), %xmm9
  1381. mulpd %xmm8, %xmm9
  1382. addpd %xmm9, %xmm4
  1383. mulpd 14 * SIZE(BO), %xmm8
  1384. addpd %xmm8, %xmm5
  1385. addq $16 * SIZE, AO
  1386. addq $16 * SIZE, BO
  1387. decq %rax
  1388. jne .L111
  1389. ALIGN_4
  1390. .L112:
  1391. #if defined(LT) || defined(RN)
  1392. movq KK, %rax
  1393. #else
  1394. movq K, %rax
  1395. subq KK, %rax
  1396. #endif
  1397. movapd POSINV, %xmm15
  1398. andq $3, %rax # if (k & 3)
  1399. BRANCH
  1400. jle .L114
  1401. .L113:
  1402. movapd 0 * SIZE(AO), %xmm8
  1403. movapd 0 * SIZE(BO), %xmm9
  1404. mulpd %xmm8, %xmm9
  1405. addpd %xmm9, %xmm0
  1406. mulpd 2 * SIZE(BO), %xmm8
  1407. addpd %xmm8, %xmm1
  1408. movapd 2 * SIZE(AO), %xmm8
  1409. movapd 0 * SIZE(BO), %xmm9
  1410. mulpd %xmm8, %xmm9
  1411. addpd %xmm9, %xmm4
  1412. mulpd 2 * SIZE(BO), %xmm8
  1413. addpd %xmm8, %xmm5
  1414. addq $4 * SIZE, AO # aoffset += 4
  1415. addq $4 * SIZE, BO # boffset1 += 4
  1416. decq %rax
  1417. jg .L113
  1418. ALIGN_4
  1419. .L114:
  1420. #if defined(LN) || defined(RT)
  1421. movq KK, %rax
  1422. #ifdef LN
  1423. subq $2, %rax
  1424. #else
  1425. subq $1, %rax
  1426. #endif
  1427. movq AORIG, AO
  1428. movq BORIG, B
  1429. leaq BUFFER, BO
  1430. salq $ZBASE_SHIFT, %rax
  1431. leaq (AO, %rax, 2), AO
  1432. leaq (B, %rax, 1), B
  1433. leaq (BO, %rax, 2), BO
  1434. #endif
  1435. SHUFPD_1 %xmm1, %xmm1
  1436. SHUFPD_1 %xmm5, %xmm5
  1437. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1438. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1439. xorpd %xmm15, %xmm1
  1440. xorpd %xmm15, %xmm5
  1441. #else
  1442. xorpd %xmm15, %xmm0
  1443. xorpd %xmm15, %xmm4
  1444. #endif
  1445. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1446. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1447. subpd %xmm1, %xmm0
  1448. subpd %xmm5, %xmm4
  1449. #else
  1450. addpd %xmm1, %xmm0
  1451. addpd %xmm5, %xmm4
  1452. #endif
  1453. #if defined(LN) || defined(LT)
  1454. movapd 0 * SIZE(B), %xmm1
  1455. movapd 2 * SIZE(B), %xmm5
  1456. subpd %xmm0, %xmm1
  1457. subpd %xmm4, %xmm5
  1458. #else
  1459. movapd 0 * SIZE(AO), %xmm1
  1460. movapd 2 * SIZE(AO), %xmm5
  1461. subpd %xmm0, %xmm1
  1462. subpd %xmm4, %xmm5
  1463. #endif
  1464. #ifndef CONJ
  1465. SHUFPD_1 %xmm15, %xmm15
  1466. #endif
  1467. #ifdef LN
  1468. movlpd 6 * SIZE(AO), %xmm8
  1469. movhpd 6 * SIZE(AO), %xmm8
  1470. movlpd 7 * SIZE(AO), %xmm9
  1471. movhpd 7 * SIZE(AO), %xmm9
  1472. movlpd 4 * SIZE(AO), %xmm10
  1473. movhpd 4 * SIZE(AO), %xmm10
  1474. movlpd 5 * SIZE(AO), %xmm11
  1475. movhpd 5 * SIZE(AO), %xmm11
  1476. movlpd 0 * SIZE(AO), %xmm12
  1477. movhpd 0 * SIZE(AO), %xmm12
  1478. movlpd 1 * SIZE(AO), %xmm13
  1479. movhpd 1 * SIZE(AO), %xmm13
  1480. pshufd $0x4e, %xmm5, %xmm4
  1481. xorpd %xmm15, %xmm4
  1482. mulpd %xmm8, %xmm5
  1483. mulpd %xmm9, %xmm4
  1484. addpd %xmm4, %xmm5
  1485. movapd %xmm5, %xmm0
  1486. pshufd $0x4e, %xmm5, %xmm4
  1487. xorpd %xmm15, %xmm4
  1488. mulpd %xmm10, %xmm0
  1489. mulpd %xmm11, %xmm4
  1490. subpd %xmm0, %xmm1
  1491. subpd %xmm4, %xmm1
  1492. pshufd $0x4e, %xmm1, %xmm0
  1493. xorpd %xmm15, %xmm0
  1494. mulpd %xmm12, %xmm1
  1495. mulpd %xmm13, %xmm0
  1496. addpd %xmm0, %xmm1
  1497. #endif
  1498. #ifdef LT
  1499. movlpd 0 * SIZE(AO), %xmm8
  1500. movhpd 0 * SIZE(AO), %xmm8
  1501. movlpd 1 * SIZE(AO), %xmm9
  1502. movhpd 1 * SIZE(AO), %xmm9
  1503. movlpd 2 * SIZE(AO), %xmm10
  1504. movhpd 2 * SIZE(AO), %xmm10
  1505. movlpd 3 * SIZE(AO), %xmm11
  1506. movhpd 3 * SIZE(AO), %xmm11
  1507. movlpd 6 * SIZE(AO), %xmm12
  1508. movhpd 6 * SIZE(AO), %xmm12
  1509. movlpd 7 * SIZE(AO), %xmm13
  1510. movhpd 7 * SIZE(AO), %xmm13
  1511. pshufd $0x4e, %xmm1, %xmm0
  1512. xorpd %xmm15, %xmm0
  1513. mulpd %xmm8, %xmm1
  1514. mulpd %xmm9, %xmm0
  1515. addpd %xmm0, %xmm1
  1516. movapd %xmm1, %xmm0
  1517. pshufd $0x4e, %xmm1, %xmm4
  1518. xorpd %xmm15, %xmm4
  1519. mulpd %xmm10, %xmm0
  1520. mulpd %xmm11, %xmm4
  1521. subpd %xmm0, %xmm5
  1522. subpd %xmm4, %xmm5
  1523. pshufd $0x4e, %xmm5, %xmm4
  1524. xorpd %xmm15, %xmm4
  1525. mulpd %xmm12, %xmm5
  1526. mulpd %xmm13, %xmm4
  1527. addpd %xmm4, %xmm5
  1528. #endif
  1529. #ifdef RN
  1530. movlpd 0 * SIZE(B), %xmm8
  1531. movhpd 0 * SIZE(B), %xmm8
  1532. movlpd 1 * SIZE(B), %xmm9
  1533. movhpd 1 * SIZE(B), %xmm9
  1534. pshufd $0x4e, %xmm1, %xmm0
  1535. pshufd $0x4e, %xmm5, %xmm4
  1536. xorpd %xmm15, %xmm0
  1537. xorpd %xmm15, %xmm4
  1538. mulpd %xmm8, %xmm1
  1539. mulpd %xmm9, %xmm0
  1540. mulpd %xmm8, %xmm5
  1541. mulpd %xmm9, %xmm4
  1542. addpd %xmm0, %xmm1
  1543. addpd %xmm4, %xmm5
  1544. #endif
  1545. #ifdef RT
  1546. movlpd 0 * SIZE(B), %xmm8
  1547. movhpd 0 * SIZE(B), %xmm8
  1548. movlpd 1 * SIZE(B), %xmm9
  1549. movhpd 1 * SIZE(B), %xmm9
  1550. pshufd $0x4e, %xmm1, %xmm0
  1551. pshufd $0x4e, %xmm5, %xmm4
  1552. xorpd %xmm15, %xmm0
  1553. xorpd %xmm15, %xmm4
  1554. mulpd %xmm8, %xmm1
  1555. mulpd %xmm9, %xmm0
  1556. mulpd %xmm8, %xmm5
  1557. mulpd %xmm9, %xmm4
  1558. addpd %xmm0, %xmm1
  1559. addpd %xmm4, %xmm5
  1560. #endif
  1561. #ifdef LN
  1562. subq $4 * SIZE, CO1
  1563. #endif
  1564. movsd %xmm1, 0 * SIZE(CO1)
  1565. movhpd %xmm1, 1 * SIZE(CO1)
  1566. movsd %xmm5, 2 * SIZE(CO1)
  1567. movhpd %xmm5, 3 * SIZE(CO1)
  1568. #if defined(LN) || defined(LT)
  1569. movapd %xmm1, 0 * SIZE(B)
  1570. movapd %xmm5, 2 * SIZE(B)
  1571. movlpd %xmm1, 0 * SIZE(BO)
  1572. movlpd %xmm1, 1 * SIZE(BO)
  1573. movhpd %xmm1, 2 * SIZE(BO)
  1574. movhpd %xmm1, 3 * SIZE(BO)
  1575. movlpd %xmm5, 4 * SIZE(BO)
  1576. movlpd %xmm5, 5 * SIZE(BO)
  1577. movhpd %xmm5, 6 * SIZE(BO)
  1578. movhpd %xmm5, 7 * SIZE(BO)
  1579. #else
  1580. movapd %xmm1, 0 * SIZE(AO)
  1581. movapd %xmm5, 2 * SIZE(AO)
  1582. #endif
  1583. #ifndef LN
  1584. addq $4 * SIZE, CO1
  1585. #endif
  1586. #if defined(LT) || defined(RN)
  1587. movq K, %rax
  1588. subq KK, %rax
  1589. salq $ZBASE_SHIFT, %rax
  1590. leaq (AO, %rax, 2), AO
  1591. #ifdef LT
  1592. addq $4 * SIZE, B
  1593. #endif
  1594. #endif
  1595. #ifdef LN
  1596. subq $2, KK
  1597. movq BORIG, B
  1598. #endif
  1599. #ifdef LT
  1600. addq $2, KK
  1601. #endif
  1602. #ifdef RT
  1603. movq K, %rax
  1604. movq BORIG, B
  1605. salq $1 + ZBASE_SHIFT, %rax
  1606. addq %rax, AORIG
  1607. #endif
  1608. decq I # i --
  1609. jg .L110
  1610. ALIGN_4
  1611. .L130:
  1612. testq $1, M
  1613. jle .L199
  1614. ALIGN_4
  1615. .L140:
  1616. #ifdef LN
  1617. movq K, %rax
  1618. salq $0 + ZBASE_SHIFT, %rax
  1619. subq %rax, AORIG
  1620. #endif
  1621. #if defined(LN) || defined(RT)
  1622. movq KK, %rax
  1623. movq AORIG, AO
  1624. salq $ZBASE_SHIFT, %rax
  1625. leaq (AO, %rax, 1), AO
  1626. #endif
  1627. leaq BUFFER, BO
  1628. #if defined(LN) || defined(RT)
  1629. movq KK, %rax
  1630. salq $0 + ZBASE_SHIFT, %rax
  1631. leaq (BO, %rax, 2), BO
  1632. #endif
  1633. pxor %xmm0, %xmm0
  1634. pxor %xmm1, %xmm1
  1635. pxor %xmm2, %xmm2
  1636. pxor %xmm3, %xmm3
  1637. #if defined(LT) || defined(RN)
  1638. movq KK, %rax
  1639. #else
  1640. movq K, %rax
  1641. subq KK, %rax
  1642. #endif
  1643. sarq $2, %rax
  1644. je .L142
  1645. .L141:
  1646. movapd 0 * SIZE(AO), %xmm8
  1647. movapd 0 * SIZE(BO), %xmm9
  1648. mulpd %xmm8, %xmm9
  1649. addpd %xmm9, %xmm0
  1650. mulpd 2 * SIZE(BO), %xmm8
  1651. addpd %xmm8, %xmm1
  1652. movapd 2 * SIZE(AO), %xmm8
  1653. movapd 4 * SIZE(BO), %xmm9
  1654. mulpd %xmm8, %xmm9
  1655. addpd %xmm9, %xmm2
  1656. mulpd 6 * SIZE(BO), %xmm8
  1657. addpd %xmm8, %xmm3
  1658. movapd 4 * SIZE(AO), %xmm8
  1659. movapd 8 * SIZE(BO), %xmm9
  1660. mulpd %xmm8, %xmm9
  1661. addpd %xmm9, %xmm0
  1662. mulpd 10 * SIZE(BO), %xmm8
  1663. addpd %xmm8, %xmm1
  1664. movapd 6 * SIZE(AO), %xmm8
  1665. movapd 12 * SIZE(BO), %xmm9
  1666. mulpd %xmm8, %xmm9
  1667. addpd %xmm9, %xmm2
  1668. mulpd 14 * SIZE(BO), %xmm8
  1669. addpd %xmm8, %xmm3
  1670. addq $8 * SIZE, AO
  1671. addq $16 * SIZE, BO
  1672. decq %rax
  1673. jne .L141
  1674. .L142:
  1675. addpd %xmm2, %xmm0
  1676. addpd %xmm3, %xmm1
  1677. movapd POSINV, %xmm15
  1678. #if defined(LT) || defined(RN)
  1679. movq KK, %rax
  1680. #else
  1681. movq K, %rax
  1682. subq KK, %rax
  1683. #endif
  1684. andq $3, %rax # if (k & 3)
  1685. BRANCH
  1686. jle .L144
  1687. .L143:
  1688. movapd 0 * SIZE(AO), %xmm8
  1689. movapd 0 * SIZE(BO), %xmm9
  1690. mulpd %xmm8, %xmm9
  1691. addpd %xmm9, %xmm0
  1692. mulpd 2 * SIZE(BO), %xmm8
  1693. addpd %xmm8, %xmm1
  1694. addq $2 * SIZE, AO # aoffset += 2
  1695. addq $4 * SIZE, BO # boffset1 += 4
  1696. decq %rax
  1697. jg .L143
  1698. ALIGN_4
  1699. .L144:
  1700. #if defined(LN) || defined(RT)
  1701. movq KK, %rax
  1702. #ifdef LN
  1703. subq $1, %rax
  1704. #else
  1705. subq $1, %rax
  1706. #endif
  1707. movq AORIG, AO
  1708. movq BORIG, B
  1709. leaq BUFFER, BO
  1710. salq $ZBASE_SHIFT, %rax
  1711. leaq (AO, %rax, 1), AO
  1712. leaq (B, %rax, 1), B
  1713. leaq (BO, %rax, 2), BO
  1714. #endif
  1715. SHUFPD_1 %xmm1, %xmm1
  1716. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1717. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1718. xorpd %xmm15, %xmm1
  1719. #else
  1720. xorpd %xmm15, %xmm0
  1721. #endif
  1722. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1723. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1724. subpd %xmm1, %xmm0
  1725. #else
  1726. addpd %xmm1, %xmm0
  1727. #endif
  1728. #if defined(LN) || defined(LT)
  1729. movapd 0 * SIZE(B), %xmm1
  1730. subpd %xmm0, %xmm1
  1731. #else
  1732. movapd 0 * SIZE(AO), %xmm1
  1733. subpd %xmm0, %xmm1
  1734. #endif
  1735. #ifndef CONJ
  1736. SHUFPD_1 %xmm15, %xmm15
  1737. #endif
  1738. #ifdef LN
  1739. movlpd 0 * SIZE(AO), %xmm8
  1740. movhpd 0 * SIZE(AO), %xmm8
  1741. movlpd 1 * SIZE(AO), %xmm9
  1742. movhpd 1 * SIZE(AO), %xmm9
  1743. pshufd $0x4e, %xmm1, %xmm0
  1744. xorpd %xmm15, %xmm0
  1745. mulpd %xmm8, %xmm1
  1746. mulpd %xmm9, %xmm0
  1747. addpd %xmm0, %xmm1
  1748. #endif
  1749. #ifdef LT
  1750. movlpd 0 * SIZE(AO), %xmm8
  1751. movhpd 0 * SIZE(AO), %xmm8
  1752. movlpd 1 * SIZE(AO), %xmm9
  1753. movhpd 1 * SIZE(AO), %xmm9
  1754. pshufd $0x4e, %xmm1, %xmm0
  1755. xorpd %xmm15, %xmm0
  1756. mulpd %xmm8, %xmm1
  1757. mulpd %xmm9, %xmm0
  1758. addpd %xmm0, %xmm1
  1759. #endif
  1760. #ifdef RN
  1761. movlpd 0 * SIZE(B), %xmm8
  1762. movhpd 0 * SIZE(B), %xmm8
  1763. movlpd 1 * SIZE(B), %xmm9
  1764. movhpd 1 * SIZE(B), %xmm9
  1765. pshufd $0x4e, %xmm1, %xmm0
  1766. xorpd %xmm15, %xmm0
  1767. mulpd %xmm8, %xmm1
  1768. mulpd %xmm9, %xmm0
  1769. addpd %xmm0, %xmm1
  1770. #endif
  1771. #ifdef RT
  1772. movlpd 0 * SIZE(B), %xmm8
  1773. movhpd 0 * SIZE(B), %xmm8
  1774. movlpd 1 * SIZE(B), %xmm9
  1775. movhpd 1 * SIZE(B), %xmm9
  1776. pshufd $0x4e, %xmm1, %xmm0
  1777. xorpd %xmm15, %xmm0
  1778. mulpd %xmm8, %xmm1
  1779. mulpd %xmm9, %xmm0
  1780. addpd %xmm0, %xmm1
  1781. #endif
  1782. #ifdef LN
  1783. subq $2 * SIZE, CO1
  1784. #endif
  1785. movsd %xmm1, 0 * SIZE(CO1)
  1786. movhpd %xmm1, 1 * SIZE(CO1)
  1787. #if defined(LN) || defined(LT)
  1788. movapd %xmm1, 0 * SIZE(B)
  1789. movlpd %xmm1, 0 * SIZE(BO)
  1790. movlpd %xmm1, 1 * SIZE(BO)
  1791. movhpd %xmm1, 2 * SIZE(BO)
  1792. movhpd %xmm1, 3 * SIZE(BO)
  1793. #else
  1794. movapd %xmm1, 0 * SIZE(AO)
  1795. #endif
  1796. #ifndef LN
  1797. addq $2 * SIZE, CO1
  1798. #endif
  1799. #if defined(LT) || defined(RN)
  1800. movq K, %rax
  1801. subq KK, %rax
  1802. salq $ZBASE_SHIFT, %rax
  1803. leaq (AO, %rax, 1), AO
  1804. #ifdef LT
  1805. addq $2 * SIZE, B
  1806. #endif
  1807. #endif
  1808. #ifdef LN
  1809. subq $1, KK
  1810. movq BORIG, B
  1811. #endif
  1812. #ifdef LT
  1813. addq $1, KK
  1814. #endif
  1815. #ifdef RT
  1816. movq K, %rax
  1817. movq BORIG, B
  1818. salq $0 + ZBASE_SHIFT, %rax
  1819. addq %rax, AORIG
  1820. #endif
  1821. ALIGN_4
  1822. .L199:
  1823. #ifdef LN
  1824. leaq (, K, SIZE), %rax
  1825. leaq (B, %rax, 2), B
  1826. #endif
  1827. #if defined(LT) || defined(RN)
  1828. movq K, %rax
  1829. subq KK, %rax
  1830. leaq (,%rax, SIZE), %rax
  1831. leaq (B, %rax, 1 * COMPSIZE), B
  1832. #endif
  1833. #ifdef RN
  1834. addq $1, KK
  1835. #endif
  1836. #ifdef RT
  1837. subq $1, KK
  1838. #endif
  1839. ALIGN_4
  1840. .L999:
  1841. movq %rbx, %rsp
  1842. movq 0(%rsp), %rbx
  1843. movq 8(%rsp), %rbp
  1844. movq 16(%rsp), %r12
  1845. movq 24(%rsp), %r13
  1846. movq 32(%rsp), %r14
  1847. movq 40(%rsp), %r15
  1848. #ifdef WINDOWS_ABI
  1849. movq 48(%rsp), %rdi
  1850. movq 56(%rsp), %rsi
  1851. movups 64(%rsp), %xmm6
  1852. movups 80(%rsp), %xmm7
  1853. movups 96(%rsp), %xmm8
  1854. movups 112(%rsp), %xmm9
  1855. movups 128(%rsp), %xmm10
  1856. movups 144(%rsp), %xmm11
  1857. movups 160(%rsp), %xmm12
  1858. movups 176(%rsp), %xmm13
  1859. movups 192(%rsp), %xmm14
  1860. movups 208(%rsp), %xmm15
  1861. #endif
  1862. addq $STACKSIZE, %rsp
  1863. ret
  1864. EPILOGUE