From 713edf2794abfb65200aeda9872852bc9315ae16 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Wed, 17 Jan 2018 17:06:57 +0100 Subject: [PATCH] ADD comments to code of treelet kernel. --- README.md | 15 +- ...run_treeletkernel_acyclic-checkpoint.ipynb | 4 +- notebooks/run_treeletkernel_acyclic.ipynb | 4 +- .../__pycache__/treeletKernel.cpython-35.pyc | Bin 10032 -> 13388 bytes pygraph/kernels/deltaKernel.py | 4 +- pygraph/kernels/marginalizedKernel.py | 16 +- pygraph/kernels/pathKernel.py | 8 +- pygraph/kernels/spKernel.py | 4 +- pygraph/kernels/treeletKernel.py | 578 ++++++++++-------- pygraph/kernels/weisfeilerLehmanKernel.py | 14 +- 10 files changed, 380 insertions(+), 267 deletions(-) diff --git a/README.md b/README.md index bee38d2..bd582cc 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,20 @@ For predition we randomly divide the data in train and test subset, where 90% of * The targets of training data are normalized before calculating *path kernel* and *treelet kernel*. * See detail results in [results.md](pygraph/kernels/results.md). +## References +[1] K. M. Borgwardt and H.-P. Kriegel. Shortest-path kernels on graphs. In Proceedings of the International Conference on Data Mining, pages 74-81, 2005. + +[2] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003. + +[3] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). + +[4] N. Shervashidze, P. Schweitzer, E. J. van Leeuwen, K. Mehlhorn, and K. M. Borgwardt. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research, 12:2539-2561, 2011. + +[5] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. + ## Updates +### 2018.01.17 +* ADD comments to code of treelet kernel. - linlin ### 2018.01.16 * ADD *treelet kernel* and its result on dataset Asyclic. - linlin * MOD the way to calculate WL subtree kernel, correct its results. - linlin @@ -55,4 +68,4 @@ For predition we randomly divide the data in train and test subset, where 90% of * ADD *marginalized kernel* and its result. - linlin * ADD list required python packages in file README.md. - linlin ### 2017.11.24 -* ADD *shortest path kernel* and its result. - linlin \ No newline at end of file +* ADD *shortest path kernel* and its result. - linlin diff --git a/notebooks/.ipynb_checkpoints/run_treeletkernel_acyclic-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/run_treeletkernel_acyclic-checkpoint.ipynb index 425930a..c25eb36 100644 --- a/notebooks/.ipynb_checkpoints/run_treeletkernel_acyclic-checkpoint.ipynb +++ b/notebooks/.ipynb_checkpoints/run_treeletkernel_acyclic-checkpoint.ipynb @@ -101,8 +101,8 @@ "\n", "kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)\n", "\n", - "# %lprun -f spkernel \\\n", - "# kernel_train_test(datafile, kernel_file_path, spkernel, kernel_para, normalize = False)" + "# %lprun -f treeletkernel \\\n", + "# kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)" ] }, { diff --git a/notebooks/run_treeletkernel_acyclic.ipynb b/notebooks/run_treeletkernel_acyclic.ipynb index 425930a..c25eb36 100644 --- a/notebooks/run_treeletkernel_acyclic.ipynb +++ b/notebooks/run_treeletkernel_acyclic.ipynb @@ -101,8 +101,8 @@ "\n", "kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)\n", "\n", - "# %lprun -f spkernel \\\n", - "# kernel_train_test(datafile, kernel_file_path, spkernel, kernel_para, normalize = False)" + "# %lprun -f treeletkernel \\\n", + "# kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)" ] }, { diff --git a/pygraph/kernels/__pycache__/treeletKernel.cpython-35.pyc b/pygraph/kernels/__pycache__/treeletKernel.cpython-35.pyc index eb8890b9cc3a57dfb77492d34577c7f582ab09ae..7e648db8045e372a4bc081c60e9c16690ebb170a 100644 GIT binary patch literal 13388 zcmeHO-ESO6a_^qmFMgBawY43$5mODdo$>r{{ zXDBUc5(pyuBq2b8ga3daZ+E$;z)!&i2o6Soyclpk9)ihBf*=TlEq7-7v%0Ffs=B(mInvY9`OErZ;rFf!@sA?$$sv6WkN?+rx%f&^5uy;M zr6@?@39)90HJck);+`d>X_^v+6qnkfk`@~oQOM}Jl&ECIMotuRaoY}2=!}Nw5`}Kz zriGgkZdSNC;dTfYwOzvP79R<_(8EYp%CgDK2@HFjWj{8)2vC zu0$mrOSP)I7;M+QFw1Q`_ni_!wDEMA=O%uAPYMwTv1^Nal7?Qlgi1+dT}zDbr9>c! ziPR!RE#ky77%IhHS_C$+_?{4Z8PWJi1Ym5UJ(xbXup#!cqTy2AZng=6>)sT5IT54@ zJnF{jCnv^TTkK*okpWVuq}j7~k)S3yG22XzQxZK;GrRernBQC!^D#=h88NokAp%TS zraA6zhXD2?Kwna4&fSjtfDA%MGk|{NA?WONHfc``lqHB7cxMD0l5#uWQT);p;zqGj z+Nu--&k2<0RlLAiQN{YIv+k*?SMi-CFSzS@RbA>&b~^a*>#d?HZg`LZpR?!U4`**y zomZSU%6{O~mYrLGQB&*hX&_urV4`XsusPD*)pBXo30A!rhobVFQiP|gaX7DsnVN8J zPGhLnAv8)f#bPZTX(-6n%$*;_3{r_n!iYW$t*IUdN1?hW8--iOqr_^anPgNA@Rw?}idU@0WX!L6 z;ILvg8~ziQO@~%Ofc-ma%hMoZ^xyV^Eu{hKsb|=)Zxq3r#(6D1G%;Hn^=-$LE>~*B zAjaW!7BN#d;dn(%J0x(!tFqMSnZn>C3XyOcxRF#&J9Uy)f6_S*9JTx&Xrk%U=s$va z5sF_aN@eIp)_dnlrvp0n52O(%A@<8Vo^!@^mbS{6+;Y`9<2$}rf?e~ebCMo^;+sE+LY4N5}yt{#A!15{ug+W z?dGKt<`g!XY#YohnRWZ2uH_Y=O)c!c6c2bgg`uZq09jrbboQEA2U6m`5X*pqMIbGH zgz?g14$Wz$fLX;#ffkZmz$EW)fu#X0^opWe!Md zH8w<%WzurVO7R$z`fFQ9R`Kbvb;i)V;)g7kY3k3Wsq z3MW(#0#;W5$Y^uQ8-?O>Im^;OKfH!hLY_YT(i`=x9#Vx(3-pu+DN0Q zqg1Tcsz^_<`m1pa@m1$+1uJ!_wo$+GE{#h_bcmfZ2W-&fY^8>U*uR2A)Z;rrT^$1o zc}pF~k9vmE3Dl>XsH2Q{D@G`=fa*9FIbF7^+VhD0tTtb*9>aNZ)YMdzm{y|z$JhYZ6O?x7MNCzFcwNQg_u?f6 zPry2!lJxhy#dRf`X8I7RWEs)_^v5_rW;OY?@$8(!NP&gbd&rG_Z3q2)gj04UOG#S!Z0Nf;30_@S1T{+-pO zvyC}suZDtQFN%k@cz`c}91-K|woso+>SBu(DMqo{Xk5qFuvTOUBf0shoZoy8&#au+ zlLA^Inn9JJZ!o409QpS2y==9kZ>TMOLm2Y$^nJoCX`ydWBb5hF|s5~h#i-wn2Z`ZRzVZW6X;GQWZ4mSR??KJ;=+i4!)X};QoV&?xF z&iik-&U>A<^ZqltG2hzgrog5Tjj$10PYLyfjYOiu7d8Y1)g&mWNyy7c3qQZG z=Qn?b=PDk{o)35DNnqDRh-aJBHedMvA8x7MV8jjfQfrO2TyE3ffLwm~^!MS}L&8Dp3E@6VzJG8e66ZXdcpzi; z0)$q-*hOzBGM0xyZumUJ=|PIgm98V!hNy+6fRIKwjWxr4L3|7ru4BpnMHZ!1K#J=o zxDGV$_7U(HX|xyxD7kqdI}g&=hKX@#gEB52%+5FdhQ$f-H?ypNmSDjjwy?lGDMrDD z-z7Her~Xtl(-0S^yI$RRzBR=oG&K^o&^^r)`Ed?`!YX*lOB)u)At2Ob3Y49hGMePb zXiOmJV*D@V+LASrn>Ta#mn>|U__>8YZ-@4Ct!NMMj2%E53+CfcYt}5Sj znIssn(PmRDi!Dt#A9bOo2MGSRwgk5ji^cB{V$ZY^J9G%K!{30|!zgG&?CE{PavsDo zh_=N3T}QjwHN@t4wSk$KTlg=U*(O(`feil-j&o3Vzt_r{!^9w|okxUwJTYi;){SBx z#v5+hDhRQ-8=N_x6olOYtl3B4;6*APX~17}{t6zw&pg7iJ-6^Ct;qY8_WOsDJt%eu z#hO;y{k_g6dJU1;AV-#U2BvE$>ILVZIVCZDMx6&qeG`h1@tlx+tT=`*c}(U zqmVeH#>6PGpw^4#MzF~Nq7};k;A|2AdQ;a?27&$%+Un&50*~!aeF+Bo5{y{r-6YNN ztj3F6v8Ua%VBNb7XZ3de+|~;AzBj$9r|2Alx9w$w84&;9_9^sQtW=`7tWS~GqQmjZ zCy$&_9J=KUqIp0-5qpM4fOW{3hoxxvib2U7IsE!qpyhnyn;Z*7p_ubgVzGibam%IV zX~1hW6AK-)pU}d93E?9Y?oh5(i)!0Ul|lesApl?8pz}zQ74MFRT8dXYbsWh7C-yb zozH#(p}IcdTt_h8d1JzPw_L$-2jp%J2QGJOPSv}sWzmn1zED|O^)_nd>T*pXY*jAd ztm7?BnDaKMU#XS@u*7-8BVtiGePL?axmCO4OuafYb9U;*{43KJW?nvb@sh?llou35 zn;y~0M=2S0qyi2*-arD2Ib!n*KcKK985m^QHAR?_PGqrPm9$ZX&knEhBREvtfLEK* zC=O{qf?BtZK?S$*_#@3Rx=m=)bko5-I-4-@zc5$z$}6i&HPx)rAvOyB{SHAh0n3S& zs9(EaWg>2gh+@X|Rv;+RI4M$a6w90Y0*uKqPCj;wLVAXdU%qSJQuwfCjBmqRbq-M4 zvGv}!XRDe--zau_p3+iXh=<|WC{0{qghKk_{{_d=81=j37?j}K3gQ!TEHOeM{UUKJ zlrM$y<Vo!%sq4ykDc3~LM>>L>elu(>2O0QVr$F|r!r%xv=M${ZpXv%Gl z^A|cIJ)deOry)orJYm*Y^L?uMbJY2y3LRohg5(P^fn*b60xu$COxp9&Nebi7ky;MZ zf;JAo>7T=lQK2(JveC$=)DYl?G2RXJD(4NVI6fgW*0fcBg`BZfp)9@=qe^o@)^~>L zui-_e_a*UL7NMHt^EkyU$<)#h++IwFz}BC=R0Q+hm7562&~+V)ZTu~09xI?`aA=p^ zhAv2Hjif>KNe3a`#P+L>S@L}vb#>*RP{BB7R4j4OQDiY>yttcYlUgD{PzUL(wSjR0ke)C}s16XS4@ObBnJ;yYA+n7LD|Yfp*zvbFLXxG7AuK#z{k{uthp&u&gqcFNt~`L6juJC$!1LLJMWppfU8WZ^hpqe-%CZ}F z3aJG={wauu7|qB`re6-q5jifOvrkL>Swj+kcBdS`1)p)*FQ1Xac=zD<@X`!g_nRJ+ z(tU;jVaPfuPcXC>tbXf~yu{E5;*dOarq4O4|Awq<_GNii z4q7Qo5x+_#r1T+%Qkov0Sgf5R+8vG1i7}iBd~e^^rxBJ|j?US@B zlg@8A*cNXozkJ8TWyCsqEJKO|+*ZS`WR-W)^?If3nXtx_Ts`_`0ThiqMNssR(MTFg zR(}A5q3~8b6<-IWNQ|~TwDG1(e`F^#g?DjR(MvFfiPsxi!<6>$OeNqkP6xK_?XECV zOQ1)ZNr=#IjN==oP*bRItE7y8gFAuX0U^j&7`Z=U3d)0i`nb%98=_0wn4}0I8&!=8 zyTS3|N}`m-!#??INEmNvmb|4P+GLZQ30y(5afJ3PyN9?t2ag+Pcf*1HAINe7_ z&kO}kFe*+*0iLc7-hWc{hao7M0O-@J>i@5`GFC_uG!n4eBw(ktfYIJ;-0UHthxE1h zlsDR}*tdA|rM)W_aO17T62DKI>Llef6z<%eYW1?(>5~Q@v)qUm7?BVPW)+-#+|vE&x7Y!d<~7CTrel1^T7c-@q#%BR(v1cj7Qy-lxM zc!il|+>FNmgyggXLRwzvh#f|D-wIuEeo?7w)RnkWfTI^#S2VZj9*5 zu-`EV&*YdSbI@UNi~JGzqT4=42v#fQC59Mg%2glBrlQNrVOOb!NNb4#rM~t?JIKKd za4Ft+orp2=|0pyR*d`JL|0b zrv`O*XZFsWbMHOpyXTyH=bV`s8p_uvU#L_cBKiv@K3SB{;_!ZklZ~IEI#DGqE2=1R zh}I2SH-*5Us|Kl%)S`+dwIQuGRQWZ z+S;Y%rW;sXjaqdsNY|alntv%6@R~dBinG*c+Rpds{Ud;9>++gg-MaMPhT}Gzy7!>( zI!@j3&&%bBt;_G{=jt`jUukY`of%~@7_uCh>dL|H*eB-d&6R51JA*<*><%)EmYc($ zJHV$rpIFWGEmPl;TDd>Ow^BH9!DB*FgDS6%7*e}pa1B0uCynDN9Nv#{0;)i*bsu9Z z5Mnx@r9{5sDohPf2E3RlwLL@&m|@H!)D2WJeHOJUry2w%;(NeLlOtG$je`X9ngI+Uf>UBdaf_cHUKzj4P_-IrOmpkdg zEVZscJB%BXI6NLIM^_HZ1Xk;Htw4Pzk$aC>WToS_!XnQb_yhJEtorJL9$y8js>kH{ zmw|)qT)n!vY*)`5LY0Suu9Ws3Ta&sYASQW^aOB7hF=#_!EV#y86n%xp~Dk-*)+19IXE0UETxyH7Kej=M=jSXy{dd2)$4=2VI40 zMJ*JAl?+v_X9=pK9S$^Re(?#qVX!KJ3O)i=XzQaZ(WW^cT9~+I1x82R``b^@!Zo5* z=#r@7*Tn&8CfE#^ku=SPByhv2(9QLdWMdL|=l&#lh$GCPzmN?$@u)5v*A2#X`M7R~ z=Acu|%P5*?gCFOg= zSlE6H$DKwbAMUdT0i&^4zHuqRJ+`Km)-6bRNMRO3EZJvbkuyvbfL>V32)6#+cbfmy zcbZ3KnxE+Q63+i8ocC9H=Y5Mhn9!Ijp7%G+)UIF_n*WaXrqswhk#+@3j=I zVv{o2wa$>hwmr}hH4*aPtC)WY^8j@K+K$X2AZpb23G zn4wWl0oH$Xoz9=d>HMLuHoo0g8@1DUC)N5+ud00WRzLDVDs)-0H3{7nDnE(4!lv|^ z$y0tUO{;L4cO+&V8wJ_k42V2aKm?@$?V>&a8t>rvQ7S?M_d_%e&>Z$m_MNh$`P(qB zT7%@{H^e)t^;nx|WR*Bj}Sf?hbGt8wn++G`7cX)$0!2)&Iq;MVMgNx3se3RhM$KWrS*Npx{YnPB^nM{?;`skFH!#fNS%+xJ2%Y} zvo%U>w1y#Lc_;{B8R~T15^F zZjW&1G1H(ZqARt1GP3~D*T)$+&}e?tb|JIS`n{+Vo>egKysZBs0l{~BAh1hR1PtF` z7iEUJ~>X_V6xY0D7k!2O zW3J!qhQelLwSk+MU;GcxY!}sNpb-CC$GO#ZkM<(7ml`Ckb3fS!5`!jZJv8jSuo2Q; zMTpfsb^v)0=o@)_GFuZ>l%w%*$7$0J}#CoH0J`FZ^96g==HUku-%xn>x@%R?{VuMsy?Id zg>Da5or%zkw#TVGLF{m6hfr?d!5x6GH~ss-hk*XFfR5FCez7Ur%NT`7Awo9f5|fOL z&J-d&u=ZjCvOTmoR-Vpef>^r18WQ#_m*_dR!p>BJID#uAmdhUEJE%QH?IJV|rAaC> z1kGWPhl*{4mK=5gg#;@mx?>E6@szMx_xZmZhE6h|?bdcfCo`yzLMQjHIsVd0 zwb5*BIG6hcP7;AIgz3_@vxc}@Gy-!|Ryt%6M@d``&m_kvebgA0)G5!ey0;ru3w-sA z@tLAJ`>9cN4>FA5r0or9Vt`8Oq5lV99KyVPfStPCfF%Z~q#kC#0`*9s9t)Hms7G(I z?YejIiLE&>W@g;GxuBLS#>{M>Rs!{SEIq3V!y$$aBaaMyv7ezMR)eq#wr~jDz_{Vf zPf0Xf`vZw_u|ELWG+RG{xBScG-Qj2iAyj3z9?(nM2CiHufqQGk4=TbzX& zaiA{9ASQ@-Vnf@195sbMiUQx03L>A31!Od00g+OP1Uw#k8En5HkwlFoWCYnH7@+_5 z@mNV)Yf=4ZW1h@%ratR(USi3(NeTMS(r;BNE@n zMnZ?rP>H32<0tH4`A=f!fW4?!OavCbqPFpH$kA&Aw8Eas;Hc8lm*SLon>kaz4g zXQ^IYcIrW%BeJ?If^@lmKR37LG@RC!d*)l{L1aj)D|heV1tZ!3d8fUN`eF1Gt}XZX zs9Y=~6b&*$z;H(pttKkc9T81?83di3>S~nJc#!2h$tRwXt{~#bzm2MB8cHa~v(U5j zeYWhmJhLB*yYF?m`+PC>!_O;YKmEKi_Uqfp7`yq%y2FL`W9!&&x{OIKXVy!Abz*=wC z8bNwz%a$yY*L3~z0lifdTW6~eGUCh}JIID^*sElt-C$*);k?8dHC{(5S-08rIaQT# z=`)G)Y=AUW&GjmSiB?`EAN96WPn7+@#CEfi33IfSoKQ*cx`?|Eg|#T244X1RvV5vC z5O?p-Brj%zxS!c>Kc_00E!RPZvr*xcc)M~&<06jm0Y-3|PhFRIf-iXy4F;25{0M5i z!;(x)tF$$$3Ti@4se6!F!`B#7_?mgOPu+>yQRLjlapv6H-nD5^kA^L{q}z<6hcV-j zIw-x}Z;Yx(@#Wt57*mH5J&voR((YavXHcchdobpNF@d~a5tJqLHKJyb`zxAUcR)RW z)bx`WvuN_@xH8-LFZ8*Z7TOW(rR06CvtCHOU>phCOe_5zGtQc))iG5tEW`a`jHsae z6fReANJ?=O{{uMqfyMh5@bs}}>rk?Og~3uB#>Fa*q@zwRG6e#+^74T$U_r+vGTeyD z;=PWm2*e`Q{i-Cak!y_TB~H1?x$c(D_mHvaz4r`9@DW->uedkGf5>VfRDoRt(v8?_ zAXmr_b;$Ohz$8c{c8gP7L<`)Rlf3zof$-?^EFPB` z-#mwVIK3*buplsv=VT7RGr+?Ug%S#+PO}`kPEd>p#ekf_E&4;`31RdRyc(it0B^f+ z5{UakKp9Wi*t1+L59k*FWV7P3c4+fsWw~-DUz(ntE`{Erq~Gu;ZC2qCTcze|$*Wy) zN=NO|@(z;drCOtO)GK+;3i8+)BVP2>_B1-f<8kbCW_InO2Ejt~$bBf3LyOVxnftad?0$)P3>QDm zuX_$vyEq-{aw<w)PxzKj{9 z^-Q5=XEQLdF%Ht+_Do6UN-dgCQ0|D~q8(Q*OKzR7s#E nj<(>Pl+C8SQ+Lj=mV3O(AH-W71vLovM-7zC!OY;u_;3FQ2M-CO diff --git a/pygraph/kernels/deltaKernel.py b/pygraph/kernels/deltaKernel.py index 31b376a..fd35d8c 100644 --- a/pygraph/kernels/deltaKernel.py +++ b/pygraph/kernels/deltaKernel.py @@ -8,8 +8,8 @@ def deltakernel(condition): Return ------ - Kernel : integer - Delta Kernel. + kernel : integer + Delta kernel. References ---------- diff --git a/pygraph/kernels/marginalizedKernel.py b/pygraph/kernels/marginalizedKernel.py index c3d168d..199164b 100644 --- a/pygraph/kernels/marginalizedKernel.py +++ b/pygraph/kernels/marginalizedKernel.py @@ -29,8 +29,8 @@ def marginalizedkernel(*args, node_label = 'atom', edge_label = 'bond_type', p_q Return ------ - Kmatrix/Kernel : Numpy matrix/int - Kernel matrix, each element of which is the marginalized kernel between 2 praphs. / Marginalized Kernel between 2 graphs. + Kmatrix/kernel : Numpy matrix/float + Kernel matrix, each element of which is the marginalized kernel between 2 praphs. / Marginalized kernel between 2 graphs. References ---------- @@ -65,24 +65,24 @@ def marginalizedkernel(*args, node_label = 'atom', edge_label = 'bond_type', p_q def _marginalizedkernel_do(G1, G2, node_label, edge_label, p_quit, itr): - """Calculate marginalized graph kernels between 2 graphs. + """Calculate marginalized graph kernel between 2 graphs. Parameters ---------- G1, G2 : NetworkX graphs 2 graphs between which the kernel is calculated. node_label : string - node attribute used as label. The default node label is atom. + node attribute used as label. edge_label : string - edge attribute used as label. The default edge label is bond_type. + edge attribute used as label. p_quit : integer - the termination probability in the random walks generating step + the termination probability in the random walks generating step. itr : integer - time of iterations to calculate R_inf + time of iterations to calculate R_inf. Return ------ - Kernel : int + kernel : float Marginalized Kernel between 2 graphs. """ # init parameters diff --git a/pygraph/kernels/pathKernel.py b/pygraph/kernels/pathKernel.py index bc317c7..869ed7a 100644 --- a/pygraph/kernels/pathKernel.py +++ b/pygraph/kernels/pathKernel.py @@ -25,8 +25,8 @@ def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): Return ------ - Kmatrix/Kernel : Numpy matrix/int - Kernel matrix, each element of which is the path kernel between 2 praphs. / Path Kernel between 2 graphs. + Kmatrix/kernel : Numpy matrix/float + Kernel matrix, each element of which is the path kernel between 2 praphs. / Path kernel between 2 graphs. References ---------- @@ -64,7 +64,7 @@ def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): def _pathkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', weight = None): - """Calculate mean average path kernels between 2 graphs. + """Calculate mean average path kernel between 2 graphs. Parameters ---------- @@ -79,7 +79,7 @@ def _pathkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', weight Return ------ - Kernel : int + kernel : float Path Kernel between 2 graphs. """ # calculate shortest paths for both graphs diff --git a/pygraph/kernels/spKernel.py b/pygraph/kernels/spKernel.py index 6136c78..0b2c024 100644 --- a/pygraph/kernels/spKernel.py +++ b/pygraph/kernels/spKernel.py @@ -25,8 +25,8 @@ def spkernel(*args, edge_weight = 'bond_type'): Return ------ - Kmatrix/Kernel : Numpy matrix/int - Kernel matrix, each element of which is the sp kernel between 2 praphs. / SP Kernel between 2 graphs. + Kmatrix/kernel : Numpy matrix/float + Kernel matrix, each element of which is the sp kernel between 2 praphs. / SP kernel between 2 graphs. References ---------- diff --git a/pygraph/kernels/treeletKernel.py b/pygraph/kernels/treeletKernel.py index 4988b57..9e99c89 100644 --- a/pygraph/kernels/treeletKernel.py +++ b/pygraph/kernels/treeletKernel.py @@ -10,266 +10,368 @@ import networkx as nx import numpy as np -def find_paths(G, source_node, length): - if length == 0: - return [[source_node]] - path = [ [source_node] + path for neighbor in G[source_node] \ - for path in find_paths(G, neighbor, length - 1) if source_node not in path ] - return path +def treeletkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True): + """Calculate treelet graph kernels between graphs. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + / + G1, G2 : NetworkX graphs + 2 graphs between which the kernel is calculated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. + labeled : boolean + Whether the graphs are labeled. The default is True. + + Return + ------ + Kmatrix/kernel : Numpy matrix/float + Kernel matrix, each element of which is the treelet kernel between 2 praphs. / Treelet kernel between 2 graphs. + """ + if len(args) == 1: # for a list of graphs + Gn = args[0] + Kmatrix = np.zeros((len(Gn), len(Gn))) -def find_all_paths(G, length): - all_paths = [] - for node in G: - all_paths.extend(find_paths(G, node, length)) - all_paths_r = [ path[::-1] for path in all_paths ] + start_time = time.time() + + for i in range(0, len(Gn)): + for j in range(i, len(Gn)): + Kmatrix[i][j] = _treeletkernel_do(Gn[i], Gn[j], node_label = node_label, edge_label = edge_label, labeled = labeled) + Kmatrix[j][i] = Kmatrix[i][j] + + run_time = time.time() - start_time + print("\n --- treelet kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) + + return Kmatrix, run_time - # remove double direction - for idx, path in enumerate(all_paths[:-1]): - for path2 in all_paths_r[idx+1::]: - if path == path2: - all_paths[idx] = [] - break - - return list(filter(lambda a: a != [], all_paths)) + else: # for only 2 graphs + + start_time = time.time() + + kernel = _treeletkernel_do(args[0], args[1], node_label = node_label, edge_label = edge_label, labeled = labeled) + + run_time = time.time() - start_time + print("\n --- treelet kernel built in %s seconds ---" % (run_time)) + + return kernel, run_time + -def get_canonkey(G, node_label = 'atom', edge_label = 'bond_type', labeled = True): +def _treeletkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', labeled = True): + """Calculate treelet graph kernel between 2 graphs. - patterns = {} - canonkey = {} # canonical key + Parameters + ---------- + G1, G2 : NetworkX graphs + 2 graphs between which the kernel is calculated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. + labeled : boolean + Whether the graphs are labeled. The default is True. + + Return + ------ + kernel : float + Treelet Kernel between 2 graphs. + """ + canonkey1 = get_canonkeys(G1, node_label = node_label, edge_label = edge_label, labeled = labeled) + canonkey2 = get_canonkeys(G2, node_label = node_label, edge_label = edge_label, labeled = labeled) + + keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs + vector1 = np.matrix([ (canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys ]) + vector2 = np.matrix([ (canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys ]) + kernel = np.sum(np.exp(- np.square(vector1 - vector2) / 2)) + + return kernel + + +def get_canonkeys(G, node_label = 'atom', edge_label = 'bond_type', labeled = True): + """Generate canonical keys of all treelets in a graph. - ### structural analysis ### - # linear patterns - patterns['0'] = G.nodes() - canonkey['0'] = nx.number_of_nodes(G) - for i in range(1, 6): - patterns[str(i)] = find_all_paths(G, i) - canonkey[str(i)] = len(patterns[str(i)]) - - # n-star patterns - patterns['3star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3 ] - patterns['4star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4 ] - patterns['5star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5 ] - # n-star patterns - canonkey['6'] = len(patterns['3star']) - canonkey['8'] = len(patterns['4star']) - canonkey['d'] = len(patterns['5star']) + Parameters + ---------- + G : NetworkX graphs + The graph in which keys are generated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. + labeled : boolean + Whether the graphs are labeled. The default is True. - # pattern 7 - patterns['7'] = [] - for pattern in patterns['3star']: - for i in range(1, len(pattern)): - if G.degree(pattern[i]) >= 2: - pattern_t = pattern[:] - pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] - for neighborx in G[pattern[i]]: - if neighborx != pattern[0]: - new_pattern = pattern_t + [ neighborx ] - patterns['7'].append(new_pattern) - canonkey['7'] = len(patterns['7']) + Return + ------ + canonkey/canonkey_l : dict + For unlabeled graphs, canonkey is a dictionary which records amount of every tree pattern. For labeled graphs, canonkey_l is one which keeps track of amount of every treelet. - # pattern 11 - patterns['11'] = [] - for pattern in patterns['4star']: + References + ---------- + [1] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. + """ + patterns = {} # a dictionary which consists of lists of patterns for all graphlet. + canonkey = {} # canonical key, a dictionary which records amount of every tree pattern. + + ### structural analysis ### + ### In this section, a list of patterns is generated for each graphlet, where every pattern is represented by nodes ordered by + ### Morgan's extended labeling. + # linear patterns + patterns['0'] = G.nodes() + canonkey['0'] = nx.number_of_nodes(G) + for i in range(1, 6): + patterns[str(i)] = find_all_paths(G, i) + canonkey[str(i)] = len(patterns[str(i)]) + + # n-star patterns + patterns['3star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3 ] + patterns['4star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4 ] + patterns['5star'] = [ [node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5 ] + # n-star patterns + canonkey['6'] = len(patterns['3star']) + canonkey['8'] = len(patterns['4star']) + canonkey['d'] = len(patterns['5star']) + + # pattern 7 + patterns['7'] = [] # the 1st line of Table 1 in Ref [1] + for pattern in patterns['3star']: + for i in range(1, len(pattern)): # for each neighbor of node 0 + if G.degree(pattern[i]) >= 2: + pattern_t = pattern[:] + pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] # set the node with degree >= 2 as the 4th node + for neighborx in G[pattern[i]]: + if neighborx != pattern[0]: + new_pattern = pattern_t + [ neighborx ] + patterns['7'].append(new_pattern) + canonkey['7'] = len(patterns['7']) + + # pattern 11 + patterns['11'] = [] # the 4th line of Table 1 in Ref [1] + for pattern in patterns['4star']: + for i in range(1, len(pattern)): + if G.degree(pattern[i]) >= 2: + pattern_t = pattern[:] + pattern_t[i], pattern_t[4] = pattern_t[4], pattern_t[i] + for neighborx in G[pattern[i]]: + if neighborx != pattern[0]: + new_pattern = pattern_t + [ neighborx ] + patterns['11'].append(new_pattern) + canonkey['b'] = len(patterns['11']) + + # pattern 12 + patterns['12'] = [] # the 5th line of Table 1 in Ref [1] + rootlist = [] # a list of root nodes, whose extended labels are 3 + for pattern in patterns['3star']: + if pattern[0] not in rootlist: # prevent to count the same pattern twice from each of the two root nodes + rootlist.append(pattern[0]) for i in range(1, len(pattern)): - if G.degree(pattern[i]) >= 2: + if G.degree(pattern[i]) >= 3: + rootlist.append(pattern[i]) pattern_t = pattern[:] - pattern_t[i], pattern_t[4] = pattern_t[4], pattern_t[i] - for neighborx in G[pattern[i]]: - if neighborx != pattern[0]: - new_pattern = pattern_t + [ neighborx ] - patterns['11'].append(new_pattern) - canonkey['b'] = len(patterns['11']) - - # pattern 12 - patterns['12'] = [] - rootlist = [] - for pattern in patterns['3star']: - if pattern[0] not in rootlist: - rootlist.append(pattern[0]) - for i in range(1, len(pattern)): - if G.degree(pattern[i]) >= 3: - rootlist.append(pattern[i]) + pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] + for neighborx1 in G[pattern[i]]: + if neighborx1 != pattern[0]: + for neighborx2 in G[pattern[i]]: + if neighborx1 > neighborx2 and neighborx2 != pattern[0]: + new_pattern = pattern_t + [neighborx1] + [neighborx2] +# new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ] + patterns['12'].append(new_pattern) + canonkey['c'] = int(len(patterns['12']) / 2) + + # pattern 9 + patterns['9'] = [] # the 2nd line of Table 1 in Ref [1] + for pattern in patterns['3star']: + for pairs in [ [neighbor1, neighbor2] for neighbor1 in G[pattern[0]] if G.degree(neighbor1) >= 2 \ + for neighbor2 in G[pattern[0]] if G.degree(neighbor2) >= 2 if neighbor1 > neighbor2 ]: + pattern_t = pattern[:] + # move nodes with extended labels 4 to specific position to correspond to their children + pattern_t[pattern_t.index(pairs[0])], pattern_t[2] = pattern_t[2], pattern_t[pattern_t.index(pairs[0])] + pattern_t[pattern_t.index(pairs[1])], pattern_t[3] = pattern_t[3], pattern_t[pattern_t.index(pairs[1])] + for neighborx1 in G[pairs[0]]: + if neighborx1 != pattern[0]: + for neighborx2 in G[pairs[1]]: + if neighborx2 != pattern[0]: + new_pattern = pattern_t + [neighborx1] + [neighborx2] + patterns['9'].append(new_pattern) + canonkey['9'] = len(patterns['9']) + + # pattern 10 + patterns['10'] = [] # the 3rd line of Table 1 in Ref [1] + for pattern in patterns['3star']: + for i in range(1, len(pattern)): + if G.degree(pattern[i]) >= 2: + for neighborx in G[pattern[i]]: + if neighborx != pattern[0] and G.degree(neighborx) >= 2: pattern_t = pattern[:] pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] - for neighborx1 in G[pattern[i]]: - if neighborx1 != pattern[0]: - for neighborx2 in G[pattern[i]]: - if neighborx1 > neighborx2 and neighborx2 != pattern[0]: - new_pattern = pattern_t + [neighborx1] + [neighborx2] -# new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ] - patterns['12'].append(new_pattern) - canonkey['c'] = int(len(patterns['12']) / 2) - - # pattern 9 - patterns['9'] = [] - for pattern in patterns['3star']: - for pairs in [ [neighbor1, neighbor2] for neighbor1 in G[pattern[0]] if G.degree(neighbor1) >= 2 \ - for neighbor2 in G[pattern[0]] if G.degree(neighbor2) >= 2 if neighbor1 > neighbor2 ]: - pattern_t = pattern[:] - pattern_t[pattern_t.index(pairs[0])], pattern_t[2] = pattern_t[2], pattern_t[pattern_t.index(pairs[0])] - pattern_t[pattern_t.index(pairs[1])], pattern_t[3] = pattern_t[3], pattern_t[pattern_t.index(pairs[1])] - for neighborx1 in G[pairs[0]]: - if neighborx1 != pattern[0]: - for neighborx2 in G[pairs[1]]: - if neighborx2 != pattern[0]: - new_pattern = pattern_t + [neighborx1] + [neighborx2] - patterns['9'].append(new_pattern) - canonkey['9'] = len(patterns['9']) - - # pattern 10 - patterns['10'] = [] - for pattern in patterns['3star']: - for i in range(1, len(pattern)): - if G.degree(pattern[i]) >= 2: - for neighborx in G[pattern[i]]: - if neighborx != pattern[0] and G.degree(neighborx) >= 2: - pattern_t = pattern[:] - pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] - new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ] - patterns['10'].extend(new_patterns) - canonkey['a'] = len(patterns['10']) - - ### labeling information ### - if labeled == True: - canonkey_l = {} - - # linear patterns - canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values())) - for key in canonkey_t: - canonkey_l['0' + key] = canonkey_t[key] - - for i in range(1, 6): - treelet = [] - for pattern in patterns[str(i)]: - canonlist = list(chain.from_iterable((G.node[node][node_label], \ - G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1]))) - canonlist.append(G.node[pattern[-1]][node_label]) - canonkey_t = ''.join(canonlist) - canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1] - treelet.append(str(i) + canonkey_t) - canonkey_l.update(Counter(treelet)) - - # n-star patterns - for i in range(3, 6): - treelet = [] - for pattern in patterns[str(i) + 'star']: - canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ] - canonlist.sort() - canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist) - treelet.append(canonkey_t) - canonkey_l.update(Counter(treelet)) - - # pattern 7 - treelet = [] - for pattern in patterns['7']: - canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] - canonlist.sort() - canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ - + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ - + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] - treelet.append(canonkey_t) - canonkey_l.update(Counter(treelet)) - - # pattern 11 + new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ] + patterns['10'].extend(new_patterns) + canonkey['a'] = len(patterns['10']) + + ### labeling information ### + ### In this section, a list of canonical keys is generated for every pattern obtained in the structural analysis + ### section above, which is a string corresponding to a unique treelet. A dictionary is built to keep track of + ### the amount of every treelet. + if labeled == True: + canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. + + # linear patterns + canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values())) + for key in canonkey_t: + canonkey_l['0' + key] = canonkey_t[key] + + for i in range(1, 6): treelet = [] - for pattern in patterns['11']: - canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ] - canonlist.sort() - canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ - + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \ - + G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] - treelet.append(canonkey_t) + for pattern in patterns[str(i)]: + canonlist = list(chain.from_iterable((G.node[node][node_label], \ + G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1]))) + canonlist.append(G.node[pattern[-1]][node_label]) + canonkey_t = ''.join(canonlist) + canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1] + treelet.append(str(i) + canonkey_t) canonkey_l.update(Counter(treelet)) - # pattern 10 + # n-star patterns + for i in range(3, 6): treelet = [] - for pattern in patterns['10']: - canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] - canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] + for pattern in patterns[str(i) + 'star']: + canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ] canonlist.sort() - canonkey0 = ''.join(canonlist) - canonkey_t = 'a' + G.node[pattern[3]][node_label] \ - + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \ - + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ - + canonkey4 + canonkey0 + canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) + + # pattern 7 + treelet = [] + for pattern in patterns['7']: + canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] + canonlist.sort() + canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ + + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ + + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] + treelet.append(canonkey_t) + canonkey_l.update(Counter(treelet)) + + # pattern 11 + treelet = [] + for pattern in patterns['11']: + canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ] + canonlist.sort() + canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ + + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \ + + G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] + treelet.append(canonkey_t) + canonkey_l.update(Counter(treelet)) + + # pattern 10 + treelet = [] + for pattern in patterns['10']: + canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] + canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] + canonlist.sort() + canonkey0 = ''.join(canonlist) + canonkey_t = 'a' + G.node[pattern[3]][node_label] \ + + G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \ + + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ + + canonkey4 + canonkey0 + treelet.append(canonkey_t) + canonkey_l.update(Counter(treelet)) + + # pattern 12 + treelet = [] + for pattern in patterns['12']: + canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] + canonlist0.sort() + canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ] + canonlist3.sort() - # pattern 12 - treelet = [] - for pattern in patterns['12']: - canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] - canonlist0.sort() - canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ] - canonlist3.sort() - canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \ - + ''.join(canonlist0) \ - + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ - + ''.join(canonlist3) - - canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \ - + ''.join(canonlist3) \ - + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ - + ''.join(canonlist0) - - treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) - canonkey_l.update(Counter(treelet)) - - # pattern 9 - treelet = [] - for pattern in patterns['9']: - canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label] - canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label] - prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label] - prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] - if prekey2 + canonkey2 < prekey3 + canonkey3: - canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ - + prekey2 + prekey3 + canonkey2 + canonkey3 - else: - canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ - + prekey3 + prekey2 + canonkey3 + canonkey2 - treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t) - canonkey_l.update(Counter(treelet)) - - return canonkey_l - - return canonkey - + # 2 possible key can be generated from 2 nodes with extended label 3, select the one with lower lexicographic order. + canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \ + + ''.join(canonlist0) \ + + G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ + + ''.join(canonlist3) -def treeletkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True): - if len(args) == 1: # for a list of graphs - Gn = args[0] - Kmatrix = np.zeros((len(Gn), len(Gn))) + canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \ + + ''.join(canonlist3) \ + + G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ + + ''.join(canonlist0) - start_time = time.time() - - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = treeletkernel(Gn[i], Gn[j], labeled = labeled, node_label = node_label, edge_label = edge_label) - Kmatrix[j][i] = Kmatrix[i][j] + treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) + canonkey_l.update(Counter(treelet)) - run_time = time.time() - start_time - print("\n --- treelet kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) - - return Kmatrix, run_time + # pattern 9 + treelet = [] + for pattern in patterns['9']: + canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label] + canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label] + prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label] + prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] + if prekey2 + canonkey2 < prekey3 + canonkey3: + canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ + + prekey2 + prekey3 + canonkey2 + canonkey3 + else: + canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ + + prekey3 + prekey2 + canonkey3 + canonkey2 + treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t) + canonkey_l.update(Counter(treelet)) + + return canonkey_l + + return canonkey - else: # for only 2 graphs - - G1 = args[0] - G = args[1] - kernel = 0 - -# start_time = time.time() - - canonkey2 = get_canonkey(G, node_label = node_label, edge_label = edge_label, labeled = labeled) - canonkey1 = get_canonkey(G1, node_label = node_label, edge_label = edge_label, labeled = labeled) - - keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs - vector1 = np.matrix([ (canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys ]) - vector2 = np.matrix([ (canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys ]) - kernel = np.sum(np.exp(- np.square(vector1 - vector2) / 2)) + +def find_paths(G, source_node, length): + """Find all paths with a certain length those start from a source node. A recursive depth first search is applied. + + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + source_node : integer + The number of the node from where all paths start. + length : integer + The length of paths. -# run_time = time.time() - start_time -# print("\n --- treelet kernel built in %s seconds ---" % (run_time)) + Return + ------ + path : list of list + List of paths retrieved, where each path is represented by a list of nodes. + """ + if length == 0: + return [[source_node]] + path = [ [source_node] + path for neighbor in G[source_node] \ + for path in find_paths(G, neighbor, length - 1) if source_node not in path ] + return path - return kernel#, run_time \ No newline at end of file + +def find_all_paths(G, length): + """Find all paths with a certain length in a graph. A recursive depth first search is applied. + + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + length : integer + The length of paths. + + Return + ------ + path : list of list + List of paths retrieved, where each path is represented by a list of nodes. + """ + all_paths = [] + for node in G: + all_paths.extend(find_paths(G, node, length)) + all_paths_r = [ path[::-1] for path in all_paths ] + + # For each path, two presentation are retrieved from its two extremities. Remove one of them. + for idx, path in enumerate(all_paths[:-1]): + for path2 in all_paths_r[idx+1::]: + if path == path2: + all_paths[idx] = [] + break + + return list(filter(lambda a: a != [], all_paths)) \ No newline at end of file diff --git a/pygraph/kernels/weisfeilerLehmanKernel.py b/pygraph/kernels/weisfeilerLehmanKernel.py index 264ce21..e2d2bd2 100644 --- a/pygraph/kernels/weisfeilerLehmanKernel.py +++ b/pygraph/kernels/weisfeilerLehmanKernel.py @@ -9,8 +9,6 @@ import time from pygraph.kernels.spkernel import spkernel from pygraph.kernels.pathKernel import pathkernel -# test of WL subtree kernel on many graphs - import sys import pathlib from collections import Counter @@ -44,8 +42,8 @@ def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', Return ------ - Kmatrix/Kernel : Numpy matrix/int - Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman Kernel between 2 graphs. + Kmatrix/kernel : Numpy matrix/float + Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman kernel between 2 graphs. Notes ----- @@ -125,7 +123,7 @@ def _wl_subtreekernel_do(*args, node_label = 'atom', edge_label = 'bond_type', h Return ------ - Kmatrix/Kernel : Numpy matrix/int + Kmatrix/kernel : Numpy matrix/float Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. """ @@ -229,8 +227,8 @@ def _weisfeilerlehmankernel_do(G1, G2, height = 0): Return ------ - Kernel : int - Weisfeiler-Lehman Kernel between 2 graphs. + kernel : float + Weisfeiler-Lehman kernel between 2 graphs. """ # init. @@ -298,4 +296,4 @@ def relabel(G): # get the set of compressed labels labels_comp = list(nx.get_node_attributes(G, 'label').values()) - num_of_each_label.update(dict(Counter(labels_comp))) \ No newline at end of file + num_of_each_label.update(dict(Counter(labels_comp)))