From 8a8cbbe85f2fc8b1a4454b894339661613315afe Mon Sep 17 00:00:00 2001 From: knightmarehs Date: Thu, 13 Dec 2018 22:42:33 +0800 Subject: [PATCH] First commit The first commit of gAnswer project. --- src/addition/AddtionalFix.java | 238 ++++ src/addition/AggregationRecognition.java | 155 +++ src/fgmt/EntityFragment.java | 312 +++++ src/fgmt/Fragment.java | 8 + src/fgmt/RelationFragment.java | 105 ++ src/fgmt/TypeFragment.java | 179 +++ src/fgmt/VariableFragment.java | 56 + src/jgsc/GstoreConnector.java | 489 ++++++++ src/lcn/BuildIndexForEntityFragments.java | 133 ++ src/lcn/BuildIndexForTypeShortName.java | 107 ++ src/lcn/EntityFragmentFields.java | 64 + src/lcn/EntityNameAndScore.java | 31 + src/lcn/Main.java | 58 + src/lcn/SearchInEntityFragments.java | 84 ++ src/lcn/SearchInTypeShortName.java | 176 +++ src/log/QueryLogger.java | 116 ++ src/nlp/ds/DependencyTree.java | 402 +++++++ src/nlp/ds/DependencyTreeNode.java | 150 +++ src/nlp/ds/Sentence.java | 88 ++ src/nlp/ds/Word.java | 126 ++ src/nlp/tool/CoreNLP.java | 202 ++++ src/nlp/tool/Main.java | 42 + src/nlp/tool/MaltParser.java | 70 ++ src/nlp/tool/MaltParserCon.java | 73 ++ src/nlp/tool/NERecognizer.java | 53 + src/nlp/tool/StanfordParser.java | 51 + src/nlp/tool/StopWordsList.java | 614 ++++++++++ src/paradict/ParaphraseDictionary.java | 441 +++++++ src/paradict/PredicateIDAndSupport.java | 24 + src/qa/Answer.java | 105 ++ src/qa/GAnswer.java | 376 ++++++ src/qa/Globals.java | 118 ++ src/qa/Matches.java | 9 + src/qa/Query.java | 128 ++ src/qa/extract/CorefResolution.java | 153 +++ src/qa/extract/EntityRecognition.java | 918 ++++++++++++++ src/qa/extract/ExtractImplicitRelation.java | 467 +++++++ src/qa/extract/ExtractRelation.java | 472 ++++++++ src/qa/extract/TypeRecognition.java | 358 ++++++ src/qa/mapping/CompatibilityChecker.java | 690 +++++++++++ src/qa/mapping/DBpediaLookup.java | 164 +++ src/qa/mapping/EntityFragmentDict.java | 44 + src/qa/mapping/SemanticItemMapping.java | 811 +++++++++++++ src/qa/parsing/BuildQueryGraph.java | 1201 +++++++++++++++++++ src/qa/parsing/QuestionParsing.java | 208 ++++ src/rdf/EntityMapping.java | 40 + src/rdf/ImplicitRelation.java | 77 ++ src/rdf/MergedWord.java | 41 + src/rdf/NodeSelectedWithScore.java | 24 + src/rdf/PredicateMapping.java | 28 + src/rdf/SemanticQueryGraph.java | 180 +++ src/rdf/SemanticRelation.java | 171 +++ src/rdf/SemanticUnit.java | 61 + src/rdf/SimpleRelation.java | 88 ++ src/rdf/Sparql.java | 305 +++++ src/rdf/Triple.java | 257 ++++ src/rdf/TypeMapping.java | 53 + src/utils/FileUtil.java | 91 ++ src/utils/HttpRequest.java | 114 ++ 59 files changed, 12399 insertions(+) create mode 100644 src/addition/AddtionalFix.java create mode 100644 src/addition/AggregationRecognition.java create mode 100644 src/fgmt/EntityFragment.java create mode 100644 src/fgmt/Fragment.java create mode 100644 src/fgmt/RelationFragment.java create mode 100644 src/fgmt/TypeFragment.java create mode 100644 src/fgmt/VariableFragment.java create mode 100644 src/jgsc/GstoreConnector.java create mode 100644 src/lcn/BuildIndexForEntityFragments.java create mode 100644 src/lcn/BuildIndexForTypeShortName.java create mode 100644 src/lcn/EntityFragmentFields.java create mode 100644 src/lcn/EntityNameAndScore.java create mode 100644 src/lcn/Main.java create mode 100644 src/lcn/SearchInEntityFragments.java create mode 100644 src/lcn/SearchInTypeShortName.java create mode 100644 src/log/QueryLogger.java create mode 100644 src/nlp/ds/DependencyTree.java create mode 100644 src/nlp/ds/DependencyTreeNode.java create 
mode 100644 src/nlp/ds/Sentence.java create mode 100644 src/nlp/ds/Word.java create mode 100644 src/nlp/tool/CoreNLP.java create mode 100644 src/nlp/tool/Main.java create mode 100644 src/nlp/tool/MaltParser.java create mode 100644 src/nlp/tool/MaltParserCon.java create mode 100644 src/nlp/tool/NERecognizer.java create mode 100644 src/nlp/tool/StanfordParser.java create mode 100644 src/nlp/tool/StopWordsList.java create mode 100644 src/paradict/ParaphraseDictionary.java create mode 100644 src/paradict/PredicateIDAndSupport.java create mode 100644 src/qa/Answer.java create mode 100644 src/qa/GAnswer.java create mode 100644 src/qa/Globals.java create mode 100644 src/qa/Matches.java create mode 100644 src/qa/Query.java create mode 100644 src/qa/extract/CorefResolution.java create mode 100644 src/qa/extract/EntityRecognition.java create mode 100644 src/qa/extract/ExtractImplicitRelation.java create mode 100644 src/qa/extract/ExtractRelation.java create mode 100644 src/qa/extract/TypeRecognition.java create mode 100644 src/qa/mapping/CompatibilityChecker.java create mode 100644 src/qa/mapping/DBpediaLookup.java create mode 100644 src/qa/mapping/EntityFragmentDict.java create mode 100644 src/qa/mapping/SemanticItemMapping.java create mode 100644 src/qa/parsing/BuildQueryGraph.java create mode 100644 src/qa/parsing/QuestionParsing.java create mode 100644 src/rdf/EntityMapping.java create mode 100644 src/rdf/ImplicitRelation.java create mode 100644 src/rdf/MergedWord.java create mode 100644 src/rdf/NodeSelectedWithScore.java create mode 100644 src/rdf/PredicateMapping.java create mode 100644 src/rdf/SemanticQueryGraph.java create mode 100644 src/rdf/SemanticRelation.java create mode 100644 src/rdf/SemanticUnit.java create mode 100644 src/rdf/SimpleRelation.java create mode 100644 src/rdf/Sparql.java create mode 100644 src/rdf/Triple.java create mode 100644 src/rdf/TypeMapping.java create mode 100644 src/utils/FileUtil.java create mode 100644 src/utils/HttpRequest.java diff --git a/src/addition/AddtionalFix.java b/src/addition/AddtionalFix.java new file mode 100644 index 0000000..e7235cb --- /dev/null +++ b/src/addition/AddtionalFix.java @@ -0,0 +1,238 @@ +package addition; + +import java.util.ArrayList; +import java.util.HashMap; + +import paradict.PredicateIDAndSupport; +import log.QueryLogger; +//import nlp.ds.DependencyTree; +//import nlp.ds.DependencyTreeNode; +import nlp.ds.Word; +import nlp.ds.Sentence.SentenceType; +import qa.Globals; +//import qa.extract.TypeRecognition; +//import qa.mapping.SemanticItemMapping; +//import rdf.EntityMapping; +import rdf.SemanticUnit; +import rdf.Sparql; +import rdf.Sparql.QueryType; +import rdf.Triple; +//import fgmt.TypeFragment; + + +public class AddtionalFix +{ + public HashMap pattern2category = new HashMap(); + + public AddtionalFix() + { + // Some category mappings for DBpedia, try automatic linking methods later. 
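+ // Example: "Give me all gangsters from the Prohibition era." is normalized to the base-form key "gangster_from_the_prohibition_era" and mapped to the category "Prohibition-era_gangsters"; fixCategory() and oneNode() below then attach a (?x, <subject>, <Category>) triple for it.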
+ // Keys are question phrases in base form; values are DBpedia category names. + pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters"); + pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World"); + pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus"); + pattern2category.put("13_british_colony", "Thirteen_Colonies"); + } + + public void process(QueryLogger qlog) + { + fixCategory(qlog); + oneTriple(qlog); + oneNode(qlog); + + //aggregation + AggregationRecognition ar = new AggregationRecognition(); + ar.recognize(qlog); + + //query type + decideQueryType(qlog); + } + + public void decideQueryType(QueryLogger qlog) + { + for(Sparql spq: qlog.rankedSparqls) + if(qlog.s.sentenceType == SentenceType.GeneralQuestion) + spq.queryType = QueryType.Ask; + } + + public void fixCategory(QueryLogger qlog) + { + if(qlog == null || qlog.semanticUnitList == null) + return; + + String var = null, category = null; + for(SemanticUnit su: qlog.semanticUnitList) + { + if(su.centerWord.mayCategory) + { + var = "?"+su.centerWord.originalForm; + category = su.centerWord.category; + } + } + + if(category != null && var != null) + for(Sparql spq: qlog.rankedSparqls) + { + boolean occurred = false; + for(Triple tri: spq.tripleList) + { + if(tri.subject.equals(var)) + { + occurred = true; + break; + } + } + String oName = category; + String pName = "subject"; + int pid = Globals.pd.predicate_2_id.get(pName); + Triple triple = new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100); + spq.addTriple(triple); + } + } + + /* Recognize a one-node query. + * Two cases: (1) special question / imperative sentence; (2) general question. + * 1-1: how many [], highest [] ... | For a single variable, add a constraint (aggregation). + * 1-2: What is backgammon? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...). + * 1-3: Give me all Seven Wonders of the Ancient World. | Note that "Seven Wonders of the Ancient World" should already be recognized as an ENT (in fact it is a CATEGORY in DBpedia). + * 2-1: Are there any [castles_in_the_United_States](yago:type)? + * 2-2: Was Sigmund Freud married? | Lacks a variable node. + * 2-3: Are penguins endangered? | No suitable relation matches; a transition is needed. + */ + public void oneNode(QueryLogger qlog) + { + if(qlog == null || qlog.semanticUnitList == null || qlog.semanticUnitList.size()>1) + return; + + Word target = qlog.target; + Word[] words = qlog.s.words; + if(qlog.s.sentenceType != SentenceType.GeneralQuestion) + { + //1-1: how many [type] are there | List all [type] + if(target.mayType && target.tmList != null) + { + String subName = "?"+target.originalForm; + String typeName = target.tmList.get(0).typeName; + Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100); + Sparql sparql = new Sparql(); + sparql.addTriple(triple); + qlog.rankedSparqls.add(sparql); + } + //1-2: What is [ent]? + else if(target.mayEnt && target.emList != null) + { + if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be")) + { + int eid = target.emList.get(0).entityID; + String subName = target.emList.get(0).entityName; + Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+target.originalForm, null, target.emList.get(0).score); + Sparql sparql = new Sparql(); + sparql.addTriple(triple); + qlog.rankedSparqls.add(sparql); + } + } + //1-3: Give me all Seven Wonders of the Ancient World.
+ else if(target.mayCategory && target.category != null) + { + String oName = target.category; + String pName = "subject"; + int pid = Globals.pd.predicate_2_id.get(pName); + Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100); + Sparql sparql = new Sparql(); + sparql.addTriple(triple); + qlog.rankedSparqls.add(sparql); + } + } + else + { + if(target.mayEnt && target.emList != null) + { + //2-2:Was Sigmund Freud married? + String relMention = ""; + for(Word word: words) + if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?")) + relMention += word.baseForm+" "; + if(relMention.length() > 1) + relMention = relMention.substring(0, relMention.length()-1); + + ArrayList pmList = null; + if(Globals.pd.nlPattern_2_predicateList.containsKey(relMention)) + pmList = Globals.pd.nlPattern_2_predicateList.get(relMention); + + if(pmList != null && pmList.size() > 0) + { + int pid = pmList.get(0).predicateID; + int eid = target.emList.get(0).entityID; + String subName = target.emList.get(0).entityName; + Triple triple = new Triple(eid, subName, pid, Triple.VAR_ROLE_ID, "?x", null, 100); + Sparql sparql = new Sparql(); + sparql.addTriple(triple); + qlog.rankedSparqls.add(sparql); + } + + //2-3:Are penguins endangered? + else + { + if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm)) + { + String oName = pattern2category.get(words[target.position].baseForm); + String pName = "subject"; + int pid = Globals.pd.predicate_2_id.get(pName); + int eid = target.emList.get(0).entityID; + String subName = target.emList.get(0).entityName; + Triple triple = new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100); + Sparql sparql = new Sparql(); + sparql.addTriple(triple); + qlog.rankedSparqls.add(sparql); + } + } + } + //2-1: Are there any [castles_in_the_United_States](yago:type) + else if(target.mayType && target.tmList != null) + { + String typeName = target.tmList.get(0).typeName; + String subName = "?" + target.originalForm; + //System.out.println("typeName="+typeName+" subName="+subName); + Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100); + Sparql sparql = new Sparql(); + sparql.addTriple(triple); + qlog.rankedSparqls.add(sparql); + } + } + } + + /* + * One triple recognized but no suitable relation. 
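+ * Example: a two-unit "What is [ent]?" question falls back to the triple <[ent], typePredicate, ?whWord>, mirroring case 1-2 of oneNode() above.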
+ * */ + public void oneTriple (QueryLogger qlog) + { + if(qlog == null || qlog.semanticUnitList == null) + return; + + if(qlog.s.sentenceType == SentenceType.SpecialQuestion) + { + Word[] words = qlog.s.words; + if(qlog.semanticUnitList.size() == 2) + { + Word entWord = null, whWord = null; + for(int i=0;i<2;i++) + { + Word curWord = qlog.semanticUnitList.get(i).centerWord; + if(curWord.mayEnt && curWord.emList != null) + entWord = curWord; + else + whWord = curWord; + } + if(entWord != null && whWord != null && words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be")) + { + int eid = entWord.emList.get(0).entityID; + String subName = entWord.emList.get(0).entityName; + Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+whWord.originalForm, null, entWord.emList.get(0).score); + Sparql sparql = new Sparql(); + sparql.addTriple(triple); + qlog.rankedSparqls.add(sparql); + } + } + } + } +} + diff --git a/src/addition/AggregationRecognition.java b/src/addition/AggregationRecognition.java new file mode 100644 index 0000000..bb3d92e --- /dev/null +++ b/src/addition/AggregationRecognition.java @@ -0,0 +1,155 @@ +package addition; + +import nlp.ds.DependencyTree; +import nlp.ds.DependencyTreeNode; +import nlp.ds.Word; +import qa.Globals; +import rdf.SemanticRelation; +import rdf.Sparql; +import rdf.Triple; +import log.QueryLogger; + +public class AggregationRecognition { + + // Number words + static String x[]={"zero","one","two","three","four","five","six","seven","eight","nine"}; + static String y[]={"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen"}; + static String z[]={"twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"}; + static int b; + + public static Integer translateNumbers(String str) // 1~100 + { + int flag; + try { + b=Integer.valueOf(str); + flag=1; + } + catch (Exception e){ + flag=2; + } + int i,j; + switch(flag) + { + case 1: + return b; + case 2: // a word form that needs translating into a number + boolean flag1=true; + for(i=0;i<8;i++) // 20~99 + { + for(j=0;j<10;j++) + { + String str1=z[i],str2=x[j]; + if(str.equals((str1))){ + return i*10+20; // exact tens word, e.g. "forty" + } + + else if(str.equals((str1+" "+str2))){ + return i*10+j+20; + } + } + } + + for(i=0;i<10;i++){ + if(str.equals(x[i])){ + return i; + } + else if(str.equals(y[i])){ + return 10+i; + } + } + + System.out.println("Warning: cannot translate number: " + str); + } + return 1; + } + + + public void recognize(QueryLogger qlog) + { + DependencyTree ds = qlog.s.dependencyTreeStanford; + if(qlog.isMaltParserUsed) + ds = qlog.s.dependencyTreeMalt; + + Word[] words = qlog.s.words; + + // how often | how many + if(qlog.s.plainText.indexOf("How many")!=-1||qlog.s.plainText.indexOf("How often")!=-1||qlog.s.plainText.indexOf("how many")!=-1||qlog.s.plainText.indexOf("how often")!=-1) + { + for(Sparql sp: qlog.rankedSparqls) + { + sp.countTarget = true; + // How many pages does War and Peace have? --> res:War_and_Peace dbo:numberOfPages ?n . + // ?uri dbo:populationTotal ?inhabitants .
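+ // If the matched predicate itself already denotes a quantity (e.g., dbo:numberOfPages, dbo:populationTotal), the variable binds to that literal value, so the COUNT aggregation is switched off below.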
+ for(Triple triple: sp.tripleList) + { + String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase(); + if(p.contains("number") || p.contains("total") || p.contains("calories") || p.contains("satellites")) + { + sp.countTarget = false; + } + } + } + } + + // more than [num] [node] + for(DependencyTreeNode dtn: ds.nodesList) + { + if(dtn.word.baseForm.equals("more")) + { + if(dtn.father!=null && dtn.father.word.baseForm.equals("than")) + { + DependencyTreeNode tmp = dtn.father; + if(tmp.father!=null && tmp.father.word.posTag.equals("CD") && tmp.father.father!=null && tmp.father.father.word.posTag.startsWith("N")) + { + DependencyTreeNode target = tmp.father.father; + + // Which caves have more than 3 entrances | entranceCount | filter + for(Sparql sp: qlog.rankedSparqls) + { + if(target.father !=null && target.father.word.baseForm.equals("have")) + { + sp.moreThanStr = "GROUP BY ?" + qlog.target.originalForm + "\nHAVING (COUNT(?"+target.word.originalForm + ") > "+tmp.father.word.baseForm+")"; + } + else + { + int num = translateNumbers(tmp.father.word.baseForm); + sp.moreThanStr = "FILTER (?"+target.word.originalForm+"> " + num + ")"; + } + } + } + } + } + } + + // most + for(Word word: words) + { + if(word.baseForm.equals("most")) + { + Word modifiedWord = word.modifiedWord; + if(modifiedWord != null) + { + for(Sparql sp: qlog.rankedSparqls) + { + // Which Indian company has the most employees? --> ... dbo:numberOfEmployees ?n . || ?employees dbo:company ... + sp.mostStr = "ORDER BY DESC(COUNT(?"+modifiedWord.originalForm+"))\nOFFSET 0 LIMIT 1"; + for(Triple triple: sp.tripleList) + { + String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase(); + if(p.contains("number") || p.contains("total")) + { + sp.mostStr = "ORDER BY DESC(?"+modifiedWord.originalForm+")\nOFFSET 0 LIMIT 1"; + } + } + } + } + } + } + } + + public static void main(String[] args) { + System.out.println(translateNumbers("Twelve")); + System.out.println(translateNumbers("thirty two")); + } + +} diff --git a/src/fgmt/EntityFragment.java b/src/fgmt/EntityFragment.java new file mode 100644 index 0000000..3175c1e --- /dev/null +++ b/src/fgmt/EntityFragment.java @@ -0,0 +1,312 @@ +package fgmt; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; + +import rdf.EntityMapping; +import lcn.EntityFragmentFields; +import lcn.EntityNameAndScore; +import lcn.SearchInEntityFragments; + +public class EntityFragment extends Fragment { + + public int eId; + public HashSet inEdges = new HashSet(); + public HashSet outEdges = new HashSet(); + public HashSet types = new HashSet(); + + // in/out entity and the connected edges. Eg, , then outEntMap of eId contains > + public HashMap> inEntMap = new HashMap>(); // notice the input file should no redundant triple. + public HashMap> outEntMap = new HashMap>(); + + static double thres1 = 0.4; + static double thres2 = 0.8; + static int thres3 = 3; + static int k = 50; + + /** + * mention to entity using Lucene index. + * + * rule: + * select top-k results of each phrase. + * (1)if current lowest score < thres1, drop those score < thres1. + * (2)if current lowest score > thres2, add those score > thres2. + * + * exact match: + * (1)Lucene score = 1. + * (2)String match (lowercase): edit distance <= thres3. + * + * score: + * use Lucene score directly. 
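+ * More precisely, getScore() below damps the Lucene score by edit distance: luceneScore*100 / (ln(editDistance*1.5+1)+1), so an exact match keeps luceneScore*100 and farther strings decay.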
+ * + * @param phrase + * @return + */ + public static HashMap getCandEntityNames2(String phrase) { + + HashMap ret = new HashMap(); + ArrayList list1 = getCandEntityNames_subject(phrase, thres1, thres2, k); + + if(list1 == null) + return ret; + + int iter_size = 0; + if (list1.size() <= k) { + iter_size = list1.size(); + } + else if (list1.size() > k) { + if (list1.get(k-1).score >= thres2) { + iter_size = list1.size(); + } + else { + iter_size = k; + } + } + for(int i = 0; i < iter_size; i ++) { + if (i < k) { + ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score)); + } + else if (list1.get(i).score >= thres2) { + ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score)); + } + else { + break; + } + } + + return ret; + } + + public static ArrayList getEntityMappingList (String n) + { + HashMap map = getCandEntityNames2(n); + ArrayList ret = new ArrayList(); + for (int eid : map.keySet()) + { + String s = EntityFragmentFields.entityId2Name.get(eid); + ret.add(new EntityMapping(eid, s, map.get(eid))); + } + Collections.sort(ret); + return ret; + } + + public static double getScore (String s1, String s2, double luceneScore) { + double ret = luceneScore*100.0/(Math.log(calEditDistance(s1, s2)*1.5+1)+1); + return ret; + } + + /** + * Edit distance (all lowercase) + * @param s1 + * @param s2 + * @return + */ + public static int calEditDistance (String s1, String s2) { + s1 = s1.toLowerCase(); + s2 = s2.toLowerCase(); + + int d[][]; + int n = s1.length(); + int m = s2.length(); + int i, j, temp; + char ch1, ch2; + + if(n == 0) { + return m; + } + if(m == 0) { + return n; + } + + d = new int[n+1][m+1]; + for(i=0; i<=n; i++) { + d[i][0] = i; + } + for(j=0; j<=m; j++) { + d[0][j] = j; + } + + for(i=1; i<=n; i++) { + ch1 = s1.charAt(i-1); + for(j=1; j<=m; j++) { + ch2 = s2.charAt(j-1); + if(ch1 == ch2) { + temp = 0; + } else { + temp = 1; + } + d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp); + } + } + + return d[n][m]; + } + + private static int min(int a, int b, int c) { + int ab = a getCandEntityNames_subject(String phrase, double thres1, double thres2, int k) { + SearchInEntityFragments sf = new SearchInEntityFragments(); + //System.out.println("EntityFragment.getCandEntityNames_subject() ..."); + + ArrayList ret_sf = null; + try { + ret_sf = sf.searchName(phrase, thres1, thres2, k); + } catch (IOException e) { + //e.printStackTrace(); + System.err.println("Reading lcn index error"); + } + + return ret_sf; + } + + public static EntityFragment getEntityFragmentByEntityId(Integer entityId) + { + if(!EntityFragmentFields.entityFragmentString.containsKey(entityId)) + return null; + String fgmt = EntityFragmentFields.entityFragmentString.get(entityId); + EntityFragment ef = new EntityFragment(entityId, fgmt); + return ef; + } + + public static String getEntityFgmtStringByName(String entityName) + { + int id = EntityFragmentFields.entityName2Id.get(entityName); + String fgmt = EntityFragmentFields.entityFragmentString.get(id); + return fgmt; + } + + public EntityFragment(int eid, String fgmt) + { + eId = eid; + fragmentType = typeEnum.ENTITY_FRAGMENT; + + //eg: 11 |3961112:2881;410;,4641020:2330;, + fgmt = fgmt.replace('|', '#'); + String[] fields = fgmt.split("#"); + + if(fields.length > 0 && fields[0].length() > 0) + { + String[] entEdgesArr = fields[0].split(","); + for(int i = 0; i < entEdgesArr.length; i ++) + { + String[] nums = entEdgesArr[i].split(":"); + if(nums.length != 2) + continue; + int intEntId = 
Integer.valueOf(nums[0]); + String[] intEdges = nums[1].split(";"); + ArrayList intEdgeList = new ArrayList(); + for(String outEdge: intEdges) + { + intEdgeList.add(Integer.valueOf(outEdge)); + } + if(intEdgeList.size()>0) + inEntMap.put(intEntId, intEdgeList); + } + } + + if(fields.length > 1 && fields[1].length() > 0) + { + String[] entEdgesArr = fields[1].split(","); + for(int i = 0; i < entEdgesArr.length; i ++) + { + String[] nums = entEdgesArr[i].split(":"); + if(nums.length != 2) + continue; + int outEntId = Integer.valueOf(nums[0]); + String[] outEdges = nums[1].split(";"); + ArrayList outEdgeList = new ArrayList(); + for(String outEdge: outEdges) + { + outEdgeList.add(Integer.valueOf(outEdge)); + } + if(outEdgeList.size()>0) + outEntMap.put(outEntId, outEdgeList); + } + } + + if(fields.length > 2 && fields[2].length() > 0) { + String[] nums = fields[2].split(","); + for(int i = 0; i < nums.length; i ++) { + if (nums[i].length() > 0) { + inEdges.add(Integer.parseInt(nums[i])); + } + } + } + if(fields.length > 3 && fields[3].length() > 0) { + String[] nums = fields[3].split(","); + for(int i = 0; i < nums.length; i ++) { + if (nums[i].length() > 0) { + outEdges.add(Integer.parseInt(nums[i])); + } + } + } + if(fields.length > 4 && fields[4].length() > 0) { + String[] nums = fields[4].split(","); + for(int i = 0; i < nums.length; i ++) { + if (nums[i].length() > 0) { + types.add(Integer.parseInt(nums[i])); + } + } + } + + //TODO: fix data for DBpedia 2014 (should be eliminated when update dataset) + if(eid==2640237) //Barack_Obama + { + inEdges.add(8432); //spouse + outEdges.add(8432); + ArrayList outEdgeList = new ArrayList(); + outEdgeList.add(8432); + inEntMap.put(4953443, outEdgeList); + outEntMap.put(4953443, outEdgeList); + } + } + + @Override + public String toString() + { + StringBuilder ret = new StringBuilder(""); + for(Integer inEnt: inEntMap.keySet()) + { + ArrayList inEdgeList = inEntMap.get(inEnt); + if(inEdgeList==null || inEdgeList.size()==0) + continue; + ret.append(inEnt+":"); + for(int inEdge: inEdgeList) + ret.append(inEdge+";"); + ret.append(","); + } + ret.append('|'); + for(Integer outEnt: outEntMap.keySet()) + { + ArrayList outEdgeList = outEntMap.get(outEnt); + if(outEdgeList==null || outEdgeList.size()==0) + continue; + ret.append(outEnt+":"); + for(int outEdge: outEdgeList) + ret.append(outEdge+";"); + ret.append(","); + } + ret.append('|'); + for(Integer p : inEdges) { + ret.append(p); + ret.append(','); + } + ret.append('|'); + for(Integer p : outEdges) { + ret.append(p); + ret.append(','); + } + ret.append('|'); + for(Integer t : types) { + ret.append(t); + ret.append(','); + } + return ret.toString(); + } +} diff --git a/src/fgmt/Fragment.java b/src/fgmt/Fragment.java new file mode 100644 index 0000000..45b3545 --- /dev/null +++ b/src/fgmt/Fragment.java @@ -0,0 +1,8 @@ +package fgmt; + +public abstract class Fragment { + public enum typeEnum {ENTITY_FRAGMENT, RELATION_FRAGMENT, TYPE_FRAGMENT, VAR_FRAGMENT}; + + public typeEnum fragmentType; + public int fragmentId; +}; diff --git a/src/fgmt/RelationFragment.java b/src/fgmt/RelationFragment.java new file mode 100644 index 0000000..05332a4 --- /dev/null +++ b/src/fgmt/RelationFragment.java @@ -0,0 +1,105 @@ +package fgmt; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; + +import qa.Globals; +import utils.FileUtil; + +public class RelationFragment extends Fragment +{ + public static HashMap> relFragments = null; + 
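// relFragments: predicate id -> fragments holding that predicate's admissible in/out type sets (filled by load()); relationShortName2IdList: predicate short name -> ids of all predicates sharing that name (filled by loadId()). +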
public static HashMap> relationShortName2IdList = null; + public static HashSet literalRelationSet = null; + + public HashSet inTypes = new HashSet(); + public HashSet outTypes = new HashSet(); + + public static final int literalTypeId = -176; + + public RelationFragment(String inFgmt, String outFgmt, int fid) + { + fragmentId = fid; + fragmentType = typeEnum.RELATION_FRAGMENT; + String[] nums; + + // in + nums = inFgmt.split(","); + for(String s: nums) + if(s.length() > 0) + inTypes.add(Integer.parseInt(s)); + + // out + if(outFgmt.equals("itera")) + outTypes.add(literalTypeId); + else + { + nums = outFgmt.split(","); + for(String s: nums) + if(s.length() > 0) + outTypes.add(Integer.parseInt(s)); + } + } + + public static void load() throws Exception + { + String filename = Globals.localPath + "data/DBpedia2016/fragments/predicate_RDF_fragment/predicate_fragment.txt"; + List inputs = FileUtil.readFile(filename); + relFragments = new HashMap>(); + literalRelationSet = new HashSet(); + + for(String line: inputs) + { + String[] lines = line.split("\t"); + String inString = lines[0].substring(1, lines[0].length()-1); + int pid = Integer.parseInt(lines[1]); + String outString = lines[2].substring(1, lines[2].length()-1); + + // Record which relations can connect LITERAL objects. + if(outString.equals("itera")) // "literal".substring(1, length()-1) + literalRelationSet.add(pid); + + if(!relFragments.containsKey(pid)) + relFragments.put(pid, new ArrayList()); + relFragments.get(pid).add(new RelationFragment(inString, outString, pid)); + } + + loadId(); + } + + public static void loadId() throws IOException + { + String filename = Globals.localPath + "data/DBpedia2016/fragments/id_mappings/16predicate_id.txt"; + List inputs = FileUtil.readFile(filename); + relationShortName2IdList = new HashMap>(); + + for(String line: inputs) + { + String[] lines = line.split("\t"); + String rlnShortName = lines[0]; + + if (!relationShortName2IdList.containsKey(rlnShortName)) + relationShortName2IdList.put(rlnShortName, new ArrayList()); + relationShortName2IdList.get(rlnShortName).add(Integer.parseInt(lines[1])); + } + } + + public static boolean isLiteral (String p) + { + for (Integer i : relationShortName2IdList.get(p)) + if (literalRelationSet.contains(i)) + return true; + return false; + } + + public static boolean isLiteral (int pid) + { + if (literalRelationSet.contains(pid)) + return true; + else + return false; + } +} diff --git a/src/fgmt/TypeFragment.java b/src/fgmt/TypeFragment.java new file mode 100644 index 0000000..2c5bc10 --- /dev/null +++ b/src/fgmt/TypeFragment.java @@ -0,0 +1,179 @@ +package fgmt; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; + +import qa.Globals; + + +public class TypeFragment extends Fragment { + + public static HashMap typeFragments = null; + public static HashMap> typeShortName2IdList = null; + public static HashMap typeId2ShortName = null; + public static final int NO_RELATION = -24232; + + public static HashSet yagoTypeList = null; + + public HashSet inEdges = new HashSet(); + public HashSet outEdges = new HashSet(); + public HashSet entSet = new HashSet(); + + /* + * Eliminate some bad YAGO Types which conflict with: + * 1, ENT: amazon、earth、the_hunger_game、sparkling_wine + * 2, TYPE: type + * 3, RELATION: flow、owner、series、shot、part、care + * 4, others: peace、vice + */ + public 
static ArrayList stopYagoTypeList = null; + static void loadStopYagoTypeList() + { + stopYagoTypeList = new ArrayList(); + stopYagoTypeList.add("Amazon"); + stopYagoTypeList.add("Earth"); + stopYagoTypeList.add("TheHungerGames"); + stopYagoTypeList.add("SparklingWine"); + stopYagoTypeList.add("Type"); + stopYagoTypeList.add("Flow"); + stopYagoTypeList.add("Owner"); + stopYagoTypeList.add("Series"); + stopYagoTypeList.add("Shot"); + stopYagoTypeList.add("Part"); + stopYagoTypeList.add("Care"); + stopYagoTypeList.add("Peace"); + stopYagoTypeList.add("Vice"); + stopYagoTypeList.add("Dodo"); + stopYagoTypeList.add("CzechFilms"); + stopYagoTypeList.add("ChineseFilms"); + } + + public TypeFragment(String fgmt, int fid) + { + fragmentId = fid; + fragmentType = typeEnum.TYPE_FRAGMENT; + + fgmt = fgmt.replace('|', '#'); + String[] ss = fgmt.split("#"); + String[] nums; + + if (ss[0].length() > 0) { + nums = ss[0].split(","); + for(int i = 0; i < nums.length; i ++) { + if (nums[i].length() > 0) { + inEdges.add(Integer.parseInt(nums[i])); + } + } + } + else { + inEdges.add(NO_RELATION); + } + + if (ss.length > 1 && ss[1].length() > 0) { + nums = ss[1].split(","); + for(int i = 0; i < nums.length; i ++) { + if (nums[i].length() > 0) { + outEdges.add(Integer.parseInt(nums[i])); + } + } + } + else { + outEdges.add(NO_RELATION); + } + + if(ss.length > 2 && ss[2].length() > 0) + { + nums = ss[2].split(","); + for(int i = 0; i < nums.length; i ++) { + if (nums[i].length() > 0) { + entSet.add(Integer.parseInt(nums[i])); + } + } + } + } + + public static void load() throws Exception + { + String filename = Globals.localPath+"data/DBpedia2016/fragments/class_RDF_fragment/16type_fragment.txt"; + + File file = new File(filename); + InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); + BufferedReader br = new BufferedReader(in); + + typeFragments = new HashMap(); + + System.out.println("Loading type IDs and Fragments ..."); + String line; + while((line = br.readLine()) != null) { + String[] lines = line.split("\t"); + TypeFragment tfgmt = null; + if(lines[0].length() > 0 && !lines[0].equals("literal")) { + int tid = Integer.parseInt(lines[0]); + try{tfgmt = new TypeFragment(lines[1], tid);} + catch(Exception e){} + + + typeFragments.put(tid, tfgmt); + } + } + + br.close(); + + // can fix some data there + // load Type Id + loadId(); + System.out.println("Load "+typeId2ShortName.size()+" basic types and "+yagoTypeList.size()+" yago types."); + } + + public static void loadId() throws IOException + { + String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16basic_types_id.txt"; + String yagoFileName = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16yago_types_list.txt"; + + File file = new File(filename); + InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); + BufferedReader br = new BufferedReader(in); + + typeShortName2IdList = new HashMap>(); + typeId2ShortName = new HashMap(); + + String line; + while((line = br.readLine()) != null) { + String[] lines = line.split("\t"); + String typeShortName = lines[0]; + // reserve typeShortName's capitalization + if (!typeShortName2IdList.containsKey(typeShortName)) { + typeShortName2IdList.put(typeShortName, new ArrayList()); + } + typeShortName2IdList.get(typeShortName).add(Integer.parseInt(lines[1])); + typeId2ShortName.put(Integer.parseInt(lines[1]), typeShortName); + } + + // literalType + typeShortName2IdList.put("literal_HRZ", new ArrayList()); + 
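// "literal_HRZ" is a reserved pseudo-type: literal objects carry RelationFragment.literalTypeId so that relation out-types can reference them like ordinary types. +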
typeShortName2IdList.get("literal_HRZ").add(RelationFragment.literalTypeId); + typeId2ShortName.put(RelationFragment.literalTypeId, "literal_HRZ"); + + br.close(); + + //load YAGO types + in = new InputStreamReader(new FileInputStream(yagoFileName),"utf-8"); + br = new BufferedReader(in); + yagoTypeList = new HashSet(); + while((line = br.readLine())!=null) + { + String[] lines = line.split("\t"); + String typeName = lines[0]; + yagoTypeList.add(typeName); + } + + loadStopYagoTypeList(); + yagoTypeList.removeAll(stopYagoTypeList); + } +} diff --git a/src/fgmt/VariableFragment.java b/src/fgmt/VariableFragment.java new file mode 100644 index 0000000..f3ff129 --- /dev/null +++ b/src/fgmt/VariableFragment.java @@ -0,0 +1,56 @@ +package fgmt; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; + +public class VariableFragment extends Fragment { + public static final int magic_number = -265; + + public ArrayList> candTypes = null; + public HashSet candEntities = null; + public boolean mayLiteral = false; + + public VariableFragment() + { + fragmentType = typeEnum.VAR_FRAGMENT; + candTypes = new ArrayList>(); + candEntities = new HashSet(); + } + + @Override + public String toString() + { + return "("+ candEntities.size() +")"; + } + + public boolean containsAll(HashSet s1) { + Iterator> it = candTypes.iterator(); + while(it.hasNext()) { + HashSet s2 = it.next(); + if (s2.contains(magic_number)) { + if (!Collections.disjoint(s1, s2)) { + return true; + } + } + else { + if (s1.containsAll(s2) && s2.containsAll(s1)) { + return true; + } + } + } + return false; + } + + public boolean contains(Integer i) { + Iterator> it = candTypes.iterator(); + while(it.hasNext()) { + HashSet s = it.next(); + if (s.contains(i)) { + return true; + } + } + return false; + } +} diff --git a/src/jgsc/GstoreConnector.java b/src/jgsc/GstoreConnector.java new file mode 100644 index 0000000..960fd07 --- /dev/null +++ b/src/jgsc/GstoreConnector.java @@ -0,0 +1,489 @@ +package jgsc; + +import java.io.*; +import java.net.*; +import java.lang.*; +import java.net.URLEncoder; +import java.net.URLDecoder; +import java.io.UnsupportedEncodingException; +import java.util.List; +import java.util.Map; + +public class GstoreConnector { + + public static final String defaultServerIP = "127.0.0.1"; + public static final int defaultServerPort = 9000; + + private String serverIP; + private int serverPort; + //private Socket socket = null; + + public GstoreConnector() { + this.serverIP = GstoreConnector.defaultServerIP; + this.serverPort = GstoreConnector.defaultServerPort; + } + + public GstoreConnector(int _port) { + this.serverIP = GstoreConnector.defaultServerIP; + this.serverPort = _port; + } + + public GstoreConnector(String _ip, int _port) { + this.serverIP = _ip; + this.serverPort = _port; + } + + //PERFORMANCE: what if the query result is too large? 
receive and save to file directly, at once + //In addition, set -Xmx larger (maybe on the scale of GBs) if the query result could be very large; + //this may help to reduce the GC cost + public String sendGet(String param) { + String url = "http://" + this.serverIP + ":" + this.serverPort; + StringBuffer result = new StringBuffer(); + BufferedReader in = null; + System.out.println("parameter: "+param); + + try { + param = URLEncoder.encode(param, "UTF-8"); + } + catch (UnsupportedEncodingException ex) { + throw new RuntimeException("Broken VM does not support UTF-8"); + } + + try { + String urlNameString = url + "/" + param; + System.out.println("request: "+urlNameString); + URL realUrl = new URL(urlNameString); + // open the connection to the URL + URLConnection connection = realUrl.openConnection(); + // set the common request properties + connection.setRequestProperty("accept", "*/*"); + connection.setRequestProperty("connection", "Keep-Alive"); + //set agent to avoid: speed limited by server if the server thinks the client is not a browser + connection.setRequestProperty("user-agent", + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); + // establish the actual connection + connection.connect(); + + long t0 = System.currentTimeMillis(); //ms + + // get all response header fields + Map<String, List<String>> map = connection.getHeaderFields(); + // iterate over the response header fields + //for (String key : map.keySet()) { + // System.out.println(key + "--->" + map.get(key)); + //} + + long t1 = System.currentTimeMillis(); //ms + //System.out.println("Time to get header: "+(t1 - t0)+" ms"); + //System.out.println("============================================"); + + // read the URL's response through a BufferedReader + in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8")); + String line; + while ((line = in.readLine()) != null) { + //PERFORMANCE: this can be very costly if the result is very large, because many temporary Strings are produced; + //in that case, just printing the line directly will be much faster + result.append(line+"\n"); + //System.out.println("get data size: " + line.length()); + //System.out.println(line); + } + + long t2 = System.currentTimeMillis(); //ms + //System.out.println("Time to get data: "+(t2 - t1)+" ms"); + } catch (Exception e) { + System.out.println("error in get request: " + e); + e.printStackTrace(); + } + // use finally to close the input stream + finally { + try { + if (in != null) { + in.close(); + } + } catch (Exception e2) { + e2.printStackTrace(); + } + } + return result.toString(); + }
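+ + // Usage sketch (illustrative only; database name and credentials mirror main() below): every operation is one HTTP GET round-trip: + // GstoreConnector gc = new GstoreConnector("127.0.0.1", 9000); + // gc.load("dbpedia16", "root", "123456"); + // String res = gc.sendGet("?operation=query&username=root&password=123456&db_name=dbpedia16&format=txt&sparql=" + sparql); + // load()/unload()/build()/query() below just compose such parameter strings and call sendGet().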
connection.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); + // 瀵よ櫣鐝涚�圭偤妾惃鍕箾閹猴拷 + connection.connect(); + + long t0 = System.currentTimeMillis(); //ms + + // 閼惧嘲褰囬幍锟介張澶婃惙鎼存柨銇旂�涙顔� + Map> map = connection.getHeaderFields(); + // 闁秴宸婚幍锟介張澶屾畱閸濆秴绨叉径鏉戠摟濞堬拷 + //for (String key : map.keySet()) { + // System.out.println(key + "--->" + map.get(key)); + //} + + long t1 = System.currentTimeMillis(); // ms + //System.out.println("Time to get header: "+(t1 - t0)+" ms"); + + // 鐎规矮绠� BufferedReader鏉堟挸鍙嗗ù浣规降鐠囪褰嘦RL閻ㄥ嫬鎼锋惔锟� + in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8")); + char chars[] = new char[2048]; + int b; + while ((b = in.read(chars, 0, 2048)) != -1) { + if (fw != null) + fw.write(chars); + chars = new char[2048]; + } + + long t2 = System.currentTimeMillis(); //ms + //System.out.println("Time to get data: "+(t2 - t1)+" ms"); + } catch (Exception e) { + //System.out.println("error in get request: " + e); + e.printStackTrace(); + } + // 娴h法鏁inally閸ф娼甸崗鎶芥4鏉堟挸鍙嗗ù锟� + finally { + try { + if (in != null) { + in.close(); + } + if (fw != null) { + fw.close(); + } + } catch (Exception e2) { + e2.printStackTrace(); + } + } + return; + } + + +//NOTICE: no need to connect now, HTTP connection is kept by default + public boolean load(String _db_name, String _username, String _password) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.load"); + return false; + } + + String cmd = "?operation=load&db_name=" + _db_name + "&username=" + _username + "&password=" + _password; + String msg = this.sendGet(cmd); + //if (!send_return) { + //System.err.println("send load command error. @GstoreConnector.load"); + //return false; + //} + + this.disconnect(); + + System.out.println(msg); + if (msg.equals("load database done.")) { + return true; + } + + return false; + } + + public boolean unload(String _db_name,String _username, String _password) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.unload"); + return false; + } + + String cmd = "?operation=unload&db_name=" + _db_name + "&username=" + _username + "&password=" + _password; + String msg = this.sendGet(cmd); + + this.disconnect(); + + System.out.println(msg); + if (msg.equals("unload database done.")) { + return true; + } + + return false; + } + + public boolean build(String _db_name, String _rdf_file_path, String _username, String _password) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.build"); + return false; + } + + //TODO: also use encode to support spaces? + //Consider change format into ?name=DBname + String cmd = "?operation=build&db_name=" + _db_name + "&ds_path=" + _rdf_file_path + "&username=" + _username + "&password=" + _password;; + String msg = this.sendGet(cmd); + + this.disconnect(); + + System.out.println(msg); + if (msg.equals("import RDF file to database done.")) { + return true; + } + + return false; + } + + //TODO: not implemented + public boolean drop(String _db_name) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. 
@GstoreConnector.drop"); + return false; + } + + String cmd = "drop/" + _db_name; + String msg = this.sendGet(cmd); + + this.disconnect(); + + System.out.println(msg); + return msg.equals("drop database done."); + } + + public String query(String _username, String _password, String _db_name, String _sparql) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.query"); + return "connect to server error."; + } + + //URL encode should be used here + //try { + //_sparql = URLEncoder.encode("\""+_sparql+"\"", "UTF-8"); + //} + //catch (UnsupportedEncodingException ex) { + //throw new RuntimeException("Broken VM does not support UTF-8"); + //} + + String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=txt&sparql=" + _sparql; + //String cmd = "query/\"" + _sparql + "\""; + String msg = this.sendGet(cmd); + + this.disconnect(); + + return msg; + } + + public void query(String _username, String _password, String _db_name, String _sparql, String _filename) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.query"); + } + + String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=json&sparql=" + _sparql; + this.sendGet(cmd, _filename); + + this.disconnect(); + + return; + } + + + // public String show() { + // return this.show(false); + // } + + //show all databases + public String show() { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.show"); + return "connect to server error."; + } + + String cmd = "?operation=show"; + String msg = this.sendGet(cmd); + + this.disconnect(); + return msg; + } + public String user(String type, String username1, String password1, String username2, String addtion) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.show"); + return "connect to server error."; + } + + String cmd = "?operation=user&type=" + type + "&username1=" + username1 + "&password1=" + password1 + "&username2=" + username2 + "&addtion=" + addtion; + String msg = this.sendGet(cmd); + + this.disconnect(); + return msg; + } + public String showUser() { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.show"); + return "connect to server error."; + } + + String cmd = "?operation=showUser"; + String msg = this.sendGet(cmd); + + this.disconnect(); + return msg; + } + public String monitor(String db_name) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.show"); + return "connect to server error."; + } + + String cmd = "?operation=monitor&db_name=" + db_name; + String msg = this.sendGet(cmd); + + this.disconnect(); + return msg; + } + public String checkpoint(String db_name) { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. 
@GstoreConnector.show"); + return "connect to server error."; + } + + String cmd = "?operation=checkpoint&db_name=" + db_name; + String msg = this.sendGet(cmd); + + this.disconnect(); + return msg; + } + public String test_download(String filepath) + { + boolean connect_return = this.connect(); + if (!connect_return) { + System.err.println("connect to server error. @GstoreConnector.query"); + return "connect to server error."; + } + + //TEST: a small file, a large file + String cmd = "?operation=delete&download=true&filepath=" + filepath; + String msg = this.sendGet(cmd); + + this.disconnect(); + + return msg; + } + + private boolean connect() { + return true; + } + + private boolean disconnect() { + return true; + } + + private static byte[] packageMsgData(String _msg) { + //byte[] data_context = _msg.getBytes(); + byte[] data_context = null; + try { + data_context = _msg.getBytes("utf-8"); + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.err.println("utf-8 charset is unsupported."); + data_context = _msg.getBytes(); + } + int context_len = data_context.length + 1; // 1 byte for '\0' at the end of the context. + int data_len = context_len + 4; // 4 byte for one int(data_len at the data's head). + byte[] data = new byte[data_len]; + + // padding head(context_len). + byte[] head = GstoreConnector.intToByte4(context_len); + for (int i = 0; i < 4; i++) { + data[i] = head[i]; + } + + // padding context. + for (int i = 0; i < data_context.length; i++) { + data[i + 4] = data_context[i]; + } + // in C, there should be '\0' as the terminator at the end of a char array. so we need add '\0' at the end of sending message. + data[data_len - 1] = 0; + + return data; + } + + private static byte[] intToByte4(int _x) // with Little Endian format. + { + byte[] ret = new byte[4]; + ret[0] = (byte) (_x); + ret[1] = (byte) (_x >>> 8); + ret[2] = (byte) (_x >>> 16); + ret[3] = (byte) (_x >>> 24); + + return ret; + } + + private static int byte4ToInt(byte[] _b) // with Little Endian format. + { + int byte0 = _b[0] & 0xFF, byte1 = _b[1] & 0xFF, byte2 = _b[2] & 0xFF, byte3 = _b[3] & 0xFF; + int ret = (byte0) | (byte1 << 8) | (byte2 << 16) | (byte3 << 24); + + return ret; + } + + public static void main(String[] args) { + // initialize the GStore server's IP address and port. + GstoreConnector gc = new GstoreConnector("172.31.222.90", 9001); + + // build a new database by a RDF file. + // note that the relative path is related to gserver. + //gc.build("db_LUBM10", "example/rdf_triple/LUBM_10_GStore.n3"); + String sparql = "select ?x where {" + + " ?x" + + "}"; + + sparql = "select ?countries where { ?countries . ?caves . ?caves ?countries . } " + + "GROUP BY ?countries HAVING(COUNT(?caves) > 1000)"; + + sparql = "ASK where { .}"; + + sparql = "select DISTINCT ?film ?budget where { ?film . ?film . ?film ?budget . 
}"; + +// boolean flag = gc.load("dbpedia16", "root", "123456"); + //System.out.println(flag); + String answer = gc.query("root", "123456", "dbpedia16", sparql); + System.out.println(answer); + + //To count the time cost + //long startTime=System.nanoTime(); //ns + //long startTime=System.currentTimeMillis(); //ms + //doSomeThing(); //濞村鐦惃鍕敩閻焦顔� + //long endTime=System.currentTimeMillis(); //閼惧嘲褰囩紒鎾存将閺冨爼妫� + //System.out.println("缁嬪绨潻鎰攽閺冨爼妫块敍锟� "+(end-start)+"ms"); + } +} + diff --git a/src/lcn/BuildIndexForEntityFragments.java b/src/lcn/BuildIndexForEntityFragments.java new file mode 100644 index 0000000..e96e71e --- /dev/null +++ b/src/lcn/BuildIndexForEntityFragments.java @@ -0,0 +1,133 @@ +package lcn; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +//import java.io.IOException; +import java.io.InputStreamReader; +import java.util.Date; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; + +import qa.Globals; + +//import qa.Globals; + +/** + * Lucene建立索引的基本单元是document,同时其中的域filed可以根据需要自己添加 + * + * Document是一个记录,用来表示一个条目,相当于数据库中的一行记录,就是搜索建立的倒排索引的条目。 + * eg:你要搜索自己电脑上的文件,这个时候就可以创建field(字段,相关于数据库中的列。 然后用field组合成document,最后会变成若干文件。 + * 这个document和文件系统document不是一个概念。 + * + * StandardAnalyzer是lucene中内置的"标准分析器",可以做如下功能: + * 1、对原有句子按照空格进行了分词 + * 2、所有的大写字母都可以能转换为小写的字母 + * 3、可以去掉一些没有用处的单词,例如"is","the","are"等单词,也删除了所有的标点 + */ +public class BuildIndexForEntityFragments{ + public void indexforentity() throws Exception + { + if(EntityFragmentFields.entityId2Name == null) + EntityFragmentFields.load(); + + long startTime = new Date().getTime(); + + //Try update KB index to DBpedia2015. by husen 2016-04-08 + //Try update KB index to DBpedia2016. 
by husen 2018-8-22 + File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index"); + File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"); + + Analyzer luceneAnalyzer_en = new StandardAnalyzer(); + IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true); + + int mergeFactor = 100000; //default 10 + int maxBufferedDoc = 1000; //default 10 + int maxMergeDoc = Integer.MAX_VALUE; //INF + + //indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; + indexWriter_en.setMergeFactor(mergeFactor); + indexWriter_en.setMaxBufferedDocs(maxBufferedDoc); + indexWriter_en.setMaxMergeDocs(maxMergeDoc); + + + FileInputStream file = new FileInputStream(sourceDir_en); + InputStreamReader in = new InputStreamReader(file,"UTF-8"); + BufferedReader br = new BufferedReader(in); + + int count = 0; + while(true) + { + String _line = br.readLine(); + { + if(_line == null) break; + } + count++; + if(count % 100000 == 0) + System.out.println(count); + + String line = _line; + String temp[] = line.split("\t"); + + if(temp.length != 2) + continue; + else + { + int entity_id = Integer.parseInt(temp[0]); + if(!EntityFragmentFields.entityId2Name.containsKey(entity_id)) + continue; + + String entity_name = EntityFragmentFields.entityId2Name.get(entity_id); + String entity_fragment = temp[1]; + entity_name = entity_name.replace("____", " "); + entity_name = entity_name.replace("__", " "); + entity_name = entity_name.replace("_", " "); + + + Document document = new Document(); + + Field EntityName = new Field("EntityName", entity_name, Field.Store.YES, + Field.Index.TOKENIZED, + Field.TermVector.WITH_POSITIONS_OFFSETS); + Field EntityId = new Field("EntityId", String.valueOf(entity_id), + Field.Store.YES, Field.Index.NO); + Field EntityFragment = new Field("EntityFragment", entity_fragment, + Field.Store.YES, Field.Index.NO); + + document.add(EntityName); + document.add(EntityId); + document.add(EntityFragment); + indexWriter_en.addDocument(document); + } + } + + indexWriter_en.optimize(); + indexWriter_en.close(); + br.close(); + + // input the time of Build index + long endTime = new Date().getTime(); + System.out.println("entity_name index has build ->" + count + " " + "Time:" + (endTime - startTime)); + } + + public static void main(String[] args) + { + BuildIndexForEntityFragments bef = new BuildIndexForEntityFragments(); + + try + { + Globals.localPath="D:/husen/gAnswer/"; + bef.indexforentity(); + } + catch (Exception e) + { + e.printStackTrace(); + } + } +} + + diff --git a/src/lcn/BuildIndexForTypeShortName.java b/src/lcn/BuildIndexForTypeShortName.java new file mode 100644 index 0000000..78b55f7 --- /dev/null +++ b/src/lcn/BuildIndexForTypeShortName.java @@ -0,0 +1,107 @@ +package lcn; + +import java.io.File; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; + +import qa.Globals; +import fgmt.TypeFragment; + +public class BuildIndexForTypeShortName { + public static void buildIndex(HashMap> typeShortName2IdList) throws Exception + { + long startTime = new Date().getTime(); + File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index"); + + Analyzer luceneAnalyzer_li = new 
StandardAnalyzer(); + IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true); + + int mergeFactor = 100000; + int maxBufferedDoc = 1000; + int maxMergeDoc = Integer.MAX_VALUE; + + //indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; + indexWriter_li.setMergeFactor(mergeFactor); + indexWriter_li.setMaxBufferedDocs(maxBufferedDoc); + indexWriter_li.setMaxMergeDocs(maxMergeDoc); + + int count = 0; + Iterator it = typeShortName2IdList.keySet().iterator(); + while (it.hasNext()) + { + String sn = it.next(); + if (sn.length() == 0) { + continue; + } + + count ++; + + StringBuilder splittedSn = new StringBuilder(""); + + if(sn.contains("_")) + { + String nsn = sn.replace("_", " "); + splittedSn.append(nsn.toLowerCase()); + } + else + { + int last = 0, i = 0; + for(i = 0; i < sn.length(); i ++) + { + // if it were not a small letter, then break it. + if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z')) + { + splittedSn.append(sn.substring(last, i).toLowerCase()); + splittedSn.append(' '); + last = i; + } + } + splittedSn.append(sn.substring(last, i).toLowerCase()); + while(splittedSn.charAt(0) == ' ') { + splittedSn.deleteCharAt(0); + } + } + + System.out.println("SplitttedType: "+splittedSn); + + Document document = new Document(); + + Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(), + Field.Store.YES, + Field.Index.TOKENIZED, + Field.TermVector.WITH_POSITIONS_OFFSETS); + Field TypeShortName = new Field("TypeShortName", sn, + Field.Store.YES, Field.Index.NO); + + document.add(SplittedTypeShortName); + document.add(TypeShortName); + indexWriter_li.addDocument(document); + } + + indexWriter_li.optimize(); + indexWriter_li.close(); + + // input the time of Build index + long endTime = new Date().getTime(); + System.out.println("TypeShortName index has build ->" + count + " " + "Time:" + (endTime - startTime)); + } + + public static void main (String[] args) { + try { + Globals.localPath="D:/husen/gAnswer/"; + TypeFragment.load(); + BuildIndexForTypeShortName.buildIndex(TypeFragment.typeShortName2IdList); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/src/lcn/EntityFragmentFields.java b/src/lcn/EntityFragmentFields.java new file mode 100644 index 0000000..0b1a873 --- /dev/null +++ b/src/lcn/EntityFragmentFields.java @@ -0,0 +1,64 @@ +package lcn; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashMap; + +import qa.Globals; + +public class EntityFragmentFields { + + // entity dictionary + public static HashMap entityName2Id = null; + public static HashMap entityId2Name = null; + public static HashMap entityFragmentString = null; + + public static void load() throws IOException + { + String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16entity_id.txt"; + String fragmentFileName = Globals.localPath+"data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"; + File file = new File(filename); + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8")); + + entityName2Id = new HashMap(); + entityId2Name = new HashMap(); + + long t1, t2, t3; + + t1 = System.currentTimeMillis(); + // load entity id + System.out.println("Loading entity id ..."); + String line; + while((line = br.readLine()) != null) + { + String[] lines = line.split("\t"); + String entName = lines[0].substring(1, lines[0].length()-1); + + entityName2Id.put(entName, 
Integer.parseInt(lines[1])); + entityId2Name.put(Integer.parseInt(lines[1]), entName); + } + br.close(); + t2 = System.currentTimeMillis(); + System.out.println("Load "+entityId2Name.size()+" entity ids in "+ (t2-t1) + "ms."); + + // load entity fragment + System.out.println("Loading entity fragments ..."); + br = new BufferedReader(new InputStreamReader(new FileInputStream(fragmentFileName),"utf-8")); + entityFragmentString = new HashMap(); + while((line = br.readLine()) != null) + { + String[] lines = line.split("\t"); + if(lines.length != 2) + continue; + int eId = Integer.parseInt(lines[0]); + entityFragmentString.put(eId, lines[1]); + } + t3 = System.currentTimeMillis(); + System.out.println("Load "+entityFragmentString.size()+" entity fragments in "+ (t3-t2) + "ms."); + + br.close(); + } +} diff --git a/src/lcn/EntityNameAndScore.java b/src/lcn/EntityNameAndScore.java new file mode 100644 index 0000000..74d940f --- /dev/null +++ b/src/lcn/EntityNameAndScore.java @@ -0,0 +1,31 @@ +package lcn; + +public class EntityNameAndScore implements Comparable { + public int entityID; + public String entityName; + public double score; + + public EntityNameAndScore(int id, String n, double s) { + entityID = id; + entityName = n; + score = s; + } + + @Override + public String toString() { + return entityID + ":<" + entityName + ">\t" + score; + } + + public int compareTo(EntityNameAndScore o) { + if(this.score < o.score) { + return 1; + } + else if (this.score > o.score) { + return -1; + } + else { + return 0; + } + } + +} diff --git a/src/lcn/Main.java b/src/lcn/Main.java new file mode 100644 index 0000000..2b5850b --- /dev/null +++ b/src/lcn/Main.java @@ -0,0 +1,58 @@ +package lcn; + +//import java.io.IOException; +//import java.util.ArrayList; +import java.util.ArrayList; +import java.util.Scanner; + +import fgmt.EntityFragment; +import qa.Globals; +import qa.mapping.EntityFragmentDict; + + +public class Main { + //Test: searching Entities and Types through Lucene Index. 
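+     // A minimal programmatic sketch of the type lookup that main() below drives from
+     // stdin (assumptions: Globals.localPath is already set and the Lucene type index
+     // has been built). Hits ranked before position k must clear thres1; later hits
+     // must clear thres2 -- see SearchInTypeShortName.
+     public static void demoTypeSearch(String phrase) throws Exception {
+         SearchInTypeShortName st = new SearchInTypeShortName();
+         for (String type : st.searchType(phrase, 0.4, 0.8, 10)) {
+             System.out.println("<" + type + ">");
+         }
+     }
+ 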
+ public static void main(String[] aStrings) throws Exception{ + + //SearchInLiteralSubset se = new SearchInLiteralSubset(); + SearchInTypeShortName st = new SearchInTypeShortName(); + SearchInEntityFragments sf = new SearchInEntityFragments(); + EntityFragmentDict efd = new EntityFragmentDict(); + EntityFragmentFields eff = null; + Globals.localPath = "D:/husen/gAnswer/"; + Scanner sc = new Scanner(System.in); + System.out.print("input name: "); + + while(sc.hasNextLine()) + { + String literal = sc.nextLine(); + System.out.println(literal); + + //literal = cnlp.getBaseFormOfPattern(literal); + +//search Type + ArrayList result = st.searchType(literal, 0.4, 0.8, 10); + System.out.println("TypeShortName-->RESULT:"); + for (String s : result) { + System.out.println("<"+s + ">"); + } + +//search Ent Fragment +// int eId = EntityFragmentFields.entityName2Id.get(literal); +// EntityFragment ef = EntityFragment.getEntityFragmentByEntityId(eId); +// System.out.println(ef); + +//search Ent Name +// ArrayList result = sf.searchName(literal, 0.4, 0.8, 50); +// System.out.println("EntityName-->RESULT:"); +// for(EntityNameAndScore enas: result) +// { +// System.out.println(enas); +// } + + System.out.print("input name: "); + } + sc.close(); + } + +} diff --git a/src/lcn/SearchInEntityFragments.java b/src/lcn/SearchInEntityFragments.java new file mode 100644 index 0000000..0efd5cc --- /dev/null +++ b/src/lcn/SearchInEntityFragments.java @@ -0,0 +1,84 @@ +package lcn; + +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; + +import qa.Globals; + + +public class SearchInEntityFragments { + + /* + * Search entity in Lucene + * */ + public ArrayList searchName(String literal, double thres1, double thres2, int k) throws IOException { + Hits hits = null; + String queryString = null; + Query query = null; + + IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/entity_fragment_index"); + + ArrayList result = new ArrayList(); + + queryString = literal; + + Analyzer analyzer = new StandardAnalyzer(); + try + { + QueryParser qp = new QueryParser("EntityName", analyzer); + query = qp.parse(queryString); + } catch (ParseException e) + { + e.printStackTrace(); + } + + if (searcher != null) + { + hits = searcher.search(query); + //System.out.println("search for entity fragment hits.length=" + hits.length()); + if (hits.length() > 0) + { + //System.out.println("find " + hits.length() + " result!"); + for (int i=0; i;" + // +hits.doc(i).get("EntityFragment") + // + "; Score: " + hits.score(i) + // + "; Score2: " + hits.score(i)*(literalLength/hits.doc(i).get("EntityName").length())); + if(i= thres1) { + String en = hits.doc(i).get("EntityName"); + int id = Integer.parseInt(hits.doc(i).get("EntityId")); + result.add(new EntityNameAndScore(id, en, hits.score(i))); + } + else { + break; + } + } + else { + if (hits.score(i) >= thres2) { + String en = hits.doc(i).get("EntityName"); + int id = Integer.parseInt(hits.doc(i).get("EntityId")); + result.add(new EntityNameAndScore(id, en, hits.score(i))); + } + else { + break; + } + } + } + } + } + + //Collections.sort(result); + return result; + + } + +} diff --git a/src/lcn/SearchInTypeShortName.java 
b/src/lcn/SearchInTypeShortName.java
new file mode 100644
index 0000000..7f7304d
--- /dev/null
+++ b/src/lcn/SearchInTypeShortName.java
@@ -0,0 +1,176 @@
+ package lcn;
+ 
+ import java.util.ArrayList;
+ 
+ import org.apache.lucene.analysis.Analyzer;
+ import org.apache.lucene.analysis.standard.StandardAnalyzer;
+ import org.apache.lucene.queryParser.ParseException;
+ import org.apache.lucene.queryParser.QueryParser;
+ import org.apache.lucene.search.Hits;
+ import org.apache.lucene.search.IndexSearcher;
+ import org.apache.lucene.search.Query;
+ 
+ import fgmt.TypeFragment;
+ import qa.Globals;
+ import rdf.TypeMapping;
+ 
+ public class SearchInTypeShortName {
+     // get id and score -- husen
+     public ArrayList<TypeMapping> searchTypeScore(String s, double thres1, double thres2, int k) throws Exception
+     {
+         Hits hits = null;
+         String queryString = s;
+         Query query = null;
+ 
+         IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index");
+ 
+         ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>();
+ 
+         Analyzer analyzer = new StandardAnalyzer();
+         try {
+             QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer);
+             query = qp.parse(queryString);
+         } catch (ParseException e) {
+             e.printStackTrace();
+         }
+ 
+         if (searcher != null) {
+             hits = searcher.search(query);
+ 
+             //System.out.println("found " + hits.length() + " matched types.");
+             if (hits.length() > 0) {
+                 for (int i=0; i<hits.length(); i++) {
+                     if (i < k) {
+                         if (hits.score(i) >= thres1)
+                         {
+                             //System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
+                             String type = hits.doc(i).get("TypeShortName");
+                             System.out.println("Matched type: " + type + " : " + hits.score(i));
+ 
+                             ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(type);
+                             if(ret_in!=null)
+                             {
+                                 for(Integer tid: ret_in)
+                                 {
+                                     TypeMapping typeMapping = new TypeMapping(tid, type, hits.score(i));
+                                     tmList.add(typeMapping);
+                                 }
+                             }
+                         }
+                         else {
+                             break;
+                         }
+                     }
+                     else {
+                         if(hits.score(i) >= thres2)
+                         {
+                             String type = hits.doc(i).get("TypeShortName");
+                             System.out.println("<<<<---" + type + " : " + hits.score(i));
+ 
+                             ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(type);
+                             if(ret_in!=null)
+                             {
+                                 for(Integer tid: ret_in)
+                                 {
+                                     TypeMapping typeMapping = new TypeMapping(tid, type, hits.score(i));
+                                     tmList.add(typeMapping);
+                                 }
+                             }
+                         }
+                         else {
+                             break;
+                         }
+                     }
+                 }
+             }
+         }
+         return tmList;
+     }
+ 
+     public ArrayList<String> searchType(String s, double thres1, double thres2, int k) throws Exception
+     {
+         Hits hits = null;
+         String queryString = null;
+         Query query = null;
+ 
+         IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index");
+ 
+         ArrayList<String> typeNames = new ArrayList<String>();
+ 
+         //String[] array = s.split(" ");
+         //queryString = array[array.length-1];
+         queryString = s;
+ 
+         Analyzer analyzer = new StandardAnalyzer();
+         try {
+             QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer);
+             query = qp.parse(queryString);
+         } catch (ParseException e) {
+             e.printStackTrace();
+         }
+ 
+         if (searcher != null) {
+             hits = searcher.search(query);
+ 
+             System.out.println("found " + hits.length() + " answers!");
+             if (hits.length() > 0) {
+                 for (int i=0; i<hits.length(); i++) {
+                     if(i < k){
+                         if(hits.score(i) >= thres1){
+                             System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
+                             typeNames.add(hits.doc(i).get("TypeShortName"));
+                             //if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName"));
+                         }
+                         else {
+                             //break;
+                         }
+                     }
+                     else {
+                         if(hits.score(i) >= thres2){
+                             System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
+                             typeNames.add(hits.doc(i).get("TypeShortName"));
+                             //if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName"));
+                         }
+                         else {
+                             break;
+                         }
+                     }
+                 }
+             }
+         }
+         return typeNames;
+     }
+ 
+     private boolean satisfiedStrictly (String splittedTypeShortName, String queryString)
+     {
+         String[] tnames = splittedTypeShortName.toLowerCase().split(" ");
+         String[] qnames = queryString.toLowerCase().split(" ");
+         for (int i = 0; i < tnames.length; i ++) {
+             if (tnames[i].length() == 0) continue;
+             boolean matched = false;
+             for (int j = 0; j < qnames.length; j ++) {
+                 if (tnames[i].equals(qnames[j])) {
+                     matched = true;
+                     break;
+                 }
+             }
+             if (!matched && !Globals.stopWordsList.isStopWord(tnames[i])) {
+                 return false;
+             }
+         }
+         // the last word of the query must also appear in the type name
+         String qlast = qnames[qnames.length-1];
+         boolean flag = false;
+         for (int i = 0; i < tnames.length; i ++) {
+             if (tnames[i].length() == 0) continue;
+             if (tnames[i].equals(qlast)) {
+                 flag = true;
+                 break;
+             }
+         }
+ 
+         return flag;
+     }
+ 
+ }
diff --git a/src/log/QueryLogger.java b/src/log/QueryLogger.java
new file mode 100644
index 0000000..901ff7b
--- /dev/null
+++ b/src/log/QueryLogger.java
@@ -0,0 +1,116 @@
+ package log;
+ 
+ //import java.io.File;
+ //import java.io.FileNotFoundException;
+ //import java.io.FileOutputStream;
+ //import java.io.OutputStreamWriter;
+ //import java.io.UnsupportedEncodingException;
+ import java.util.ArrayList;
+ import java.util.Collections;
+ import java.util.HashMap;
+ import java.util.HashSet;
+ 
+ import javax.servlet.http.HttpServletRequest;
+ 
+ //import qa.Globals;
+ import qa.Matches;
+ import qa.Query;
+ import rdf.EntityMapping;
+ import rdf.SemanticRelation;
+ import rdf.Sparql;
+ import rdf.MergedWord;
+ import rdf.SemanticUnit;
+ import qa.Answer;
+ import nlp.ds.Sentence;
+ import nlp.ds.Word;
+ 
+ public class QueryLogger {
+     public Sentence s = null;
+     public String ipAdress = null;
+ 
+     public Word target = null;
+     public Sparql sparql = null;
+     public Matches match = null;
+     public ArrayList<String> answers = null;
+ 
+     public boolean MODE_debug = false;
+     public boolean MODE_log = true;
+     public boolean MODE_fragment = true;
+     public boolean isMaltParserUsed = true; // Notice: we use Malt Parser as the default parser, unlike the older version. TODO: some coref rules need to be changed to fit Malt Parser.
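+ 
+     // Hypothetical usage sketch (illustrative only, not invoked from this class):
+     // one QueryLogger is created per question, each pipeline stage records its cost
+     // into timeTable, and candidate queries accumulate in rankedSparqls, e.g.:
+     //   QueryLogger qlog = new QueryLogger(query);
+     //   long t = System.currentTimeMillis();
+     //   ... node recognition / parsing / item mapping ...
+     //   qlog.timeTable.put("parsing", (int)(System.currentTimeMillis() - t));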
+ + public HashMap timeTable = null; + public ArrayList mWordList = null; + public ArrayList semanticUnitList = null; + public HashMap semanticRelations = null; + public HashMap potentialSemanticRelations = null; + public HashMap> entityDictionary = null; + public ArrayList rankedSparqls = null; + + public String NRlog = ""; + public String SQGlog = ""; + public int gStoreCallTimes = 0; + + public QueryLogger (Query query) + { + timeTable = new HashMap(); + rankedSparqls = new ArrayList(); + mWordList = query.mWordList; + } + + public void reloadSentence(Sentence sentence) + { + this.s = sentence; + if(this.semanticUnitList != null) + this.semanticUnitList.clear(); + if(this.semanticRelations != null) + this.semanticRelations.clear(); + if(this.rankedSparqls != null) + this.rankedSparqls.clear(); + } + + // Source code: http://edu.21cn.com/java/g_189_755584-1.htm + public static String getIpAddr(HttpServletRequest request) { + String ip = request.getHeader("x-forwarded-for"); + if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { + ip = request.getHeader("Proxy-Client-IP"); + } + if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { + ip = request.getHeader("WL-Proxy-Client-IP"); + } + if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { + ip = request.getRemoteAddr(); + } + + int idx; + if((idx = ip.indexOf(',')) != -1) { + ip = ip.substring(0, idx); + } + return ip; + } + + public void reviseAnswers() + { + System.out.println("Revise Answers:"); + answers = new ArrayList(); + if (match == null || sparql == null || match.answers == null || sparql.questionFocus == null) + return; + + HashSet answerSet = new HashSet(); + String questionFocus = sparql.questionFocus; + String sparqlString = sparql.toStringForGStore(); + //System.out.println("mal="+match.answers.length); + for (int i=0;i nodesList = null; + + public SemanticGraph dependencies = null; // Method 1: CoreNLP (discarded) + public GrammaticalStructure gs = null; // Method 2: Stanford Parser + public DependencyStructure maltGraph = null; // Method 3: MaltParser + + public HashMap> wordBaseFormIndex = null; + + public DependencyTree (Sentence sentence, CoreNLP coreNLPparser) { + SemanticGraph dependencies = coreNLPparser.getBasicDependencies(sentence.plainText); + this.dependencies = dependencies; + + Stack stack = new Stack(); + IndexedWord iwRoot = dependencies.getFirstRoot(); + + HashMap map = new HashMap(); + nodesList = new ArrayList(); + + stack.push(iwRoot); + root = this.setRoot(sentence.getWordByIndex(iwRoot.index())); + map.put(iwRoot, root); + + while (!stack.empty()) + { + IndexedWord curIWNode = stack.pop(); + DependencyTreeNode curDTNode = map.get(curIWNode); + + for (IndexedWord iwChild : dependencies.getChildList(curIWNode)) { + Word w = sentence.getWordByIndex(iwChild.index()); + DependencyTreeNode newDTNode = this.insert( + curDTNode, + w, + dependencies.reln(curIWNode, iwChild).getShortName()); + map.put(iwChild, newDTNode); + stack.push(iwChild); + } + + curDTNode.sortChildrenList(); + nodesList.add(curDTNode); + } + } + + public DependencyTree (Sentence sentence, StanfordParser stanfordParser) { + this.gs = stanfordParser.getGrammaticalStructure(sentence.plainText); + + HashMap map = new HashMap(); + nodesList = new ArrayList(); + + List tdl = gs.typedDependencies(false); + // 1. generate all nodes. 
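+         // (Illustrative: each TypedDependency td is one labeled edge of the parse,
+         // e.g. nsubj(directed-4, movies-3); every governor and dependent becomes a
+         // DependencyTreeNode, except the artificial "root" governor.)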
+ for (TypedDependency td : tdl) { + // gov + if (!map.containsKey(td.gov().index()) && !td.reln().getShortName().equals("root")) { + Word w = sentence.getWordByIndex(td.gov().index()); + DependencyTreeNode newNode = new DependencyTreeNode(w); + map.put(td.gov().index(), newNode); + nodesList.add(newNode); + } + // dep + if (!map.containsKey(td.dep().index())) { + Word w = sentence.getWordByIndex(td.dep().index()); + DependencyTreeNode newNode = new DependencyTreeNode(w); + map.put(td.dep().index(), newNode); + nodesList.add(newNode); + } + } + // 2. add edges. + for (TypedDependency td : tdl) { + if (td.reln().getShortName().equals("root")) { + this.root = map.get(td.dep().index()); + this.root.levelInTree = 0; + this.root.dep_father2child = "root"; + } + else { + DependencyTreeNode gov = map.get(td.gov().index()); + DependencyTreeNode dep = map.get(td.dep().index()); + + dep.father = gov; + gov.childrenList.add(dep); + dep.dep_father2child = td.reln().getShortName(); + } + } + + // add levelInTree, sort childrenList & nodesList + Stack stack = new Stack(); + stack.push(this.root); + while (!stack.empty()) { + DependencyTreeNode dtn = stack.pop(); + if (dtn.father != null) { + dtn.levelInTree = dtn.father.levelInTree + 1; + dtn.sortChildrenList(); + } + for (DependencyTreeNode chd : dtn.childrenList) { + stack.push(chd); + } + } + Collections.sort(nodesList, new DependencyTreeNodeComparator()); + for (DependencyTreeNode dtn : nodesList) { + dtn.linkNN(this); + } + } + + public DependencyTree (Sentence sentence, MaltParser maltParser)throws MaltChainedException { + try { + // the tokens are parsed in the following line + DependencyStructure graph = maltParser.getDependencyStructure(sentence); + this.maltGraph = graph; + //System.out.println(graph); + + HashMap map = new HashMap(); + ArrayList list = new ArrayList(); + Stack stack = new Stack(); + DependencyNode nroot = graph.getDependencyRoot(); + stack.add(nroot); + // 1. generate all nodes. 
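+             // (The graph is walked from its root via dependent/sibling links; a node
+             // is materialized only if it has a head edge, and "punct" edges are
+             // skipped further below.)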
+ while (!stack.isEmpty()) { + DependencyNode n = stack.pop(); + DependencyNode sib = n.getRightmostDependent(); + int key = n.getIndex(); + //System.out.println("[current node][key="+key+"] "+n+" <"+n.getHeadEdge()+">"); + boolean flag = true; + while (sib != null) { + flag = false; + stack.push(sib); + sib = sib.getLeftSibling(); + } + if (flag) { + sib = n.getLeftmostDependent(); + while (sib != null) { + stack.push(sib); + sib = sib.getRightSibling(); + } + } + if (n.hasHead() && !map.containsKey(key)) { + //String snode = n.toString(); + String sedge = n.getHeadEdge().toString(); + //System.out.println("[" + snode + "] <" + sedge + ">"); + + /*int position = 0; + String wordOriginal = null; + String wordBase; + String postag = null;*/ + String dep = null; + int idx1, idx2; + + /*// position + idx1 = snode.indexOf("ID:")+3; + idx2 = snode.indexOf(' ', idx1); + position = Integer.parseInt(snode.substring(idx1, idx2)); + + // word + idx1 = snode.indexOf("FORM:", idx2)+5; + idx2 = snode.indexOf(' ', idx1); + wordOriginal = snode.substring(idx1, idx2); + wordBase = Globals.coreNLP.getBaseFormOfPattern(wordOriginal.toLowerCase()); + + // postag + idx1 = snode.indexOf("POSTAG:", idx2)+7; + idx2 = snode.indexOf(' ', idx1); + postag = snode.substring(idx1, idx2);*/ + + // dep + idx1 = sedge.lastIndexOf(':')+1; + idx2 = sedge.lastIndexOf(' '); + dep = sedge.substring(idx1, idx2); + if (dep.equals("null")) { + dep = null; + } + else if (dep.equals("punct")) {// No consider about punctuation + continue; + } + + DependencyTreeNode newNode = new DependencyTreeNode(sentence.getWordByIndex(key)); + newNode.dep_father2child = dep; + map.put(key, newNode); + list.add(newNode); + } + } + + + // 2. add edges + for (Integer k : map.keySet()) { + DependencyNode n = graph.getDependencyNode(k); + DependencyTreeNode dtn = map.get(k); + if (dtn.dep_father2child == null) { + this.setRoot(dtn); + this.root.levelInTree = 0; + this.root.dep_father2child = "root"; + } + else { + DependencyTreeNode father = map.get(n.getHead().getIndex()); + DependencyTreeNode child = map.get(n.getIndex()); + child.father = father; + father.childrenList.add(child); + } + } + + // Fix the tree for some cases. 
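+             // (Ad-hoc repair, hedged guess at the intent: in questions shaped like the
+             // warm-up sentence of MaltParser.firstParse(), "starring" (base form "star")
+             // can end up attached under "be"; re-attach it under the "film"/"movie" node.)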
+ if(list.size() > 11) + { + DependencyTreeNode dt1 = list.get(11), dt2 = list.get(5); + if(dt1!=null && dt2!=null && dt1.word.baseForm.equals("star") && dt1.father.word.baseForm.equals("be")) + { + if (dt2.word.baseForm.equals("film") || dt2.word.baseForm.equals("movie")) + { + dt1.father.childrenList.remove(dt1); + dt1.father = dt2; + dt2.childrenList.add(dt1); + } + } + } + + // add levelInTree, sort childrenList & nodesList + for (DependencyTreeNode dtn : list) { + if (dtn.father != null) { + dtn.levelInTree = dtn.father.levelInTree + 1; + dtn.sortChildrenList(); + } + } + + nodesList = list; + Collections.sort(nodesList, new DependencyTreeNodeComparator()); + for (DependencyTreeNode dtn : nodesList) { + dtn.linkNN(this); + } + } catch (MaltChainedException e) { + //e.printStackTrace(); + //System.err.println("MaltParser exception: " + e.getMessage()); + throw e; + } + } + + public DependencyTreeNode setRoot(Word w) { + root = new DependencyTreeNode(w, "root", null); + return root; + } + + public DependencyTreeNode setRoot(DependencyTreeNode root) { + this.root = root; + return this.root; + } + + public void buildWordBaseFormIndex () { + wordBaseFormIndex = new HashMap>(); + for (DependencyTreeNode dtn: nodesList) { + String w = dtn.word.baseForm; + if (!wordBaseFormIndex.keySet().contains(w)) + wordBaseFormIndex.put(w, new ArrayList()); + wordBaseFormIndex.get(w).add(dtn); + } + } + + public DependencyTreeNode insert(DependencyTreeNode father, Word w, String dep_father2child) { + if (father == null || w == null) + return null; + + DependencyTreeNode newNode = new DependencyTreeNode(w, dep_father2child, father); + father.childrenList.add(newNode); + return newNode; + } + + public DependencyTreeNode getRoot() { + return root; + } + + public ArrayList getNodesList(){ + return nodesList; + } + + public ArrayList getShortestNodePathBetween(DependencyTreeNode n1, DependencyTreeNode n2) + { + if(n1 == n2) { + return new ArrayList(); + } + + ArrayList path1 = getPath2Root(n1); + ArrayList path2 = getPath2Root(n2); + + int idx1 = path1.size()-1; + int idx2 = path2.size()-1; + DependencyTreeNode curNode1 = path1.get(idx1); + DependencyTreeNode curNode2 = path2.get(idx2); + + while (curNode1 == curNode2) { + idx1 --; + idx2 --; + if(idx1 < 0 || idx2 < 0) break; + curNode1 = path1.get(idx1); + curNode2 = path2.get(idx2); + } + + ArrayList shortestPath = new ArrayList(); + for (int i = 0; i <= idx1; i ++) { + shortestPath.add(path1.get(i)); + } + for (int i = idx2+1; i >= 0; i --) { + shortestPath.add(path2.get(i)); + } + + System.out.println("Shortest Path between <" + n1 + "> and <" + n2 + ">:"); + System.out.print("\t-"); + for (DependencyTreeNode dtn : shortestPath) { + System.out.print("<" + dtn + ">-"); + } + System.out.println(); + + return shortestPath; + } + + public ArrayList getPath2Root(DependencyTreeNode n1) { + ArrayList path = new ArrayList(); + DependencyTreeNode curNode = n1; + path.add(curNode); + while (curNode.father != null) { + curNode = curNode.father; + path.add(curNode); + } + return path; + } + + public ArrayList getTreeNodesListContainsWords(String words) { + ArrayList ret = new ArrayList(); + for (DependencyTreeNode dtn : nodesList) { + if (dtn.word.originalForm.equalsIgnoreCase(words) + || dtn.word.baseForm.equalsIgnoreCase(words) + || words.contains(dtn.word.originalForm) + || words.contains(dtn.word.baseForm)) + ret.add(dtn); + } + return ret; + } + + public DependencyTreeNode getNodeByIndex (int posi) { + for (DependencyTreeNode dt : nodesList) { + if 
(dt.word.position == posi) { + return dt; + } + } + return null; + } + + public DependencyTreeNode getFirstPositionNodeInList(ArrayList list) { + int firstPosi = Integer.MAX_VALUE; + DependencyTreeNode firstNode = null; + for (DependencyTreeNode dtn : list) { + if (dtn.word.position < firstPosi) { + firstPosi = dtn.word.position; + firstNode = dtn; + } + } + return firstNode; + } + + @Override + public String toString() { + String ret = ""; + + Stack stack = new Stack(); + stack.push(root); + while(!stack.empty()) { + DependencyTreeNode curNode = stack.pop(); + for (int i = 0; i <= curNode.levelInTree; i ++) + ret += " "; + ret += "-> "; + ret += curNode.word.baseForm; + ret += "-"; + ret += curNode.word.posTag; + ret += " ("; + ret += curNode.dep_father2child; + ret += ")"; + ret += "[" + curNode.word.position + "]\n"; + + for (DependencyTreeNode child : curNode.childrenList) { + stack.push(child); + } + } + return ret; + } +} diff --git a/src/nlp/ds/DependencyTreeNode.java b/src/nlp/ds/DependencyTreeNode.java new file mode 100644 index 0000000..8b61896 --- /dev/null +++ b/src/nlp/ds/DependencyTreeNode.java @@ -0,0 +1,150 @@ +package nlp.ds; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Stack; + +public class DependencyTreeNode { + public Word word = null; + public String dep_father2child = null; + + public DependencyTreeNode father = null; + public ArrayList childrenList = null; + + public int levelInTree = -1; + + /** + * The constructor for knowing its father + * + * @param w + * @param dep_father2child + * @param father + */ + public DependencyTreeNode(Word w, String dep_father2child, DependencyTreeNode father) + { + word = w; + this.dep_father2child = dep_father2child; + this.father = father; + this.childrenList = new ArrayList(); + + if(father==null) levelInTree = 0; + else levelInTree = father.levelInTree+1; + } + + /** + * The constructor for not knowing the father + * + * @param word + */ + public DependencyTreeNode(Word w) + { + this.word = w; + this.childrenList = new ArrayList(); + } + + public void sortChildrenList () { + childrenList.trimToSize(); + Collections.sort(childrenList, new DependencyTreeNodeComparator()); + } + + @Override + public String toString(){ + return word.originalForm + "-" + word.posTag + "(" + dep_father2child + ")[" + word.position + "]"; + } + + public static void sortArrayList(ArrayList list) { + Collections.sort(list, new DependencyTreeNodeComparator()); + } + + public DependencyTreeNode containDependencyWithChildren (String dep) { + for (DependencyTreeNode son : childrenList) { + if (son.dep_father2child.equals(dep)) return son; + } + return null; + } + + /** + * equal_or_startWith = true: equal + * equal_or_startWith = false: startWith + * + * @param posChild + * @param equal_or_startWith + * @return + */ + public DependencyTreeNode containPosInChildren (String posChild, boolean equal_or_startWith) { + for (DependencyTreeNode son : childrenList) { + if (equal_or_startWith) { + if (son.word.posTag.equals(posChild)) return son; + } + else { + if (son.word.posTag.startsWith(posChild)) return son; + } + } + return null; + } + + public DependencyTreeNode containWordBaseFormInChildren (String wordBaseFormChild) { + for (DependencyTreeNode son : childrenList) { + if (son.word.baseForm.equals(wordBaseFormChild)) return son; + } + return null; + } + + public DependencyTreeNode getNNTopTreeNode (DependencyTree T) { + if(this.father != null && (this.dep_father2child.equals("nn") || 
(this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep")))) { + return this.father.getNNTopTreeNode(T); + } + else return this; + } + + public Word linkNN(DependencyTree T) { + // (Now useless) backtracking the NN connections. + ArrayList nn = new ArrayList(); + + nn.add(this); + + if(this.father != null && (this.dep_father2child.equals("nn") + || (this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep") && this.father.word.posTag.startsWith("NN")))) { + nn.add(this.father); + for(DependencyTreeNode son : this.father.childrenList) { + if (son != this && son.dep_father2child.equals("nn")) { + nn.add(son); + } + } + } + + Stack stack = new Stack(); + stack.push(this); + while (!stack.empty()) { + DependencyTreeNode curNode = stack.pop(); + for(DependencyTreeNode son : curNode.childrenList) { + if (son.dep_father2child.equals("nn") + || (son.word.posTag.startsWith("NN") && son.dep_father2child.equals("dep") && son.father.word.posTag.startsWith("NN"))) { + nn.add(son); + stack.push(son); + } + } + } + + DependencyTreeNode.sortArrayList(nn); + + int size = nn.size() - 1; + for (int i = 0; i < size; i ++) { + nn.get(i).word.nnNext = nn.get(i+1).word; + nn.get(i+1).word.nnPrev = nn.get(i).word; + } + + return this.word.getNnHead(); + } + +}; + + +class DependencyTreeNodeComparator implements Comparator { + + public int compare(DependencyTreeNode n1, DependencyTreeNode n2) { + return n1.word.position - n2.word.position; + } + +} diff --git a/src/nlp/ds/Sentence.java b/src/nlp/ds/Sentence.java new file mode 100644 index 0000000..8f95b27 --- /dev/null +++ b/src/nlp/ds/Sentence.java @@ -0,0 +1,88 @@ +package nlp.ds; + +import java.util.ArrayList; +import java.util.HashMap; + +import qa.Globals; +import qa.Query; +import rdf.MergedWord; + +public class Sentence { + public String plainText = null; + public Word[] words = null; + public HashMap map = null; + + public DependencyTree dependencyTreeStanford = null; + public DependencyTree dependencyTreeMalt = null; + + public enum SentenceType {SpecialQuestion,GeneralQuestion,ImperativeSentence} + public SentenceType sentenceType = SentenceType.SpecialQuestion; + + public Sentence (String s) + { + plainText = s; + words = Globals.coreNLP.getTaggedWords(plainText); + map = new HashMap(); + for (Word w : words) + map.put(w.key, w); + } + + public Sentence (Query query, String s) + { + plainText = s; + words = Globals.coreNLP.getTaggedWords(plainText); + // inherit NodeRecognition's information + for(Word word: words) + { + for(MergedWord mWord: query.mWordList) + { + if(word.originalForm.equals(mWord.name)) + { + word.mayLiteral = mWord.mayLiteral; + word.mayEnt = mWord.mayEnt; + word.mayType = mWord.mayType; + word.mayCategory = mWord.mayCategory; + word.tmList = mWord.tmList; + word.emList = mWord.emList; + word.category = mWord.category; + } + } + } + map = new HashMap(); + for (Word w : words) + map.put(w.key, w); + } + public ArrayList getWordsByString (String w) { + ArrayList ret = new ArrayList(); + for (Word wo: words) { + if (wo.originalForm.equals(w)) ret.add(wo); + } + return ret; + } + + public Word getWordByIndex (int idx) { + return words[idx-1]; + } + + public Word getWordByKey (String k) { + return map.get(k); + } + + public boolean hasModifier(Word w) + { + for(Word word: words) + if(word!=w && word.modifiedWord==w) + return true; + return false; + } + + public void printNERResult () { + for (Word word : words) { + System.out.print(word + " "); + System.out.println("ner=" + word.ner); + } + } +} + + + diff 
--git a/src/nlp/ds/Word.java b/src/nlp/ds/Word.java new file mode 100644 index 0000000..0d38be3 --- /dev/null +++ b/src/nlp/ds/Word.java @@ -0,0 +1,126 @@ +package nlp.ds; + +import java.util.ArrayList; + +import rdf.EntityMapping; +import rdf.Triple; +import rdf.TypeMapping; + +public class Word implements Comparable +{ + public boolean mayCategory = false; + public boolean mayLiteral = false; + public boolean mayEnt = false; + public boolean mayType = false; + public boolean mayExtendVariable = false; + public String category = null; + public ArrayList emList = null; + public ArrayList tmList = null; + public Triple embbededTriple = null; + + public String baseForm = null; + public String originalForm = null; + public String posTag = null; + public int position = -1; // Notice the first word's position = 1 + public String key = null; + + public boolean isCovered = false; + public boolean isIgnored = false; + + //Notice: These variables are not used because we merge a phrase to a word if it is a node now. + public String ner = null; // record NER result + public Word nnNext = null; + public Word nnPrev = null; + public Word crr = null; // coreference resolution result + + public Word represent = null; // This word is represented by others, eg, "which book is ..." "which" + public boolean omitNode = false; // This word can not be node + public Word modifiedWord = null; // This word modify which word (it modify itself if it is not a modified word) + + public Word (String base, String original, String pos, int posi) { + baseForm = base; + originalForm = original; + posTag = pos; + position = posi; + key = new String(originalForm+"["+position+"]"); + } + + @Override + public String toString() { + return key; + } + + public int compareTo(Word another) { + return this.position-another.position; + } + + @Override + public int hashCode() { + return key.hashCode(); + } + + @Override + public boolean equals(Object o) { + return (o instanceof Word) + && originalForm.equals(((Word)o).originalForm) + && position == ((Word)o).position; + } + + // We now discard all NN information and return the word itself. 
| husen 2016
+     public Word getNnHead() {
+         Word w = this;
+         return w;
+ 
+ //        if(w.mayEnt || w.mayType)
+ //            return w;
+ //
+ //        while (w.nnPrev != null) {
+ //            w = w.nnPrev;
+ //        }
+ //        return w;
+     }
+ 
+     public String getFullEntityName() {
+         Word w = this.getNnHead();
+         return w.originalForm;
+ 
+ //        if(w.mayEnt || w.mayType)
+ //            return w.originalForm;
+ //
+ //        StringBuilder sb = new StringBuilder("");
+ //        while (w != null) {
+ //            sb.append(w.originalForm);
+ //            sb.append(' ');
+ //            w = w.nnNext;
+ //        }
+ //        sb.deleteCharAt(sb.length()-1);
+ //        return sb.toString();
+     }
+ 
+     public String getBaseFormEntityName() {
+         Word w = this.getNnHead();
+         if(w.mayEnt || w.mayType)
+             return w.baseForm;
+ 
+         StringBuilder sb = new StringBuilder("");
+         while (w != null) {
+             sb.append(w.baseForm);
+             sb.append(' ');
+             w = w.nnNext;
+         }
+         sb.deleteCharAt(sb.length()-1);
+         return sb.toString();
+     }
+ 
+     public String isNER () {
+         return this.getNnHead().ner;
+     }
+ 
+     public void setIsCovered () {
+         Word w = this.getNnHead();
+         while (w != null) {
+             w.isCovered = true;
+             w = w.nnNext;
+         }
+     }
+ }
diff --git a/src/nlp/tool/CoreNLP.java b/src/nlp/tool/CoreNLP.java
new file mode 100644
index 0000000..ae8b355
--- /dev/null
+++ b/src/nlp/tool/CoreNLP.java
@@ -0,0 +1,202 @@
+ package nlp.tool;
+ 
+ import java.util.List;
+ import java.util.Properties;
+ 
+ import nlp.ds.Word;
+ import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+ import edu.stanford.nlp.ling.CoreLabel;
+ import edu.stanford.nlp.pipeline.Annotation;
+ import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+ import edu.stanford.nlp.trees.Tree;
+ import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
+ import edu.stanford.nlp.trees.semgraph.SemanticGraph;
+ import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
+ import edu.stanford.nlp.util.CoreMap;
+ 
+ public class CoreNLP {
+ 
+     // CoreNLP can also recognize TIME and NUMBER (see SUTime)
+     private StanfordCoreNLP pipeline_lemma;
+ 
+     public CoreNLP () {
+         // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
+         /*Properties props_all = new Properties();
+         props_all.put("annotators", "tokenize, ssplit, pos, lemma, parse"); // full list: "tokenize, ssplit, pos, lemma, ner, parse, dcoref"
+         pipeline_all = new StanfordCoreNLP(props_all);*/
+ 
+         Properties props_lemma = new Properties();
+         props_lemma.put("annotators", "tokenize, ssplit, pos, lemma");
+         pipeline_lemma = new StanfordCoreNLP(props_lemma);
+ 
+     }
+ 
+     // For more efficient usage, refer to "http://www.jarvana.com/jarvana/view/edu/stanford/nlp/stanford-corenlp/1.2.0/stanford-corenlp-1.2.0-javadoc.jar!/edu/stanford/nlp/process/Morphology.html"
+     public String getBaseFormOfPattern (String text) {
+         String ret = new String("");
+ 
+         // create an empty Annotation just with the given text
+         Annotation document = new Annotation(text);
+         // run all Annotators on this text
+         pipeline_lemma.annotate(document);
+ 
+         // these are all the sentences in this document
+         // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
+         List<CoreMap> sentences = document.get(SentencesAnnotation.class);
+ 
+         int count = 0;
+         for(CoreMap sentence: sentences) {
+             // traversing the words in
the current sentence + // a CoreLabel is a CoreMap with additional token-specific methods + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the base form (lemma) of the token + String lemma = token.getString(LemmaAnnotation.class); + ret += lemma; + ret += " "; + } + count ++; + if (count % 100 == 0) { + System.out.println(count); + } + } + + return ret.substring(0, ret.length()-1); + } + + public SemanticGraph getBasicDependencies (String s) { + // create an empty Annotation just with the given text + Annotation document = new Annotation(s); + + // run all Annotators on this text + pipeline_lemma.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + // this is the Stanford dependency graph of the current sentence + SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class); + return dependencies; + } + + return null; + } + + public Tree getParseTree (String text) { + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline_lemma.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + // this is the parse tree of the current sentence + return sentence.get(TreeAnnotation.class); + } + + return null; + } + + /** + * How to use: + * for (CoreLabel token : sentence.get(TokensAnnotation.class)) { + * // this is the text of the token + * String word = token.get(TextAnnotation.class); + * // this is the POS tag of the token + * String pos = token.get(PartOfSpeechAnnotation.class); + * } + * @param s + * @return + */ + public CoreMap getPOS (String s) { + // create an empty Annotation just with the given text + Annotation document = new Annotation(s); + + // run all Annotators on this text + pipeline_lemma.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + // this is the sentence with POS Tags + return sentence; + } + + return null; + } + + public Word[] getTaggedWords (String sentence) { + CoreMap taggedSentence = getPOS(sentence); + Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()]; + int count = 0; + for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + //System.out.println(word+"["+pos+"]"); + ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1); + count ++; + } + return ret; + } + + /*public void demo () { + // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); + StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + + // read some text in the text variable + String text = ... 
// Add your text here! + + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + // traversing the words in the current sentence + // a CoreLabel is a CoreMap with additional token-specific methods + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + // this is the NER label of the token + String ne = token.get(NamedEntityTagAnnotation.class); + } + + // this is the parse tree of the current sentence + Tree tree = sentence.get(TreeAnnotation.class); + + // this is the Stanford dependency graph of the current sentence + SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); + } + + // This is the coreference link graph + // Each chain stores a set of mentions that link to each other, + // along with a method for getting the most representative mention + // Both sentence and token offsets start at 1! + Map graph = + document.get(CorefChainAnnotation.class); + }*/ +} diff --git a/src/nlp/tool/Main.java b/src/nlp/tool/Main.java new file mode 100644 index 0000000..1a680a3 --- /dev/null +++ b/src/nlp/tool/Main.java @@ -0,0 +1,42 @@ +package nlp.tool; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import nlp.ds.DependencyTree; +import nlp.ds.Sentence; +import qa.Globals; + +public class Main { + public static void main (String[] args) { + Globals.init(); + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + try { + while (true) { + System.out.println("Test maltparser."); + System.out.print("Please input the NL question: "); + String question = br.readLine(); + if (question.length() <= 3) + break; + try { + long t1 = System.currentTimeMillis(); + Sentence s = new Sentence(question); + DependencyTree dt = new DependencyTree(s, Globals.stanfordParser); + System.out.println("====StanfordDependencies===="); + System.out.println(dt); + DependencyTree dt2 = new DependencyTree(s, Globals.maltParser); + System.out.println("====MaltDependencies===="); + System.out.println(dt2); + long t2 = System.currentTimeMillis(); + System.out.println("time=" + (t2-t1) + "ms"); + } catch (Exception e) { + e.printStackTrace(); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +} diff --git a/src/nlp/tool/MaltParser.java b/src/nlp/tool/MaltParser.java new file mode 100644 index 0000000..56e16bc --- /dev/null +++ b/src/nlp/tool/MaltParser.java @@ -0,0 +1,70 @@ +package nlp.tool; + + +import nlp.ds.Sentence; +import nlp.ds.Word; + +import org.maltparser.MaltParserService; +import org.maltparser.core.exception.MaltChainedException; +import org.maltparser.core.syntaxgraph.DependencyStructure; + +import qa.Globals; + +public class MaltParser { + private MaltParserService service = null; + public MaltParser() { + try + { + System.out.print("Loading MaltParser ..."); + service = new MaltParserService(); + // Inititalize the parser model 'model0' and sets the working directory to '.' 
and sets the logging file to 'parser.log' + //service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w . -lfi parser.log"); + service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w "+Globals.localPath+"lib/maltparser-1.9.1 -lfi parser.log"); + firstParse(); + System.out.println("ok!"); + } catch (MaltChainedException e) { + e.printStackTrace(); + System.err.println("MaltParser exception: " + e.getMessage()); + } + } + + private void firstParse() { + String[] tokens = new String[12]; + tokens[0] = "1\tIn\t_\tIN\tIN\t_"; + tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; + tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; + tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; + tokens[4] = "5\tby\t_\tIN\tIN\t_"; + tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; + tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; + tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; + tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; + tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; + tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; + tokens[11] = "12\t?\t_\t.\t.\t_"; + try { + service.parse(tokens); + } catch (MaltChainedException e) { + e.printStackTrace(); + } + } + + public DependencyStructure getDependencyStructure (Sentence sentence) { + try { + return service.parse(getTaggedTokens(sentence)); + } catch (MaltChainedException e) { + e.printStackTrace(); + } + return null; + } + + private String[] getTaggedTokens (Sentence sentence) { + String[] ret = new String[sentence.words.length]; + int count = 0; + for (Word w : sentence.words) { + ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_"); + count ++; + } + return ret; + } +} diff --git a/src/nlp/tool/MaltParserCon.java b/src/nlp/tool/MaltParserCon.java new file mode 100644 index 0000000..02214d4 --- /dev/null +++ b/src/nlp/tool/MaltParserCon.java @@ -0,0 +1,73 @@ +package nlp.tool; + +import java.io.File; +import java.net.URL; + +import nlp.ds.Sentence; +import nlp.ds.Word; + +import org.maltparser.concurrent.ConcurrentMaltParserModel; +import org.maltparser.concurrent.ConcurrentMaltParserService; +import org.maltparser.concurrent.graph.ConcurrentDependencyGraph; +import org.maltparser.core.exception.MaltChainedException; +//import org.maltparser.core.syntaxgraph.DependencyStructure; + + +public class MaltParserCon { + private ConcurrentMaltParserModel model = null; + public ConcurrentDependencyGraph outputGraph = null; + + public MaltParserCon(){ + try{ + System.out.println("Loading Maltparser...\n"); + URL ModelURL = new File("output/engmalt.linear-1.7.mco").toURI().toURL(); + model = ConcurrentMaltParserService.initializeParserModel(ModelURL); + firstTest(); + System.out.println("ok!\n"); + }catch(Exception e){ + e.printStackTrace(); + System.err.println("MaltParser exception: " + e.getMessage()); + } + } + + private void firstTest(){ + String[] tokens = new String[12]; + tokens[0] = "1\tIn\t_\tIN\tIN\t_"; + tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; + tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; + tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; + tokens[4] = "5\tby\t_\tIN\tIN\t_"; + tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; + tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; + tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; + tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; + tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; + tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; + tokens[11] = "12\t?\t_\t.\t.\t_"; + try { + outputGraph = model.parse(tokens); + } catch (Exception e) { + e.printStackTrace(); + } + System.out.println(outputGraph); + } + + public ConcurrentDependencyGraph getDependencyStructure (Sentence 
sentence) { + try { + return model.parse(getTaggedTokens(sentence)); + } catch (MaltChainedException e) { + e.printStackTrace(); + } + return null; + } + + private String[] getTaggedTokens (Sentence sentence) { + String[] ret = new String[sentence.words.length]; + int count = 0; + for (Word w : sentence.words) { + ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_"); + count ++; + } + return ret; + } +} diff --git a/src/nlp/tool/NERecognizer.java b/src/nlp/tool/NERecognizer.java new file mode 100644 index 0000000..11928a3 --- /dev/null +++ b/src/nlp/tool/NERecognizer.java @@ -0,0 +1,53 @@ +package nlp.tool; + +import java.util.List; + +import qa.Globals; + +import nlp.ds.Sentence; +import nlp.ds.Word; + +import edu.stanford.nlp.ie.AbstractSequenceClassifier; +import edu.stanford.nlp.ie.crf.CRFClassifier; +import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation; +import edu.stanford.nlp.ling.CoreLabel; + +public class NERecognizer { + + static String serializedClassifier; + static AbstractSequenceClassifier classifier; + //public static String localPath="E:\\Hanshuo\\gAnswer\\"; + + public NERecognizer() { + serializedClassifier = Globals.localPath+"lib/stanford-ner-2012-11-11/classifiers/english.all.3class.distsim.crf.ser.gz"; + classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier); + } + + /*public NERecognizer(String basePath, boolean flag) { + serializedClassifier = "WEB-INF\\lib\\stanford-ner-2012-11-11\\stanford-ner-2012-11-11\\classifiers\\english.all.3class.distsim.crf.ser.gz"; + }*/ + + public void recognize(Sentence sentence) { + List lcl = classifier.classify(sentence.plainText).get(0); + for (CoreLabel cl : lcl) { + int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1; + Word w = sentence.getWordByIndex(position); + String ner = cl.get(AnswerAnnotation.class); + if (ner.equals("O")) w.ner = null; + else w.ner = ner; + } + } + + public static void main(String[] args) { + System.out.println("Test NER"); + Globals.init(); + + Sentence s = new Sentence("I go to school at Stanford University, which is located in California.");//"Which states of Germany are governed by the Social Democratic Party?" 
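+         // Illustrative expectation, not a recorded run: the 3-class CRF model tags
+         // PERSON / LOCATION / ORGANIZATION, so "Stanford" and "University" should be
+         // labeled ORGANIZATION, "California" LOCATION, and the remaining tokens null.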
+ Globals.nerRecognizer.recognize(s); + for (Word word : s.words) { + System.out.print(word + " "); + System.out.println("ner=" + word.ner); + } + } +} diff --git a/src/nlp/tool/StanfordParser.java b/src/nlp/tool/StanfordParser.java new file mode 100644 index 0000000..12e305c --- /dev/null +++ b/src/nlp/tool/StanfordParser.java @@ -0,0 +1,51 @@ +package nlp.tool; + +import java.io.StringReader; +import java.util.List; + +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.objectbank.TokenizerFactory; +import edu.stanford.nlp.parser.lexparser.LexicalizedParser; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.trees.GrammaticalStructure; +import edu.stanford.nlp.trees.GrammaticalStructureFactory; +import edu.stanford.nlp.trees.PennTreebankLanguagePack; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreebankLanguagePack; + +public class StanfordParser { + private LexicalizedParser lp; + private TokenizerFactory tokenizerFactory; + private TreebankLanguagePack tlp; + private GrammaticalStructureFactory gsf; + + public StanfordParser() { + lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); + tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); + tlp = new PennTreebankLanguagePack(); + gsf = tlp.grammaticalStructureFactory(); + } + + public GrammaticalStructure getGrammaticalStructure (String sentence) { + List rawWords2 = + tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize(); + // Converts a Sentence/List/String into a Tree. + // In all circumstances, the input will be treated as a single sentence to be parsed. + Tree parse = lp.apply(rawWords2); + + return gsf.newGrammaticalStructure(parse); + /*List tdl = gs.typedDependencies(false); + for (TypedDependency td : tdl) { + System.out.println(td.reln().getShortName()+"("+td.gov()+","+td.dep()+")"); + System.out.println("gov="+td.gov() + +"\tgov.index=" + +td.gov().index() + +"\tgov.value=" + +td.gov().value() + +"\tgov.pos=" + +((TreeGraphNode)td.gov().parent()).value()); + }*/ + //System.out.println(tdl); + } +} diff --git a/src/nlp/tool/StopWordsList.java b/src/nlp/tool/StopWordsList.java new file mode 100644 index 0000000..7569879 --- /dev/null +++ b/src/nlp/tool/StopWordsList.java @@ -0,0 +1,614 @@ +package nlp.tool; + +import java.util.HashSet; +import java.util.Arrays; + +public class StopWordsList { + public static HashSet sw_list = new HashSet(); + + public StopWordsList() { + initiate(); + } + + public void initiate() { + sw_list.addAll(Arrays.asList(sw_array)); + + // some commas + /*sw_list.add("."); + sw_list.add(","); + sw_list.add(";"); + sw_list.add("?"); + sw_list.add("!"); + sw_list.add(":"); + sw_list.add("("); + sw_list.add(")"); + sw_list.add("-");*/ + } + + /** + * To judge whether a word is a stop-word + * @param word_lowercase: the word, should be in lower-case + * @return if the word is a stop-word, then true; otherwise, false. 
+ */ + public boolean isStopWord(String word_lowercase) { + if (sw_list.contains(word_lowercase)) return true; + else return false; + } + + private static final String sw_array[] = new String[]{ + "a", + "able", + "about", + "across", + "after", + "all", + "almost", + "also", + "am", + "among", + "an", + "and", + "any", + "are", + "as", + "at", + //"be", + "because", + "been", + "but", + "by", + "can", + "cannot", + "could", + "dear", + "did", + "do", + "does", + "either", + "else", + "ever", + "every", + "for", + "from", + "get", + "got", + "had", + "has", + "have", + "he", + "her", + "hers", + "him", + "his", + "how", + "however", + "i", + "if", + "in", + "into", + "is", + "it", + "its", + "just", + "least", + "let", + "like", + "likely", + "may", + "me", + "might", + "most", + "must", + "my", + "neither", + "no", + "nor", + "not", + "of", + "off", + "often", + "on", + "only", + "or", + "other", + "our", + "own", + "rather", + "said", + "say", + "says", + "she", + "should", + "since", + "so", + "some", + "than", + "that", + "the", + "their", + "them", + "then", + "there", + "these", + "they", + "this", + "tis", + "to", + "too", + "twas", + "us", + "wants", + "was", + "we", + "were", + "what", + "when", + "where", + "which", + "while", + "who", + "whom", + "why", + "will", + "with", + "would", + "yet", + "you", + "your" + }; +}; + +/*// stop word 308 + +// http://norm.al/2009/04/14/list-of-english-stop-words/ + + private static final String sw_array[] = new String[]{ + "a", + "about", + "above", + "across", + "after", + "afterwards", + "again", + "against", + "all", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "amoungst", + "amount", + "an", + "and", + "another", + "any", + "anyhow", + "anyone", + "anything", + "anyway", + "anywhere", + "are", + "around", + "as", + "at", + "back", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "below", + "beside", + "besides", + "between", + "beyond", + "bill", + "both", + "bottom", + "but", + "by", + "call", + "can", + "cannot", + "cant", + "co", + "computer", + "con", + "could", + "couldnt", + "cry", + "de", + "describe", + "detail", + "do", + "did", + "done", + "down", + "due", + "during", + "each", + "eg", + "eight", + "either", + "eleven", + "else", + "elsewhere", + "empty", + "enough", + "etc", + "even", + "ever", + "every", + "everyone", + "everything", + "everywhere", + "except", + "few", + "fifteen", + "fify", + "fill", + "find", + "fire", + "first", + "five", + "for", + "former", + "formerly", + "forty", + "found", + "four", + "from", + "front", + "full", + "further", + "get", + "give", + "go", + "had", + "has", + "hasnt", + "have", + "he", + "hence", + "her", + "here", + "here", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "however", + "hundred", + "i", + "ie", + "if", + "in", + "inc", + "indeed", + "interest", + "into", + "is", + "it", + "its", + "itself", + "keep", + "last", + "latter", + "latterly", + "least", + "less", + "ltd", + "made", + "many", + "may", + "me", + "meanwhile", + "might", + "mill", + "mine", + "more", + "moreover", + "most", + "mostly", + "move", + "much", + "must", + "my", + "myself", + "name", + "namely", + "neither", + "never", + "nevertheless", + "next", + "nine", + "no", + "nobody", + "none", + "noone", + "nor", + "not", + "nothing", + "now", + "nowhere", + "of", + "off", + "often", + 
"on", + "once", + "one", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "our", + "ours", + "ourselves", + "out", + "over", + "own", + "part", + "per", + "perhaps", + "please", + "put", + "rather", + "re", + "same", + "see", + "seem", + "seemed", + "seeming", + "seems", + "serious", + "several", + "she", + "should", + "show", + "side", + "since", + "sincere", + "six", + "sixty", + "so", + "some", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhere", + "still", + "such", + "system", + "take", + "ten", + "than", + "that", + "the", + "their", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "therefore", + "therein", + "thereupon", + "these", + "they", + "thick", + "thin", + "third", + "this", + "those", + "though", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "top", + "toward", + "towards", + "twelve", + "twenty", + "two", + "un", + "under", + "until", + "up", + "upon", + "us", + "very", + "via", + "was", + "we", + "we", + "well", + "were", + "what", + "whatever", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "with", + "within", + "without", + "would", + "yet", + "you", + "your", + "yours", + "yourself", + "yourselves" + }; +*/ + + +/* // stop words 119 + +// http://www.textfixer.com/resources/common-english-words.txt + private static final String sw_array[] = new String[]{ + "a", + "able", + "about", + "across", + "after", + "all", + "almost", + "also", + "am", + "among", + "an", + "and", + "any", + "are", + "as", + "at", + "be", + "because", + "been", + "but", + "by", + "can", + "cannot", + "could", + "dear", + "did", + "do", + "does", + "either", + "else", + "ever", + "every", + "for", + "from", + "get", + "got", + "had", + "has", + "have", + "he", + "her", + "hers", + "him", + "his", + "how", + "however", + "i", + "if", + "in", + "into", + "is", + "it", + "its", + "just", + "least", + "let", + "like", + "likely", + "may", + "me", + "might", + "most", + "must", + "my", + "neither", + "no", + "nor", + "not", + "of", + "off", + "often", + "on", + "only", + "or", + "other", + "our", + "own", + "rather", + "said", + "say", + "says", + "she", + "should", + "since", + "so", + "some", + "than", + "that", + "the", + "their", + "them", + "then", + "there", + "these", + "they", + "this", + "tis", + "to", + "too", + "twas", + "us", + "wants", + "was", + "we", + "were", + "what", + "when", + "where", + "which", + "while", + "who", + "whom", + "why", + "will", + "with", + "would", + "yet", + "you", + "your" + }; +*/ \ No newline at end of file diff --git a/src/paradict/ParaphraseDictionary.java b/src/paradict/ParaphraseDictionary.java new file mode 100644 index 0000000..1d87ff1 --- /dev/null +++ b/src/paradict/ParaphraseDictionary.java @@ -0,0 +1,441 @@ +package paradict; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; + + + + +import nlp.tool.CoreNLP; +import qa.Globals; + +public class ParaphraseDictionary { + public static String localDataPath; + public static String dbpedia_relation_paraphrases_baseform_withScore; + public static String 
dbpedia_relation_paraphrases_baseform_withScore_rerank;
+ public static String dbpedia_relation_paraphrases_handwrite;
+ public static String dbpedia_predicate_id;
+ public static String dbpedia_dbo_predicate;
+
+ public HashMap<String, Integer> predicate_2_id = null;
+ public HashMap<Integer, String> id_2_predicate = null;
+ public HashSet<Integer> dbo_predicate_id = null;
+ public HashMap<String, ArrayList<PredicateIDAndSupport>> nlPattern_2_predicateList = null;
+ public HashMap<String, ArrayList<String>> invertedIndex = null;
+
+ public HashSet<String> relns_subject;
+ public HashSet<String> relns_object;
+ public HashSet<String> prepositions;
+ public HashSet<String> bannedTypes;
+
+ //public final int typePredicateID = 1541; //DBpedia 2015
+ public final int typePredicateID = 5157; //DBpedia 2016
+ public int totalPredCount = 0;
+ public int paraphrasedPredCount = 0;
+ public int lineCount = 0;
+
+ /**
+ * Constructor: loads the predicate-ID mappings, the DBO predicate set and the
+ * paraphrase dictionary, then builds the inverted index.
+ */
+ public ParaphraseDictionary () {
+ String fixedPath = Globals.localPath;
+
+ System.out.println(System.getProperty("user.dir"));
+ localDataPath = fixedPath + "data/DBpedia2016/parapharse/";
+ dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt";
+ dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt";
+
+ dbpedia_predicate_id = localDataPath + "16predicate_id.txt";
+ dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt";
+
+ bannedTypes = new HashSet<String>();
+ bannedTypes.add("Mayor");
+
+ relns_subject = new HashSet<String>();
+ relns_subject.add("subj");
+ relns_subject.add("csubjpass");
+ relns_subject.add("csubj");
+ relns_subject.add("xsubj");
+ relns_subject.add("nsubjpass");
+ relns_subject.add("nsubj");
+ relns_subject.add("poss"); // Obama's wife
+ relns_subject.add("dobj");
+
+ relns_object = new HashSet<String>();
+ relns_object.add("dobj");
+ relns_object.add("iobj");
+ relns_object.add("obj");
+ relns_object.add("pobj");
+
+ prepositions = new HashSet<String>();
+ prepositions.add("in"); //in at on with to from before after of for
+ prepositions.add("at");
+ prepositions.add("on");
+ prepositions.add("with");
+ prepositions.add("to");
+ prepositions.add("from");
+ prepositions.add("before");
+ prepositions.add("after");
+ prepositions.add("of");
+ prepositions.add("for");
+ prepositions.add("as");
+
+ try {
+ loadPredicateId();
+ loadDboPredicate();
+ loadParaDict();
+ buildInvertedIndex();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Load the mapping between predicates and their IDs.
+ * @throws IOException + */ + public void loadPredicateId () throws IOException { + predicate_2_id = new HashMap(); + id_2_predicate = new HashMap(); + + String input_filename = dbpedia_predicate_id; + File file = new File(input_filename); + InputStreamReader in = null; + BufferedReader br = null; + try{ + in = new InputStreamReader(new FileInputStream(file), "utf-8"); + br = new BufferedReader(in); + String line = null; + while ((line = br.readLine())!= null) { + String[] lines = line.split("\t"); + predicate_2_id.put(lines[0], Integer.parseInt(lines[1])); + id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]); + } + }catch(IOException e){ + System.out.println("NLPatterns.loadPredicateId() : IOException!"); + e.printStackTrace(); + }finally{ + if(br != null){ + try{ + br.close(); + }catch(IOException e){ + e.printStackTrace(); + } + } + } + System.out.println("NLPatterns.loadPredicateId() : ok!"); + } + + public void loadDboPredicate() throws IOException + { + dbo_predicate_id = new HashSet(); + int cnt = 0; + + String input_filename = dbpedia_dbo_predicate; + InputStreamReader in = null; + BufferedReader br = null; + try{ + File file = new File(input_filename); + in = new InputStreamReader(new FileInputStream(file), "utf-8"); + br = new BufferedReader(in); + String line = null; + while ((line = br.readLine())!= null) + { + if (!predicate_2_id.containsKey(line)) + { + cnt++; + //System.out.println("error: not found "+line+" id."); + continue; + } + dbo_predicate_id.add(predicate_2_id.get(line)); + } + }catch(IOException e){ + System.out.println("NLPatterns.loadDboPredicate() : IOException!"); + + }finally{ + if(br!=null){ + try{ + br.close(); + }catch(IOException e){ + e.printStackTrace(); + } + } + + } + System.out.println("Warning: DBO not found id count: "+cnt); + System.out.println("NLPatterns.loadDboPredicate() : ok!"); + } + + /** + * Get predicate by its id + * @param predicateID + * @return + */ + public String getPredicateById (int predicateID) { + return id_2_predicate.get(predicateID); + } + + public void loadParaDict () throws Exception { + nlPattern_2_predicateList = new HashMap>(); + HashSet missInDBP2014 = new HashSet(); + + InputStreamReader in = null; + BufferedReader br = null; + try{ + String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank; + File file = new File(inputFileName); + in = new InputStreamReader(new FileInputStream(file), "utf-8"); + br = new BufferedReader(in); + String line = null; + int lineCount = 0; + //line = br.readLine();//read the first line which indicates the format + while ((line = br.readLine()) != null) + { + if (line.startsWith("#")) continue; + lineCount ++; + String[] content = line.split("\t"); + + if(!predicate_2_id.containsKey(content[0])) + { + missInDBP2014.add(content[0]); + continue; + } + + int predicateID = predicate_2_id.get(content[0]); + String nlPattern = content[1].toLowerCase(); + int support = Integer.parseInt(content[2]); + //double score = Double.parseDouble(content[3]); + String []slctString = content[3].split(" "); + double[] slct = new double[slctString.length]; + for (int i=0; i < slct.length; i++) { + slct[i] = Double.parseDouble(slctString[i]); + } + + if (!nlPattern_2_predicateList.containsKey(nlPattern)) { + nlPattern_2_predicateList.put(nlPattern, new ArrayList()); + } + nlPattern_2_predicateList.get(nlPattern).add(new PredicateIDAndSupport(predicateID, support, slct)); + } + + System.out.println("Number of NL-Patterns-to-predicate mappings = " + lineCount); + 
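+ // For reference, each non-comment line of the rerank file parsed above is
+ // tab-separated: predicate \t NL pattern \t support \t per-word selectivity
+ // scores (space-separated). A made-up illustration, not a real line from the file:
+ // birthPlace \t be bear in \t 820 \t 0.2 0.1 0.5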
System.out.println("NLPatterns.size = " + nlPattern_2_predicateList.size()); + System.out.println("Predicate.size = " + predicate_2_id.size()); + System.out.println("Warning: Predicates not in DBpedia 2014 count: "+missInDBP2014.size()); + + // Notice predicate itself and handwritten patterns have no wordSelectivity. + addPredicateAsNLPattern(); // This is very important. + addHandwriteAsNLPattern(); + + Iterator it = nlPattern_2_predicateList.keySet().iterator(); + while (it.hasNext()) { + Collections.sort(nlPattern_2_predicateList.get(it.next())); + } + + }catch(IOException e){ + System.out.println("NLPatterns.Paradict() : IOException!"); + }finally{ + if(br!=null){ + try{ + br.close(); + }catch(IOException e){ + e.printStackTrace(); + } + } + } + System.out.println("NLPatterns.Paradict() : ok!"); + } + + /** + * A set of very important NL patterns are the predicates themselves! + */ + public void addPredicateAsNLPattern () { + final int support = 200; + int predicate_id; + for (String p : predicate_2_id.keySet()) + { + // TODO: Omitting some bad relations (should be discarded in future) + if(p.equals("state") || p.equals("states")) + continue; + + predicate_id = predicate_2_id.get(p); + StringBuilder pattern = new StringBuilder(""); + + // Work/runtime 11,SpaceStation/volume 68 and some predicates have prefix (DBpedia 2015), discard the prefix when generating pattern + if(p.contains("/")) + { + if(p.charAt(0)>='A' && p.charAt(0)<='Z') + p = p.substring(p.indexOf("/")+1); + //gameW/l 1974 + else + p = p.replace("/", ""); + } + + int last = 0, i = 0; + for(i = 0; i < p.length(); i ++) { + // if it were not a small letter, then break it. + if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) { + pattern.append(p.substring(last, i).toLowerCase()); + pattern.append(" "); + last = i; + } + } + pattern.append(p.substring(last, i).toLowerCase()); + for (i = 3; i < pattern.length(); i ++) { + // the blank between two digits should be deleted. + if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9' + && pattern.charAt(i-1)==' ' + && pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') { + pattern.deleteCharAt(i-1); + } + // the blank between I and D should be deleted. + else if (pattern.charAt(i)=='d' + && pattern.charAt(i-1)==' ' + && pattern.charAt(i-2)=='i' + && pattern.charAt(i-3)==' ') { + pattern.deleteCharAt(i-1); + } + // the blank between D and B should be deleted. 
+ else if (pattern.charAt(i)=='b' + && pattern.charAt(i-1)==' ' + && pattern.charAt(i-2)=='d' + && pattern.charAt(i-3)==' ') { + pattern.deleteCharAt(i-1); + } + } + + // pattern -> base form + /*String[] ptns = pattern.toString().split(" "); + pattern = new StringBuilder(""); + for (String s : ptns) { + pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s)); + pattern.append(" "); + } + pattern.deleteCharAt(pattern.length()-1); + String patternString = pattern.toString();*/ + + // Special case cannot use base form, eg, foundingYear //TODO: maybe Porter's Algorithm + String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString()); + //System.out.println(p + "-->" + patternString); + + if (!nlPattern_2_predicateList.containsKey(patternString)) { + nlPattern_2_predicateList.put(patternString, new ArrayList()); + } + nlPattern_2_predicateList.get(patternString).add( + new PredicateIDAndSupport(predicate_id, + support, + PredicateIDAndSupport.genSlct(patternString.split(" ").length))); + } + + System.out.println("NLPatterns.addPredicateAsNLPattern(): ok!"); + } + + public void addHandwriteAsNLPattern() throws IOException { + String inputFileName = dbpedia_relation_paraphrases_handwrite; + InputStreamReader in = null; + BufferedReader br = null; + + try{ + File file = new File(inputFileName); + in = new InputStreamReader(new FileInputStream(file), "utf-8"); + br = new BufferedReader(in); + + String line = null; + //int lineCount = 0; + //line = br.readLine();//read the first line which indicates the format + while ((line = br.readLine()) != null) { + if (line.startsWith("#") || line.isEmpty()) continue; + //lineCount ++; + String[] content = line.split("\t"); + + if(!predicate_2_id.containsKey(content[0])) + continue; + + int predicateID = predicate_2_id.get(content[0]); + String nlPattern = content[1].toLowerCase(); + int support = Integer.parseInt(content[2]); + + if (!nlPattern_2_predicateList.containsKey(nlPattern)) { + nlPattern_2_predicateList.put(nlPattern, new ArrayList()); + } + nlPattern_2_predicateList.get(nlPattern).add( + new PredicateIDAndSupport(predicateID, + support, + PredicateIDAndSupport.genSlct(nlPattern.split(" ").length))); + } + }catch(IOException e){ + System.out.println("NLPatterns.addHandwriteAsNLPattern(): IOException!"); + }finally{ + if(br!=null){ + try{ + br.close(); + }catch(IOException e){ + e.printStackTrace(); + } + } + } + + System.out.println("NLPatterns.addHandwriteAsNLPattern(): ok!"); + } + + /** + * Show the NLPatterns + */ + public void showNLPatterns () { + /*for (String s: syntacticMarker) { + System.out.println(s); + } + GlobalTools.systemPause();*/ + + System.out.println("predicate-->id"); + for (String s : predicate_2_id.keySet()) { + System.out.println(s + "-->" + predicate_2_id.get(s)); + } + Globals.systemPause(); + + int count = 1; + System.out.println("nlPattern-->predicate"); + for (String p : nlPattern_2_predicateList.keySet()) { + System.out.print("" + (count++) + ".\t" + p + "\t[" + nlPattern_2_predicateList.get(p).size() + "]\t"); + for (PredicateIDAndSupport i : nlPattern_2_predicateList.get(p)) { + System.out.print(id_2_predicate.get(i.predicateID) + "<" + i.support + ">" + ", "); + } + System.out.println(); + } + } + + /** + * Build the inverted index, where each word will be mapped to the patterns that it occurs + */ + public void buildInvertedIndex () { + invertedIndex = new HashMap>(); + // traversing all patterns + for (String p : nlPattern_2_predicateList.keySet()) { + String[] tokens = p.split(" "); + for 
(String token : tokens) {
+ if (token.length() < 1) continue;
+ if (!invertedIndex.containsKey(token)) {
+ invertedIndex.put(token, new ArrayList<String>());
+ }
+ invertedIndex.get(token).add(p);
+ }
+ }
+
+ System.out.println("NLPatterns.buildInvertedIndex(): ok!");
+ }
+
+ public static void main (String[] args) {
+ Globals.coreNLP = new CoreNLP();
+ Globals.pd = new ParaphraseDictionary();
+ //Globals.pd.showNLPatterns();
+ }
+}
diff --git a/src/paradict/PredicateIDAndSupport.java b/src/paradict/PredicateIDAndSupport.java
new file mode 100644
index 0000000..92eb66b
--- /dev/null
+++ b/src/paradict/PredicateIDAndSupport.java
@@ -0,0 +1,24 @@
+package paradict;
+
+public class PredicateIDAndSupport implements Comparable<PredicateIDAndSupport> {
+ public int predicateID;
+ public int support;
+ public double[] wordSelectivity = null; // wordSelectivity helps rank PATTY patterns more accurately.
+
+ public PredicateIDAndSupport(int _pid, int _support, double[] _slct) {
+ predicateID = _pid;
+ support = _support;
+ wordSelectivity = _slct;
+ }
+
+ public int compareTo(PredicateIDAndSupport o) {
+ return o.support - this.support;
+ }
+
+ // only used for the predicate itself and handwritten paraphrases
+ public static double[] genSlct(int size) {
+ double[] ret = new double[size];
+ for (int i=0;i<size;i++)
+ ret[i] = 1.0; // uniform selectivity (assumed default; the original line was garbled)
+ return ret;
+ }
+}
diff --git a/src/qa/Answer.java b/src/qa/Answer.java
new file mode 100644
--- /dev/null
+++ b/src/qa/Answer.java
@@ -0,0 +1,105 @@
+package qa;
+
+import java.util.ArrayList;
+
+public class Answer implements Comparable<Answer> {
+ public String questionFocusKey=null;
+ public String questionFocusValue=null;
+ public ArrayList<String> otherInformationKey = null;
+ public ArrayList<String> otherInformationValue = null;
+
+ public Answer(String qf, String[] ans) {
+ otherInformationKey = new ArrayList<String>();
+ otherInformationValue = new ArrayList<String>();
+ int p1, p2;
+ for (String line : ans) {
+ System.out.println("line=" + line);
+ if (line.startsWith(qf)) {
+ questionFocusKey = qf;
+ p1 = line.indexOf('<');
+ p2 = line.lastIndexOf('>');
+ String value = null;
+ if (p1 != -1 && p2 != -1) {
+ value = line.substring(p1+1, p2);
+ }
+ else {
+ p1 = line.indexOf('\"');
+ p2 = line.lastIndexOf('\"');
+ if(p1 != -1 && p2 != -1)
+ value = line.substring(p1+1, p2);
+ else
+ {
+ p1 = line.indexOf(':');
+ value = line.substring(p1+1);
+ }
+ }
+ questionFocusValue = value;
+ }
+ else {
+
+ p1 = line.indexOf(':');
+ String key = line.substring(0, p1);
+
+ p1 = line.indexOf('<');
+ p2 = line.lastIndexOf('>');
+ String value = null;
+ if (p1 != -1 && p2 != -1) {
+ value = line.substring(p1+1, p2);
+ }
+ else {
+ p1 = line.indexOf('\"');
+ p2 = line.lastIndexOf('\"');
+ if(p1 != -1 && p2 != -1)
+ value = line.substring(p1+1, p2);
+ else
+ {
+ p1 = line.indexOf(':');
+ value = line.substring(p1+1);
+ }
+ }
+
+ otherInformationKey.add(key);
+ otherInformationValue.add(value);
+ }
+ }
+
+ // Solve BUG: GStore returns garbled characters in questionFocusKey
+ if (questionFocusKey==null || questionFocusValue==null)
+ {
+ questionFocusKey = qf;
+ String line = ans[0];
+ p1 = line.indexOf('<');
+ p2 = line.lastIndexOf('>');
+ String value = null;
+ if (p1 != -1 && p2 != -1) {
+ value = line.substring(p1+1, p2);
+ }
+ else {
+ p1 = line.indexOf('\"');
+ p2 = line.lastIndexOf('\"');
+ if(p1 != -1 && p2 != -1)
+ value = line.substring(p1+1, p2);
+ else
+ {
+ p1 = line.indexOf(':');
+ value = line.substring(p1+1);
+ }
+ }
+ questionFocusValue = value;
+ otherInformationKey.clear();
+ otherInformationValue.clear();
+ }
+
+ /*System.out.println("otherInformationKey.size=" + otherInformationKey.size());
+ for (String k : otherInformationKey) {
+ System.out.println("otherInfoKey = " + k);
+ }*/
+ }
+
+ public int compareTo (Answer p)
+ {
+ return questionFocusValue.compareTo(p.questionFocusValue);
+ }
+
+}
diff --git
a/src/qa/GAnswer.java b/src/qa/GAnswer.java new file mode 100644 index 0000000..c5462c1 --- /dev/null +++ b/src/qa/GAnswer.java @@ -0,0 +1,376 @@ +package qa; + +import java.io.*; +import java.net.Socket; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; + +import jgsc.GstoreConnector; +import log.QueryLogger; +import nlp.ds.Sentence; +import nlp.ds.Sentence.SentenceType; +import qa.parsing.QuestionParsing; +import qa.parsing.BuildQueryGraph; +import rdf.Sparql; +import utils.FileUtil; +import addition.AddtionalFix; +import qa.Globals; + +public class GAnswer { + + public static final int MAX_SPQ_NUM = 3; + + public static void init() { + System.out.println("gAnswer2 init ..."); + + Globals.init(); + + System.out.println("gAnswer2 init ... ok!"); + } + + public QueryLogger getSparqlList(String input) + { + QueryLogger qlog = null; + try + { + if (input.length() <= 5) + return null; + + System.out.println("[Input:] "+input); + + // step 0: Node (entity & type & literal) Recognition + long t0 = System.currentTimeMillis(), t, NRtime; + Query query = new Query(input); + qlog = new QueryLogger(query); + ArrayList rankedSparqls = new ArrayList(); + NRtime = (int)(System.currentTimeMillis()-t0); + System.out.println("step0 [Node Recognition] : "+ NRtime +"ms"); + + // Try to solve each NR plan, and combine the ranked SPARQLs. + // We only reserve LOG of BEST NR plan for convenience. + for(int i=query.sList.size()-1; i>=0; i--) + { + Sentence possibleSentence = query.sList.get(i); + qlog.reloadSentence(possibleSentence); +// qlog.isMaltParserUsed = true; + + // LOG + System.out.println("transQ: "+qlog.s.plainText); + qlog.NRlog = query.preLog; + qlog.SQGlog = "Id: "+query.queryId+"\nQuery: "+query.NLQuestion+"\n"; + qlog.SQGlog += qlog.NRlog; + qlog.timeTable.put("step0", (int)NRtime); + + // step 1: question parsing (dependency tree, sentence type) + t = System.currentTimeMillis(); + QuestionParsing step1 = new QuestionParsing(); + step1.process(qlog); + qlog.timeTable.put("step1", (int)(System.currentTimeMillis()-t)); + + // step 2: build query graph (structure construction, relation extraction, top-k join) + t = System.currentTimeMillis(); + BuildQueryGraph step2 = new BuildQueryGraph(); + step2.process(qlog); +// step2.processEXP(qlog); + qlog.timeTable.put("step2", (int)(System.currentTimeMillis()-t)); + + // step 3: some fix (such as "one-node" or "ask-one-triple") and aggregation + t = System.currentTimeMillis(); + AddtionalFix step3 = new AddtionalFix(); + step3.process(qlog); + + // Collect SPARQLs. + rankedSparqls.addAll(qlog.rankedSparqls); + qlog.timeTable.put("step3", (int)(System.currentTimeMillis()-t)); + } + + // deduplicate in SPARQL + for(Sparql spq: rankedSparqls) + spq.deduplicate(); + + // Sort (descending order). + Collections.sort(rankedSparqls); + qlog.rankedSparqls = rankedSparqls; + System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size()); + + // Detect question focus. + for (int i=0; i rawLines = new ArrayList(); +// DataInputStream dis = new DataInputStream(new BufferedInputStream(socket.getInputStream())); +// while (true) +// { +// String line = dis.readUTF(); +// if (line.equals("[[finish]]")) break; +// rawLines.add(line); +// } +// +// // ASK query was translated to SELECT query, whose answer need translation. +// // It is no need to translate, use "ASK WHERE" directly ! 
2018-12-11 +// if(qlog.s.sentenceType == SentenceType.GeneralQuestion) +// { +// ret.answersNum = 1; +// ret.answers = new String[1][1]; +// if(rawLines.size() == 0) +// { +// ret.answers[0][0] = "general:false"; +// } +// else +// { +// ret.answers[0][0] = "general:true"; +// } +// System.out.println("general question answer:" + ret.answers[0][0]); +// dos.close(); +// dis.close(); +// socket.close(); +// return ret; +// } +// +// //select but no results +// if (rawLines.size() == 0) +// { +// ret.answersNum = 0; +// dos.close(); +// dis.close(); +// socket.close(); +// return ret; +// } +// +// int ansNum = rawLines.size(); +// int varNum = variables.size(); +// ArrayList valist = new ArrayList(variables); +// ret.answers = new String[ansNum][varNum]; +// +// System.out.println("ansNum=" + ansNum); +// System.out.println("varNum=" + varNum); +// for (int i=0;i inputList = FileUtil.readFile("E:/Linyinnian/qald6_special.txt"); + for(String input: inputList) + { + ArrayList outputs = new ArrayList(); + ArrayList spqs = new ArrayList(); + spqs.add("id:"+String.valueOf(i)); + i++; + + long parsing_st_time = System.currentTimeMillis(); + + QueryLogger qlog = ga.getSparqlList(input); + if(qlog == null || qlog.rankedSparqls == null) + continue; + + long parsing_ed_time = System.currentTimeMillis(); + System.out.println("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); + System.out.println("TripleCheck time: "+ qlog.timeTable.get("TripleCheck") + "ms"); + System.out.println("SparqlCheck time: "+ qlog.timeTable.get("SparqlCheck") + "ms"); + System.out.println("Ranked Sparqls: " + qlog.rankedSparqls.size()); + + outputs.add(qlog.SQGlog); + outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms"); + outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms"); + outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); + + long excuting_st_time = System.currentTimeMillis(); + Matches m = null; + System.out.println("[RESULT]"); + ArrayList lastSpqList = new ArrayList(); + int idx; + // Consider top-5 SPARQLs + for(idx=1; idx<=Math.min(qlog.rankedSparqls.size(), 5); idx++) + { + Sparql curSpq = qlog.rankedSparqls.get(idx-1); + String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, curSpq); + lastSpqList.add(stdSPQwoPrefix); + + System.out.println("[" + idx + "]" + "score=" + curSpq.score); + System.out.println(stdSPQwoPrefix); + + // Print top-3 SPARQLs to file. + if(idx <= MAX_SPQ_NUM) +// spqs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix); + outputs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix); + +// // Execute by Virtuoso or GStore when answers not found + if(m == null || m.answers == null) + { + if (curSpq.tripleList.size()>0 && curSpq.questionFocus!=null) + { +// if(ga.isBGP(qlog, curSpq)) + m = ga.getAnswerFromGStore2(curSpq); +// else +// m = ga.getAnswerFromVirtuoso(qlog, curSpq); + } + if (m != null && m.answers != null) + { + // Found results using current SPQ, then we can break and print result. 
+ qlog.sparql = curSpq; + qlog.match = m; + qlog.reviseAnswers(); + System.out.println("Query Executing time: "+ (int)(System.currentTimeMillis() - excuting_st_time)+ "ms"); + } + } + } + + // Some TYPEs can be omitted, (such as ) + if(!qlog.rankedSparqls.isEmpty()) + { + Sparql untypedSparql = ga.getUntypedSparql(qlog.rankedSparqls.get(0)); + if(untypedSparql != null) + { + String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, untypedSparql); + if(!lastSpqList.contains(stdSPQwoPrefix)) +// spqs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); + outputs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); + } + } + outputs.add(qlog.match.toString()); + + FileUtil.writeFile(outputs, "E:/Linyinnian/qald6_special_out.txt", true); + } + + } +} diff --git a/src/qa/Globals.java b/src/qa/Globals.java new file mode 100644 index 0000000..d10c440 --- /dev/null +++ b/src/qa/Globals.java @@ -0,0 +1,118 @@ +package qa; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import lcn.EntityFragmentFields; +import fgmt.RelationFragment; +import fgmt.TypeFragment; +import paradict.ParaphraseDictionary; +import qa.mapping.DBpediaLookup; +import nlp.tool.NERecognizer; +import nlp.tool.CoreNLP; +import nlp.tool.MaltParser; +import nlp.tool.StanfordParser; +import nlp.tool.StopWordsList; + +public class Globals { + // nlp tools + public static CoreNLP coreNLP; + public static StanfordParser stanfordParser; + public static StopWordsList stopWordsList; + public static MaltParser maltParser; + public static NERecognizer nerRecognizer; + // relation paraphrase dictionary + public static ParaphraseDictionary pd; + // entity linking system + public static DBpediaLookup dblk; + public static int MaxAnswerNum = 100; + + /* + * evaluationMethod: + * 1. baseline(SQG), does not allow CIRCLE and WRONG edge. The structure may be different by changing the TARGET. + * 2. super SQG, allow CIRCLE and WRONG edge. The structure is decided by DS tree, and can be changed in query evaluation(TOP-K match) stage. + * */ + public static int evaluationMethod = 2; + public static boolean isRunAsWebServer = false; // Run Local: false; Run Server: true + public static String runningBenchmark = "QALD"; // WQ:WebQuestions; WQSP:WebQuestionsSP; CQ:ComplexQuestions + // using different method and Freebase Version (in Virtuoso.java) + public static boolean usingOperationCondition = false; // only for EXP: try state transition operations only when condition are satisfied. + + + public static String localPath = "/media/wip/husen/NBgAnswer/"; + public static String QueryEngineIP = "127.0.0.1"; // Notice, PORT number is in the evaluation function. 
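+
+ /*
+ * Minimal usage sketch (illustrative only; the sample question is an
+ * assumption, not taken from the project):
+ *
+ * Globals.init(); // loads the NLP tools, dictionaries and fragments below
+ * GAnswer ga = new GAnswer();
+ * QueryLogger qlog = ga.getSparqlList("Who is the wife of Barack Obama?");
+ * // qlog.rankedSparqls then holds the candidate SPARQL queries, best first.
+ */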
+ + public static void init () + { + System.out.println("====== gAnswer2.0 over DBpedia ======"); + + if(isRunAsWebServer == false) + { + localPath = "D:/husen/gAnswer/"; + QueryEngineIP = "172.31.222.72"; + } + + long t1, t2, t3, t4, t5, t6, t7, t8, t9; + + t1 = System.currentTimeMillis(); + coreNLP = new CoreNLP(); + + t2 = System.currentTimeMillis(); + stanfordParser = new StanfordParser(); + + t3 = System.currentTimeMillis(); + maltParser = new MaltParser(); + + t4 = System.currentTimeMillis(); + nerRecognizer = new NERecognizer(); + + t5 = System.currentTimeMillis(); + stopWordsList = new StopWordsList(); + + t6 = System.currentTimeMillis(); + pd = new ParaphraseDictionary(); + + t7 = System.currentTimeMillis(); + try + { + EntityFragmentFields.load(); + RelationFragment.load(); + TypeFragment.load(); + } + catch (Exception e1) { + System.out.println("EntityIDs and RelationFragment and TypeFragment loading error!"); + e1.printStackTrace(); + } + + t8 = System.currentTimeMillis(); + dblk = new DBpediaLookup(); + + t9 = System.currentTimeMillis(); + System.out.println("======Initialization======"); + System.out.println("CoreNLP(Lemma): " + (t2-t1) + "ms."); + System.out.println("StanfordParser: " + (t3-t2) + "ms."); + System.out.println("MaltParser: " + (t4-t3) + "ms."); + System.out.println("NERecognizer: " + (t5-t4) + "ms."); + System.out.println("StopWordsList: " + (t6-t5) + "ms."); + System.out.println("ParaphraseDict & posTagPattern: " + (t7-t6) + "ms."); + System.out.println("GraphFragments: " + (t8-t7) + "ms."); + System.out.println("DBpediaLookup: " + (t9-t8) + "ms."); + System.out.println("* Total *: " + (t9-t1) + "ms."); + System.out.println("=========================="); + } + + + /** + * Use as system("pause") in C + */ + public static void systemPause () { + System.out.println("System pause ..."); + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + try { + br.readLine(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/src/qa/Matches.java b/src/qa/Matches.java new file mode 100644 index 0000000..995b61f --- /dev/null +++ b/src/qa/Matches.java @@ -0,0 +1,9 @@ +package qa; + +public class Matches { + public String[][] answers = null; + public int answersNum = 0; + public long time = 0; + + public static final int pageNum = 3000; +} diff --git a/src/qa/Query.java b/src/qa/Query.java new file mode 100644 index 0000000..6ebada7 --- /dev/null +++ b/src/qa/Query.java @@ -0,0 +1,128 @@ +package qa; + +import java.util.ArrayList; + +import nlp.ds.Sentence; +import qa.extract.EntityRecognition; +import rdf.MergedWord; + +/** + * 1. preprocessing of question + * 2. Node Recognition + * @author husen + */ +public class Query +{ + public String NLQuestion = null; + public String TransferedQuestion = null; + public ArrayList MergedQuestionList = null; + public ArrayList sList = null; + + public String queryId = null; + public String preLog = ""; + + public ArrayList mWordList = null; + + public Query(){} + public Query(String _question) + { + NLQuestion = _question; + NLQuestion = removeQueryId(NLQuestion); + + TransferedQuestion = getTransferedQuestion(NLQuestion); + + // step1. 
NODE Recognition
+ MergedQuestionList = getMergedQuestionList(TransferedQuestion);
+
+ // build Sentence
+ sList = new ArrayList<Sentence>();
+ for(String mergedQuestion: MergedQuestionList)
+ {
+ Sentence sentence = new Sentence(this, mergedQuestion);
+ sList.add(sentence);
+ }
+ }
+
+ public boolean isDigit(char ch)
+ {
+ return ch>='0' && ch<='9';
+ }
+
+ public boolean isUpperWord(char ch)
+ {
+ return ch>='A' && ch<='Z';
+ }
+
+ /**
+ * Rewrite some words into equivalent forms:
+ * 1. forms that the Stanford parser often parses incorrectly;
+ * 2. synonym unification, e.g., movie -> film.
+ * @param question
+ * @return transferred question
+ */
+ public String getTransferedQuestion(String question)
+ {
+ //rule1: discard "." because "." and "_" would be split apart by the parser; also discard a word-tail "'", which may pollute NER
+ question = question.replace("' ", " ");
+ String [] words = question.split(" ");
+ String ret = "";
+ for(String word: words)
+ {
+ String retWord = word;
+ //TODO: for now we only check for a NUM at the head/tail
+ if(word.length()>=2 && !isDigit(word.charAt(0)) && !isDigit(word.charAt(word.length()-1)))
+ {
+ retWord = retWord.replace(".", "");
+ }
+ ret += retWord + " ";
+ }
+ if(ret.length()>1)
+ ret = ret.substring(0,ret.length()-1);
+
+ ret = ret.replace("-", " ");
+ ret = ret.replace("in america", "in United States");
+
+ //rule2: as well as -> and
+ ret = ret.replace("as well as", "and");
+
+ //rule3: movie -> film
+ ret = ret.replace(" movie", " film");
+ ret = ret.replace(" movies", " films");
+
+ return ret;
+ }
+
+ /**
+ * Recognize entities, types & literals in the KB and replace " " in phrases with "_"
+ * @param question
+ * @return merged question list
+ */
+ public ArrayList<String> getMergedQuestionList(String question)
+ {
+ ArrayList<String> mergedQuestionList = null;
+ //entity & type recognition
+ EntityRecognition er = new EntityRecognition();
+ mergedQuestionList = er.process(question);
+ preLog = er.preLog;
+ mWordList = er.mWordList;
+
+ return mergedQuestionList;
+ }
+
+ public String removeQueryId(String question)
+ {
+ String ret = question;
+ int st = question.indexOf("\t");
+ if(st!=-1 && question.length()>1 && question.charAt(0)>='0' && question.charAt(0)<='9')
+ {
+ queryId = question.substring(0,st);
+ ret = question.substring(st+1);
+ System.out.println("Extract QueryId :"+queryId);
+ }
+ return ret;
+ }
+}
diff --git a/src/qa/extract/CorefResolution.java b/src/qa/extract/CorefResolution.java
new file mode 100644
index 0000000..10fea91
--- /dev/null
+++ b/src/qa/extract/CorefResolution.java
@@ -0,0 +1,153 @@
+package qa.extract;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+
+import qa.Globals;
+
+import log.QueryLogger;
+
+import nlp.ds.DependencyTree;
+import nlp.ds.DependencyTreeNode;
+import nlp.ds.Word;
+import rdf.SimpleRelation;
+
+public class CorefResolution {
+ /**
+ * 1. A very simple coreference resolution.
+ * 2. It should run after relation extraction and before item mapping.
+ */
+ public void process(ArrayList<SimpleRelation> simpleRelations, QueryLogger qlog) {
+ if (qlog.s.words.length <= 4) return; // if the sentence is too short, skip the coref step.
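+ // Example of the intended effect: in "Who is married to someone that was born
+ // in Rome?", the relative pronoun "that" resolves to "someone", so a relation
+ // extracted on "that" is re-attached to "someone" (rule (2) in getRefWord below).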
+ System.out.println("=====Co-reference resolution======="); + ArrayList deleteList = new ArrayList(); + + for(SimpleRelation sr : simpleRelations) { + Word w1=null, w2=null; + + if (sr.extractingMethod == 'S') { + w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog); + w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog); + } + else if (sr.extractingMethod == 'M') { + w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog); + w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog); + } + else { + continue; + } + + if (w1 != null) { + sr.arg1Word_beforeCRR = sr.arg1Word; + sr.arg1Word = w1; + } + if (w2 != null) { + sr.arg2Word_beforeCRR = sr.arg2Word; + sr.arg2Word = w2; + } + + if (sr.arg1Word == sr.arg2Word) + deleteList.add(sr); + } + + simpleRelations.removeAll(deleteList); + + printCRR(qlog); + System.out.println("==================================="); + } + + // return the reference word of w + public Word getRefWord (Word w, DependencyTree dt, QueryLogger qlog) { + w = w.getNnHead(); + + if (w.crr != null) { + return w.crr; + } + + /* + * method: (suitable for stanford parser (old version)) + * (1) WDT --det--> [] eg: Which city is located in China? + * (2) WDT -------> V/J --rcmod--> [] eg: Who is married to someone that was born in Rome? + * "when is the sth" is conflict with this rule, so discarded. (3) W -------> be <------- [] eg: Who is the author of WikiLeaks? + * (4) WDT -------> V --ccomp--> [] eg: The actor that married the child of a politician. + * (5) DT(that, which) --dep--> V eg:The actors that married an athlete. // DS parser error. + * (6) W(position=1) ------> NN eg:What are the language used in China? // DS parser error, should eliminate "WRB":When was Carlo Giuliani shot? + * (7) where <--advmod-- V <--advcl-- V --prep/pobj--> [] eg: Who graduate from the school where Keqiang Li graduates? 
+ */ + + DependencyTreeNode dtn = dt.getNodeByIndex(w.position); + + // no need for root + if (dtn.father == null) return null; + + try { + if(dtn.word.posTag.equals("WDT") && dtn.dep_father2child.equals("det")) { // (1) + if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); + w.crr = dtn.father.word.getNnHead(); + } + else if(dtn.word.posTag.startsWith("W") && !dtn.word.posTag.equals("WRB") && dtn.word.position == 1 && dtn.father.word.posTag.equals("NN")) { // (6) + if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); + w.crr = dtn.father.word.getNnHead(); + } + else if(dtn.word.posTag.equals("DT") + && dtn.dep_father2child.equals("dep") + && (dtn.word.baseForm.equals("that")||dtn.word.baseForm.equals("which"))) { // (5) + if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); + w.crr = dtn.father.word.getNnHead(); + } +// else if(dtn.word.posTag.startsWith("W") +// && dtn.father.word.baseForm.equals("be")) { // (3) //&& dtn.dep_father2child.equals("attr") +// DependencyTreeNode target = dtn.father.containDependencyWithChildren("nsubj"); +// if (target != null) { +// if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead()); +// w.crr = target.word.getNnHead(); +// } +// } + else if(dtn.word.posTag.equals("WDT") + && (dtn.father.word.posTag.startsWith("V") || dtn.father.word.posTag.startsWith("J")) + && dtn.father.dep_father2child.equals("rcmod")) { // (2) + if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead()); + w.crr = dtn.father.father.word.getNnHead(); + } + else if(dtn.word.posTag.equals("WDT") + && dtn.father.word.posTag.startsWith("V") + && dtn.father.dep_father2child.equals("ccomp")) { // (4) + if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead()); + w.crr = dtn.father.father.word.getNnHead(); + } + else if (dtn.word.baseForm.equals("where") + && dtn.dep_father2child.equals("advmod") + && dtn.father.dep_father2child.equals("advcl")) { // (7) + DependencyTreeNode target = dtn.father.father.containDependencyWithChildren("prep"); + if (target != null) { + target = target.containDependencyWithChildren("pobj"); + } + else { + for (DependencyTreeNode n : dtn.father.father.childrenList) { + if (Globals.pd.relns_object.contains(n.dep_father2child)) { + target = n; + } + } + } + if (target != null) { + if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead()); + w.crr = target.word.getNnHead(); + } + } + } catch (Exception e) {} + + return w.crr; + } + + public void printCRR (QueryLogger qlog) { + HashSet printed = new HashSet(); + for (Word w : qlog.s.words) { + w = w.getNnHead(); + if (printed.contains(w)) + continue; + if (w.crr != null) + System.out.println("\""+w.getFullEntityName() + "\" is resoluted to \"" + w.crr.getFullEntityName() + "\""); + printed.add(w); + } + } +} diff --git a/src/qa/extract/EntityRecognition.java b/src/qa/extract/EntityRecognition.java new file mode 100644 index 0000000..0901d06 --- /dev/null +++ b/src/qa/extract/EntityRecognition.java @@ -0,0 +1,918 @@ +package qa.extract; + +import java.io.BufferedReader; +//import java.io.File; +//import java.io.FileInputStream; +//import java.io.FileNotFoundException; +//import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +//import java.io.OutputStreamWriter; +//import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; 
+import java.util.HashMap; +import java.util.List; + +import lcn.EntityFragmentFields; +import fgmt.EntityFragment; +import nlp.ds.Word; +import qa.Globals; +import rdf.EntityMapping; +import rdf.NodeSelectedWithScore; +import rdf.TypeMapping; +import rdf.MergedWord; +import utils.FileUtil; +import addition.*; + +/** + * Core class of Node Recognition + * @author husen + */ +public class EntityRecognition { + public String preLog = ""; + public String stopEntFilePath = Globals.localPath + "data/DBpedia2016/parapharse/stopEntDict.txt"; + + double EntAcceptedScore = 26; + double TypeAcceptedScore = 0.5; + double AcceptedDiffScore = 1; + + public HashMap m2e = null; + public ArrayList mWordList = null; + public ArrayList stopEntList = null; + public ArrayList badTagListForEntAndType = null; + ArrayList> selectedList = null; + + TypeRecognition tr = null; + AddtionalFix af = null; + + public EntityRecognition() + { + // LOG + preLog = ""; + loadStopEntityDict(); + + // Bad posTag for entity + badTagListForEntAndType = new ArrayList(); + badTagListForEntAndType.add("RBS"); + badTagListForEntAndType.add("JJS"); + badTagListForEntAndType.add("W"); + badTagListForEntAndType.add("."); + badTagListForEntAndType.add("VBD"); + badTagListForEntAndType.add("VBN"); + badTagListForEntAndType.add("VBZ"); + badTagListForEntAndType.add("VBP"); + badTagListForEntAndType.add("POS"); + + // !Handwriting entity linking; (lower case) + m2e = new HashMap(); + m2e.put("bipolar_syndrome", "Bipolar_disorder"); + m2e.put("battle_in_1836_in_san_antonio", "Battle_of_San_Jacinto"); + m2e.put("federal_minister_of_finance_in_germany", "Federal_Ministry_of_Finance_(Germany)"); + + // Additional fix for CATEGORY (in DBpedia) + af = new AddtionalFix(); + tr = new TypeRecognition(); + + System.out.println("EntityRecognizer Initial : ok!"); + } + + public void loadStopEntityDict() + { + stopEntList = new ArrayList(); + try + { + List inputs = FileUtil.readFile(stopEntFilePath); + for(String line: inputs) + { + if(line.startsWith("#")) + continue; + stopEntList.add(line); + } + } + catch (Exception e) { + e.printStackTrace(); + } + } + + public ArrayList process(String question) + { + ArrayList fixedQuestionList = new ArrayList(); + ArrayList literalList = new ArrayList(); + HashMap entityScores = new HashMap(); + HashMap entityMappings = new HashMap(); + HashMap typeScores = new HashMap(); + HashMap typeMappings = new HashMap(); + HashMap mappingScores = new HashMap(); + ArrayList mustSelectedList = new ArrayList(); + + System.out.println("--------- entity/type recognition start ---------"); + + Word[] words = Globals.coreNLP.getTaggedWords(question); + mWordList = new ArrayList(); + + long t1 = System.currentTimeMillis(); + int checkEntCnt = 0, checkTypeCnt = 0, hitEntCnt = 0, hitTypeCnt = 0, allCnt = 0; + boolean needRemoveCommas = false; + + // Check entity & type + // Notice, ascending order by length + StringBuilder tmpOW = new StringBuilder(); + StringBuilder tmpBW = new StringBuilder(); + for(int len=1; len<=words.length; len++) + { + for(int st=0,ed=st+len; ed<=words.length; st++,ed++) + { + String originalWord = "", baseWord = "", allUpperWord = ""; + //String[] posTagArr = new String[len]; + for(int j=st; j0 && tmp.charAt(0) >='a' && tmp.charAt(0)<='z') + { + String pre = tmp.substring(0,1).toUpperCase(); + tmp = pre + tmp.substring(1); + } + allUpperWord += tmp; + + if(j < ed-1) + { + //originalWord += "_"; + //baseWord += "_"; + tmpOW.append("_"); + tmpBW.append("_"); + } + } + originalWord = tmpOW.toString(); + 
baseWord=tmpBW.toString(); + tmpOW.setLength(0); + tmpBW.setLength(0); + + allCnt++; +/* + * Filters to save time and drop some bad cases. +*/ + boolean entOmit = false, typeOmit = false; + int prep_cnt=0; + + // Upper words can pass filter. eg: "Melbourne , Florida" + int UpperWordCnt = 0; + for(int i=st;i='A' && words[i].originalForm.charAt(0)<='Z') + || ((words[i].posTag.equals(",") || words[i].originalForm.equals("'")) && i>st && i0) + { + Word formerWord = words[st-1]; + //as princess + if(formerWord.baseForm.equals("as")) + entOmit = true; + //how many dogs? + if(formerWord.baseForm.equals("many")) + entOmit = true; + + //obama's daughter ; your height | len=1 to avoid: Asimov's Foundation series + if(len == 1 && (formerWord.posTag.startsWith("POS") || formerWord.posTag.startsWith("PRP"))) + entOmit = true; + //the father of you + if(ed='A' && nextWord.originalForm.charAt(0)<='Z') + entOmit = true; + } + + for(int i=st;i= 3) + { + entOmit = true; + typeOmit = true; + } + } +/* + * Filter done. +*/ + + // Search category | highest priority + String category = null; + if(af.pattern2category.containsKey(baseWord)) + { + typeOmit = true; + entOmit = true; + category = af.pattern2category.get(baseWord); + } + + // Search type + int hitMethod = 0; // 1=dbo(baseWord), 2=dbo(originalWord), 3=yago|extend() + ArrayList tmList = new ArrayList(); + if(!typeOmit) + { + System.out.println("Type Check: "+originalWord); + //checkTypeCnt++; + //search standard type + tmList = tr.getTypeIDsAndNamesByStr(baseWord); + if(tmList == null || tmList.size() == 0) + { + tmList = tr.getTypeIDsAndNamesByStr(originalWord); + if(tmList != null && tmList.size()>0) + hitMethod = 2; + } + else + hitMethod = 1; + + //Search extend type (YAGO type) + if(tmList == null || tmList.size() == 0) + { + tmList = tr.getExtendTypeByStr(allUpperWord); + if(tmList != null && tmList.size() > 0) + { + preLog += "++++ Extend Type detect: "+baseWord+": "+" prefferd relaiton:"+tmList.get(0).prefferdRelation+"\n"; + hitMethod = 3; + } + } + } + + // Search entity + ArrayList emList = new ArrayList(); + if(!entOmit && !stopEntList.contains(baseWord)) + { + System.out.println("Ent Check: "+originalWord); + checkEntCnt++; + // Notice, the second parameter is whether use DBpedia Lookup. + emList = getEntityIDsAndNamesByStr(originalWord, (UpperWordCnt>=len-1 || len==1),len); + if(emList == null || emList.size() == 0) + { + emList = getEntityIDsAndNamesByStr(baseWord, (UpperWordCnt>=len-1 || len==1), len); + } + if(emList!=null && emList.size()>10) + { + ArrayList tmpList = new ArrayList(); + for(int i=0;i<10;i++) + { + tmpList.add(emList.get(i)); + } + emList = tmpList; + } + } + + MergedWord mWord = new MergedWord(st,ed,originalWord); + + // Add category + if(category != null) + { + mWord.mayCategory = true; + mWord.category = category; + int key = st*(words.length+1) + ed; + mustSelectedList.add(key); + } + + // Add literal + if(len==1 && checkLiteralWord(words[st])) + { + mWord.mayLiteral = true; + int key = st*(words.length+1) + ed; + literalList.add(key); + } + + // Add type mappings + if(tmList!=null && tmList.size()>0) + { + // Drop by score threshold + if(tmList.get(0).score < TypeAcceptedScore) + typeOmit = true; + + // Only allow EXACT MATCH when method=1|2 + // TODO: consider approximate match and taxonomy. 
e.g., actor -> person
+ String likelyType = tmList.get(0).typeName.toLowerCase();
+ String candidateBase = baseWord.replace("_", ""), candidateOriginal = originalWord.replace("_", "").toLowerCase();
+ if(!candidateBase.equals(likelyType) && hitMethod == 1)
+ typeOmit = true;
+ if(!candidateOriginal.equals(likelyType) && hitMethod == 2)
+ typeOmit = true;
+
+ if(!typeOmit)
+ {
+ mWord.mayType = true;
+ mWord.tmList = tmList;
+
+ int key = st*(words.length+1) + ed; // encode the span [st, ed) as a single int key
+ typeMappings.put(key, tmList.get(0).typeName);
+ typeScores.put(key, tmList.get(0).score);
+ }
+ }
+
+ // Add entity mappings
+ if(emList!=null && emList.size()>0)
+ {
+ // Drop by score threshold
+ if(emList.get(0).score < EntAcceptedScore)
+ entOmit = true;
+
+ // Drop: the [German Shepherd] dog
+ else if(len > 2)
+ {
+ for(int key: entityMappings.keySet())
+ {
+ //int te=key%(words.length+1);
+ int ts=key/(words.length+1);
+ if(ts == st+1 && ts <= ed)
+ {
+ //DT in lowercase (allow uppercase, such as: [The Pillars of the Earth])
+ if(words[st].posTag.startsWith("DT") && !(words[st].originalForm.charAt(0)>='A'&&words[st].originalForm.charAt(0)<='Z'))
+ {
+ entOmit = true;
+ }
+ }
+ }
+ }
+
+ // Record info in merged word
+ if(!entOmit)
+ {
+ mWord.mayEnt = true;
+ mWord.emList = emList;
+
+ // used for deduplication and selection
+ int key = st*(words.length+1) + ed;
+ entityMappings.put(key, emList.get(0).entityID);
+
+ // fix entity score | conflict resolution
+ double score = emList.get(0).score;
+ String likelyEnt = emList.get(0).entityName.toLowerCase().replace(" ", "_");
+ String lowerOriginalWord = originalWord.toLowerCase();
+ // !Award: whole match
+ if(likelyEnt.equals(lowerOriginalWord))
+ score *= len;
+ // !Award: COVER (eg, Robert Kennedy: [Robert] [Kennedy] [Robert Kennedy])
+ // e.g., for Social_Democratic_Party any combination of the three words is an entity, which yields too many plans; "merge or not merge" matters more than "which conflicting entity to pick" (and most real errors are merge/no-merge errors), so smaller covered entities are simply discarded here
+ // e.g., for Abraham_Lincoln the "no merge" plan would recognize two separate nodes whose final score beats the correct answer's; such phrases are therefore forced to be selected
+ if(len>1)
+ {
+ boolean[] flag = new boolean[words.length+1];
+ ArrayList<Integer> needlessEntList = new ArrayList<Integer>();
+ double tmpScore=0;
+ for(int preKey: entityMappings.keySet())
+ {
+ if(preKey == key)
+ continue;
+ int te=preKey%(words.length+1),ts=preKey/(words.length+1);
+ for(int i=ts;i<te;i++)
+ flag[i] = true;
+ if(st <= ts && ed >= te)
+ {
+ needlessEntList.add(preKey);
+ tmpScore += entityScores.get(preKey);
+ }
+ }
+ int hitCnt = 0;
+ for(int i=st;i<ed;i++)
+ if(flag[i])
+ hitCnt++;
+ if(((double)hitCnt/(double)len > 0.6 && (double)UpperWordCnt/(double)len > 0.6) || UpperWordCnt == len || len>=4)
+ {
+ // If the phrase contains a comma, the words on both sides must appear in the mapped entity:
+ // e.g., Melbourne_,_Florida: "Melbourne, Florida" must be selected, while California_,_USA: "Malibu, California" is not necessarily correct
+ boolean commaTotalRight = true;
+ if(originalWord.contains(","))
+ {
+ String candidateCompactString = originalWord.replace(",","").replace("_", "").toLowerCase();
+ String likelyCompactEnt = likelyEnt.replace(",","").replace("_", "");
+ if(!candidateCompactString.equals(likelyCompactEnt))
+ commaTotalRight = false;
+ else
+ {
+ mWord.name = mWord.name.replace("_,_","_");
+ needRemoveCommas = true;
+ }
+ }
+ if(commaTotalRight)
+ {
+ mustSelectedList.add(key);
+ if(tmpScore>score)
+ score = tmpScore+1;
+ for(int preKey: needlessEntList)
+ {
+ entityMappings.remove(preKey);
+ mustSelectedList.remove(Integer.valueOf(preKey));
+ }
+ }
+ }
+ }
+ //NOTICE: the score inside mWord is unchanged; we only update the score in entityScores.
+ entityScores.put(key,score); + } + } + + if(mWord.mayCategory || mWord.mayEnt || mWord.mayType || mWord.mayLiteral) + mWordList.add(mWord); + } + } + + /* Print all candidates (use fixed score).*/ + System.out.println("------- Result ------"); + for(MergedWord mWord: mWordList) + { + int key = mWord.st * (words.length+1) + mWord.ed; + if(mWord.mayCategory) + { + System.out.println("Detect category mapping: "+mWord.name+": "+ mWord.category +" score: 100.0"); + preLog += "++++ Category detect: "+mWord.name+": "+mWord.category+" score: 100.0\n"; + } + if(mWord.mayEnt) + { + System.out.println("Detect entity mapping: "+mWord.name+": ["); + for(EntityMapping em: mWord.emList) + System.out.print(em.entityName + ", "); + System.out.println("]"); + preLog += "++++ Entity detect: "+mWord.name+": "+mWord.emList.get(0).entityName+" score:"+entityScores.get(key)+"\n"; + hitEntCnt++; + } + if(mWord.mayType) + { + System.out.println("Detect type mapping: "+mWord.name+": ["); + for(TypeMapping tm: mWord.tmList) + System.out.print(tm.typeName + ", "); + System.out.println("]"); + preLog += "++++ Type detect: "+mWord.name+": "+mWord.tmList.get(0).typeName +" score:"+typeScores.get(key)+"\n"; + hitTypeCnt++; + } + if(mWord.mayLiteral) + { + System.out.println("Detect literal: "+mWord.name); + preLog += "++++ Literal detect: "+mWord.name+"\n"; + } + } + + /* + * Sort by score and remove duplicate. + * eg, <"video_game" "ent:Video game" "50.0"> <"a_video_game" "ent:Video game" "45.0">. + * Notice, reserve all information in mWordList. + */ + // one ENT maps different mergedWord in query, reserve the higher score. + ByValueComparator bvc = new ByValueComparator(entityScores,words.length+1); + List keys = new ArrayList(entityMappings.keySet()); + Collections.sort(keys, bvc); + for(Integer key : keys) + { + if(!mappingScores.containsKey(entityMappings.get(key))) + mappingScores.put(entityMappings.get(key), entityScores.get(key)); + else + entityMappings.remove(key); + } + + selectedList = new ArrayList>(); + ArrayList selected = new ArrayList(); + + // Some phrases must be selected. + selected.addAll(mustSelectedList); + for(Integer key: typeMappings.keySet()) + { + // !type(len>1) (Omit len=1 because: [Brooklyn Bridge] is a entity. 
+ int ed = key%(words.length+1), st = key/(words.length+1); + if(st+1 < ed) + { + boolean beCovered = false; + //Entity cover type, eg:[prime_minister of Spain] + for(int preKey: entityMappings.keySet()) + { + int te=preKey%(words.length+1),ts=preKey/(words.length+1); + //Entiy should longer than type + if(ts <= st && te >= ed && ed-st < te-ts) + { + beCovered = true; + } + } + + if(!beCovered) + selected.add(key); + } + } + + // Conflict resolution + ArrayList noConflictSelected = new ArrayList(); + + //select longer one when conflict + boolean[] flag = new boolean[words.length]; + ByLenComparator blc = new ByLenComparator(words.length+1); + Collections.sort(selected,blc); + + for(Integer key : selected) + { + int ed = key%(words.length+1), st = (key-ed)/(words.length+1); + boolean omit = false; + for(int i=st;i top-k decision + dfs(keys,0,noConflictSelected,words.length+1); + ArrayList nodeSelectedWithScoreList = new ArrayList(); + for(ArrayList select: selectedList) + { + double score = 0; + for(Integer key: select) + { + if(entityScores.containsKey(key)) + score += entityScores.get(key); + if(typeScores.containsKey(key)) + score += typeScores.get(key); + } + NodeSelectedWithScore tmp = new NodeSelectedWithScore(select, score); + nodeSelectedWithScoreList.add(tmp); + } + Collections.sort(nodeSelectedWithScoreList); + + // Replace + int cnt = 0; + for(int k=0; k= nodeSelectedWithScoreList.size()) + break; + selected = nodeSelectedWithScoreList.get(k).selected; + + Collections.sort(selected); + int j = 0; + String res = question; + if(selected.size()>0) + { + res = words[0].originalForm; + int tmp = selected.get(j++), st = tmp/(words.length+1), ed = tmp%(words.length+1); + for(int i=1;ist && i= ed && j= 3) // top-3 + break; + } + long t2 = System.currentTimeMillis(); +// preLog += "Total hit/check/all ent num: "+hitEntCnt+" / "+checkEntCnt+" / "+allCnt+"\n"; +// preLog += "Total hit/check/all type num: "+hitTypeCnt+" / "+checkTypeCnt+" / "+allCnt+"\n"; + preLog += "Node Recognition time: "+ (t2-t1) + "ms\n"; + System.out.println("Total check time: "+ (t2-t1) + "ms"); + System.out.println("--------- pre entity/type recognition end ---------"); + + return fixedQuestionList; + } + + public void dfs(List keys,int dep,ArrayList selected,int size) + { + if(dep == keys.size()) + { + ArrayList tmpList = (ArrayList) selected.clone(); + selectedList.add(tmpList); + } + else + { + //off: dep-th mWord + dfs(keys,dep+1,selected,size); + //on: no conflict + boolean conflict = false; + for(int preKey: selected) + { + int curKey = keys.get(dep); + int preEd = preKey%size, preSt = (preKey-preEd)/size; + int curEd = curKey%size, curSt = (curKey-curEd)/size; + if(!(preSt getEntityIDsAndNamesByStr(String entity, boolean useDblk, int len) + { + String n = entity; + ArrayList ret= new ArrayList(); + + //1. Handwriting + if(m2e.containsKey(entity)) + { + String eName = m2e.get(entity); + EntityMapping em = new EntityMapping(EntityFragmentFields.entityName2Id.get(eName), eName, 1000); + ret.add(em); + return ret; //handwriting is always correct + } + + //2. Lucene index + ret.addAll(EntityFragment.getEntityMappingList(n)); + + //3. 
DBpedia Lookup (some cases) + if (useDblk) + { + ret.addAll(Globals.dblk.getEntityMappings(n, null)); + } + + Collections.sort(ret); + + if (ret.size() > 0) return ret; + else return null; + } + + public int preferDBpediaLookupOrLucene(String entityName) + { + int cntUpperCase = 0; + int cntSpace = 0; + int cntPoint = 0; + int length = entityName.length(); + for (int i=0; i='A' && c<='Z') + cntUpperCase++; + } + + if ((cntUpperCase>0 || cntPoint>0) && cntSpace<3) + return 1; + if (cntUpperCase == length) + return 1; + return 0; + } + + static class ByValueComparator implements Comparator { + HashMap base_map; + int base_size; + double eps = 1e-8; + + int dblcmp(double a,double b) + { + if(a+eps < b) + return -1; + return b+eps base_map, Integer size) { + this.base_map = base_map; + this.base_size = size; + } + + public int compare(Integer arg0, Integer arg1) { + if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) { + return 0; + } + + if (dblcmp(base_map.get(arg0),base_map.get(arg1))<0) { + return 1; + } + else if (dblcmp(base_map.get(arg0),base_map.get(arg1))==0) + { + int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; + if (len0 < len1) { + return 1; + } else if (len0 == len1) { + return 0; + } else { + return -1; + } + } + else { + return -1; + } + } + } + + static class ByLenComparator implements Comparator { + int base_size; + + public ByLenComparator(int size) { + this.base_size = size; + } + + public int compare(Integer arg0, Integer arg1) { + int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; + if (len0 < len1) { + return 1; + } else if (len0 == len1) { + return 0; + } else { + return -1; + } + } + } + + public boolean isDigit(char ch) + { + if(ch>='0' && ch<='9') + return true; + return false; + } + + //TODO: other literal words. + public boolean checkLiteralWord(Word word) + { + boolean ok = false; + if(word.posTag.equals("CD")) + ok = true; + return ok; + } + + public static void main (String[] args) + { + Globals.init(); + EntityRecognition er = new EntityRecognition(); + try + { + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + while (true) + { + System.out.println("Please input the question: "); + String question = br.readLine(); + + er.process(question); + } + +// File inputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_in.txt"); +// File outputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_out.txt"); +// BufferedReader fr = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile),"utf-8")); +// OutputStreamWriter fw = new OutputStreamWriter(new FileOutputStream(outputFile,true),"utf-8"); +// +// String input; +// while((input=fr.readLine())!=null) +// { +// String[] strArray = input.split("\t"); +// String id = ""; +// String question = strArray[0]; +// if(strArray.length>1) +// { +// question = strArray[1]; +// id = strArray[0]; +// } +// //Notice "?" may leads lucene/dbpedia lookup error +// if(question.length()>1 && question.charAt(question.length()-1)=='.' 
|| question.charAt(question.length()-1)=='?') +// question = question.substring(0,question.length()-1); +// if(question.isEmpty()) +// continue; +// er.process(question); +// fw.write("Id: "+id+"\nQuery: "+question+"\n"); +// fw.write(er.preLog+"\n"); +// } +// +// fr.close(); +// fw.close(); + + } catch (IOException e) { + e.printStackTrace(); + } + } + +} diff --git a/src/qa/extract/ExtractImplicitRelation.java b/src/qa/extract/ExtractImplicitRelation.java new file mode 100644 index 0000000..598788a --- /dev/null +++ b/src/qa/extract/ExtractImplicitRelation.java @@ -0,0 +1,467 @@ +package qa.extract; + +import java.io.BufferedReader; +//import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; + +import paradict.ParaphraseDictionary; +import qa.Globals; +import rdf.Sparql; +import rdf.Triple; +import rdf.ImplicitRelation; +import lcn.EntityFragmentFields; +import log.QueryLogger; +import fgmt.EntityFragment; +import fgmt.TypeFragment; +import nlp.ds.Word; +import nlp.tool.CoreNLP; + +public class ExtractImplicitRelation { + + static final int SamplingNumber = 100; // the maximum sampling number in calculation + static final int k = 3; // select top-k when many suitable relations; select top-k entities for a word + public HashMap implicitEntRel = new HashMap(); + + /* + * Implicit Relations: + * eg, Which is the film directed by Obama and starred by a Chinese ?x + * 1. [What] is in a [chocolate_chip_cookie] ?var + ent + * 2. What [country] is [Sitecore] from ?type + ent = [?var p ent + ?var<-type] + * 3. Czech movies | Chinese actor ent + ?type + * 4. President Obama type + ent + * 5. Andy Liu's Hero(film) ent + ent + * */ + public ExtractImplicitRelation() + { + //orignal word to lower case + implicitEntRel.put("american", Globals.pd.predicate_2_id.get("country")); + implicitEntRel.put("united_states", Globals.pd.predicate_2_id.get("country")); + } + + // Notice, it is usually UNNECESSARY for two constant, so we unimplemented this function. + // eg, "president Obama", "Andy Liu's Hero(film)". + public ArrayList getPrefferdPidListBetweenTwoConstant(Word w1, Word w2) + { + ArrayList res = new ArrayList(); + int w1Role = 0, w2Role = 0; // 0:var 1:ent 2:type + if(w1.mayEnt && w1.emList.size()>0) + w1Role = 1; + if(w1.mayType && w1.tmList.size()>0) + w1Role = 2; + if(w2.mayEnt && w2.emList.size()>0) + w2Role = 1; + if(w2.mayType && w2.tmList.size()>0) + w2Role = 2; + + //Reject variables | two types + if(w1Role == 0 || w2Role == 0 || (w1Role == 2 && w2Role == 2)) + return null; + + //ent1 & ent2 + //if(w1Role == 1 && w2Role == 1) + //{ + //EntityFragment ef = null; + // TODO: implement. 
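+ // A possible sketch (an assumption, mirroring the entity-type case below):
+ // load the EntityFragment of w1's top candidate entity, then collect every
+ // predicate id p such that its outEntMap maps w2's entity id to p (w1 --p--> w2)
+ // or its inEntMap maps w2's entity id to p (w2 --p--> w1), and wrap each p
+ // in an ImplicitRelation.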
+ //} + + return res; + } + + public ArrayList<Triple> supplementTriplesByModifyWord(QueryLogger qlog) + { + ArrayList<Triple> res = new ArrayList<Triple>(); + ArrayList<Word> typeVariableList = new ArrayList<Word>(); + + // Modifier + for(Word word: qlog.s.words) + { + if(word.modifiedWord != null && word.modifiedWord != word) + { + ArrayList<ImplicitRelation> irList = null; + // ent -> typeVariable | eg, Chinese actor, Czech movies | TODO: consider more types of modifier + if(word.mayEnt && word.modifiedWord.mayType) + { + typeVariableList.add(word.modifiedWord); + int tId = word.modifiedWord.tmList.get(0).typeID; // select the top-1 type + String tName = word.modifiedWord.originalForm; + for(int i=0; i<word.emList.size() && i<k; i++) + { + int eId = word.emList.get(i).entityID; + String eName = word.emList.get(i).entityName; + irList = getPrefferdPidListBetween_Entity_TypeVariable(eId, tId); + if(irList != null && irList.size()>0) + { + ImplicitRelation ir = irList.get(0); + String subjName = null, objName = null; + Word subjWord = null, objWord = null; + if(ir.subjId == eId) + { + subjName = eName; + objName = "?"+tName; + subjWord = word; + objWord = word.modifiedWord; + } + else + { + subjName = "?"+tName; + objName = eName; + subjWord = word.modifiedWord; + objWord = word; + } + Triple triple = new Triple(ir.subjId, subjName, ir.pId, ir.objId, objName, null, ir.score, subjWord, objWord); + res.add(triple); + break; + } + } + } + } + } + + if(qlog.rankedSparqls == null || qlog.rankedSparqls.size() == 0) + { + if(res != null && res.size() > 0) + { + Sparql spq = new Sparql(); + for(Triple t: res) + spq.addTriple(t); + + // Add type info + for(Word typeVar: typeVariableList) + { + Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+typeVar.originalForm, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeVar.tmList.get(0).typeName, null, 100); + spq.addTriple(triple); + } + + qlog.rankedSparqls.add(spq); + } + } + else + { + // Supplement implicit relations (modified) for each SPARQL. + for(Sparql spq: qlog.rankedSparqls) + { + for(Triple t: res) + spq.addTriple(t); + } + } + + return res; + } + + /* + * eg:Czech|ent movies|?type Chinese|ent actor|?type + * type variable + entity -> entities belonging to the type + entity + * */ + public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_TypeVariable(Integer entId, Integer typeId) + { + ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); + + TypeFragment tf = TypeFragment.typeFragments.get(typeId); + EntityFragment ef2 = EntityFragment.getEntityFragmentByEntityId(entId); + if(tf == null || ef2 == null) + { + System.out.println("Error in getPrefferdPidListBetween_TypeVariable_Entity: Type(" + + TypeFragment.typeId2ShortName.get(typeId) + ") or Entity(" + EntityFragmentFields.entityId2Name.get(entId) + ") no fragments."); + return null; + } + + // select entities belonging to the type, count relations | TODO: random select + int samplingCnt = 0; + HashMap<ImplicitRelation, Integer> irCount = new HashMap<ImplicitRelation, Integer>(); + for(int candidateEid: tf.entSet) + { + EntityFragment ef1 = EntityFragment.getEntityFragmentByEntityId(candidateEid); + if(ef1 == null) + continue; + + ArrayList<ImplicitRelation> tmp = getPrefferdPidListBetween_TwoEntities(ef1, ef2); + if(tmp == null || tmp.size() == 0) + continue; + + if(samplingCnt++ > SamplingNumber) + break; + + for(ImplicitRelation ir: tmp) + { + if(ir.subjId == candidateEid) + ir.setSubjectId(Triple.VAR_ROLE_ID); + else if(ir.objId == candidateEid) + ir.setObjectId(Triple.VAR_ROLE_ID); + + if(irCount.containsKey(ir)) + irCount.put(ir, irCount.get(ir)+1); + else + irCount.put(ir, 1); + } + } + + //sort, get top-k + ByValueComparator bvc = new ByValueComparator(irCount); + List<ImplicitRelation> keys = new ArrayList<ImplicitRelation>(irCount.keySet()); + Collections.sort(keys, bvc); + for(ImplicitRelation ir: keys) + { + res.add(ir); + if(res.size() >= k) + break; + } + + return res; + } + + public ArrayList<ImplicitRelation>
getPrefferdPidListBetween_Entity_TypeVariable(String entName, String typeName) + { + if(!TypeFragment.typeShortName2IdList.containsKey(typeName) || !EntityFragmentFields.entityName2Id.containsKey(entName)) + return null; + return getPrefferdPidListBetween_Entity_TypeVariable(EntityFragmentFields.entityName2Id.get(entName), TypeFragment.typeShortName2IdList.get(typeName).get(0)); + } + + static class ByValueComparator implements Comparator<ImplicitRelation> { + HashMap<ImplicitRelation, Integer> base_map; + + public ByValueComparator(HashMap<ImplicitRelation, Integer> base_map) { + this.base_map = base_map; + } + + public int compare(ImplicitRelation arg0, ImplicitRelation arg1) { + if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) + return 0; + if (base_map.get(arg0) < base_map.get(arg1)) + return 1; + else if (base_map.get(arg0).equals(base_map.get(arg1))) + return 0; + else + return -1; + } + } + + /* + * Notice, this function has not actually been used. + * eg:[What] is in a [chocolate_chip_cookie] + * Just guess by single entity: select the most frequent edge. + * */ + public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(Integer entId, String var) + { + ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); + + EntityFragment ef = null; + ef = EntityFragment.getEntityFragmentByEntityId(entId); + + if(ef == null) + { + System.out.println("Error in getPrefferdPidListBetween_Entity_Variable: Entity No Fragments!"); + return null; + } + + // find most frequent inEdge + int pid = findMostFrequentEdge(ef.inEntMap, ef.inEdges); + if(pid != -1) + res.add(new ImplicitRelation(Triple.VAR_ROLE_ID, entId, pid, 100)); + + // find most frequent outEdge + pid = findMostFrequentEdge(ef.outEntMap, ef.outEdges); + if(pid != -1) + res.add(new ImplicitRelation(entId, Triple.VAR_ROLE_ID, pid, 100)); + + return res; + } + + public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(String entName, String var) + { + return getPrefferdPidListBetween_Entity_Variable(EntityFragmentFields.entityName2Id.get(entName), var); + } + + public int findMostFrequentEdge(HashMap<Integer, ArrayList<Integer>> entMap, HashSet<Integer> edges) + { + int mfPredicateId = -1, maxCount = 0; + HashMap<Integer, Integer> edgeCount = new HashMap<Integer, Integer>(); + for(int key: entMap.keySet()) + { + for(int edge: entMap.get(key)) + { + if(!edgeCount.containsKey(edge)) + edgeCount.put(edge, 1); + else + edgeCount.put(edge, edgeCount.get(edge)+1); + if(maxCount < edgeCount.get(edge)) + { + maxCount = edgeCount.get(edge); + mfPredicateId = edge; + } + } + } + + return mfPredicateId; + } + + // Unnecessary.
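+ // Kept for completeness: if the entity is an instance of the given type, the + // method below returns a single rdf:type ImplicitRelation (entity as subject, + // type as object). A hypothetical call for case 4 of the class comment + // ("President Obama", type + ent) would be: + // getPrefferdPidListBetween_TypeConstant_Entity(presidentTypeId, obamaEntId)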
+ public ArrayList getPrefferdPidListBetween_TypeConstant_Entity(Integer typeId, Integer entId) + { + ArrayList res = new ArrayList(); + TypeFragment tf = TypeFragment.typeFragments.get(typeId); + + if(tf == null) + { + System.out.println("Error in getPrefferdPidListBetween_TypeConstant_Entity: Type No Fragments!"); + return null; + } + + // subj : ent1 + if(tf.entSet.contains(entId)) + { + ImplicitRelation ir = new ImplicitRelation(entId, typeId, Globals.pd.typePredicateID, 100); + res.add(ir); + } + + return res; + } + + public ArrayList getPrefferdPidListBetween_TwoEntities(String eName1, String eName2) + { + return getPrefferdPidListBetween_TwoEntities(EntityFragmentFields.entityName2Id.get(eName1), EntityFragmentFields.entityName2Id.get(eName2)); + } + + public ArrayList getPrefferdPidListBetween_TwoEntities(Integer eId1, Integer eId2) + { + EntityFragment ef1 = null, ef2 = null; + ef1 = EntityFragment.getEntityFragmentByEntityId(eId1); + ef2 = EntityFragment.getEntityFragmentByEntityId(eId2); + + if(ef1 == null || ef2 == null) + { + System.out.println("Error in GetPrefferdPidListBetweenTwoEntities: Entity No Fragments!"); + return null; + } + + return getPrefferdPidListBetween_TwoEntities(ef1,ef2); + } + + public ArrayList getPrefferdPidListBetween_TwoEntities(EntityFragment ef1, EntityFragment ef2) + { + ArrayList res = new ArrayList(); + if(ef1 == null || ef2 == null) + return null; + + int eId1 = ef1.eId; + int eId2 = ef2.eId; + + // subj : ent1 + if(ef1.outEntMap.containsKey(eId2)) + { + ArrayList pidList = ef1.outEntMap.get(eId2); + for(int pid: pidList) + { + // TODO: other score strategy + ImplicitRelation ir = new ImplicitRelation(eId1, eId2, pid, 100); + res.add(ir); + } + } + // subj : ent2 + else if(ef2.outEntMap.containsKey(eId1)) + { + ArrayList pidList = ef2.outEntMap.get(eId1); + for(int pid: pidList) + { + ImplicitRelation ir = new ImplicitRelation(eId2, eId1, pid, 100); + res.add(ir); + } + } + + return res; + } + + public static void main(String[] args) throws Exception { + + Globals.coreNLP = new CoreNLP(); + Globals.pd = new ParaphraseDictionary(); + try + { + EntityFragmentFields.load(); + TypeFragment.load(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + ExtractImplicitRelation eir = new ExtractImplicitRelation(); + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + + String name1,name2; + while(true) + { + System.out.println("Input two node to extract their implicit relations:"); + name1 = br.readLine(); + name2 = br.readLine(); + + ArrayList irList = null; + + irList = eir.getPrefferdPidListBetween_TwoEntities(name1, name2); + if(irList == null || irList.size()==0) + System.out.println("Can't find!"); + else + { + for(ImplicitRelation ir: irList) + { + int pId = ir.pId; + String p = Globals.pd.getPredicateById(pId); + System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); + System.out.println(ir.subj+"\t"+p+"\t"+ir.obj); + } + } + +// irList = eir.getPrefferdPidListBetween_TypeConstant_Entity(name1, name2); +// if(irList == null || irList.size()==0) +// System.out.println("Can't find!"); +// else +// { +// for(ImplicitRelation ir: irList) +// { +// int pId = ir.pId; +// String p = Globals.pd.getPredicateById(pId); +// System.out.println(ir.subj+"\t"+p+"\t"+ir.obj); +// } +// } + +// irList = eir.getPrefferdPidListBetween_Entity_Variable(name1, name2); +// if(irList == null || irList.size()==0) +// System.out.println("Can't find!"); +// else +// { +// for(ImplicitRelation ir: irList) +// { 
+// int pId = ir.pId; +// String p = Globals.pd.getPredicateById(pId); +// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); +// } +// } + +// irList = eir.getPrefferdPidListBetween_Entity_TypeVariable(name1, name2); +// if(irList == null || irList.size()==0) +// System.out.println("Can't find!"); +// else +// { +// for(ImplicitRelation ir: irList) +// { +// int pId = ir.pId; +// String p = Globals.pd.getPredicateById(pId); +// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); +// } +// } + } + } +} diff --git a/src/qa/extract/ExtractRelation.java b/src/qa/extract/ExtractRelation.java new file mode 100644 index 0000000..a04f3d3 --- /dev/null +++ b/src/qa/extract/ExtractRelation.java @@ -0,0 +1,472 @@ +package qa.extract; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Queue; + +import log.QueryLogger; +import nlp.ds.DependencyTree; +import nlp.ds.DependencyTreeNode; +//import nlp.ds.Word; +import paradict.ParaphraseDictionary; +import qa.Globals; +import rdf.SimpleRelation; +import rdf.PredicateMapping; +import rdf.SemanticRelation; +import rdf.SemanticUnit; + +public class ExtractRelation { + + public static final int notMatchedCountThreshold = 1; // the bigger, the looser (more relations can be extracted) + public static final int notCoverageCountThreshold = 2; + + /* + * Find relations by dependency tree & paraphrases. + * */ + public ArrayList findRelationsBetweenTwoUnit(SemanticUnit su1, SemanticUnit su2, QueryLogger qlog) + { + DependencyTree T = qlog.s.dependencyTreeStanford; + if(qlog.isMaltParserUsed) + T = qlog.s.dependencyTreeMalt; + + DependencyTreeNode n1 = T.getNodeByIndex(su1.centerWord.position), n2 = T.getNodeByIndex(su2.centerWord.position); + ArrayList shortestPath = T.getShortestNodePathBetween(n1,n2); + + ArrayList ret = new ArrayList(); + HashSet BoW_T = new HashSet(); + HashSet SubBoW_T = new HashSet(); + + // (Fix shortest path) Some cases consider the words not in shortest path | eg: What [be] [ent] (famous) for? 
+ // what-be-[ent], the word [be] is useless but we need (famous) + if(shortestPath.size() == 3 && shortestPath.get(1).word.baseForm.equals("be") && T.nodesList.size() > shortestPath.get(2).word.position) + { + shortestPath.remove(1); + shortestPath.add(1, T.getNodeByIndex(shortestPath.get(1).word.position + 1)); + } + + // Shortest path -> SubBag of Words + for(DependencyTreeNode curNode: shortestPath) + { + String text = curNode.word.baseForm; + if(!curNode.word.isIgnored && !Globals.stopWordsList.isStopWord(text)) + { + //!split words |eg, soccer club -> soccer_club(after node recognition) -> soccer club(used in matching paraphrase) + if(curNode.word.mayEnt || curNode.word.mayType) + { + String [] strArray = curNode.word.baseForm.split("_"); + for(String str: strArray) + SubBoW_T.add(str); + } + else + { + SubBoW_T.add(text); + } + } + } + + // DS tree -> Bag of Words + for (DependencyTreeNode curNode : T.getNodesList()) + { + if (!curNode.word.isIgnored) + { + String text = curNode.word.baseForm; + if(curNode.word.mayEnt || curNode.word.mayType) + { + String [] strArray = curNode.word.baseForm.split("_"); + for(String str: strArray) + BoW_T.add(str); + } + else + { + BoW_T.add(text); + } + } + } + // Find candidate patterns by SubBoW_T & inveretdIndex + HashSet candidatePatterns = new HashSet(); + for (String curWord : SubBoW_T) + { + ArrayList postingList = Globals.pd.invertedIndex.get(curWord); + if (postingList != null) + { + candidatePatterns.addAll(postingList); + } + } + + // Check patterns by BoW_P & subtree matching + int notMatchedCount = 0; + HashSet validCandidatePatterns = new HashSet(); + for (String p : candidatePatterns) + { + String[] BoW_P = p.split(" "); + notMatchedCount = 0; // not match number between pattern & question + for (String s : BoW_P) + { + if (s.length() < 2) + continue; + if (s.startsWith("[")) + continue; + if (Globals.stopWordsList.isStopWord(s)) + continue; + if (!BoW_T.contains(s)) + { + notMatchedCount ++; + if (notMatchedCount > notMatchedCountThreshold) + break; + } + } + if (notMatchedCount <= notMatchedCountThreshold) + { + validCandidatePatterns.add(p); + //TODO: to support matching like [soccer_club] + subTreeMatching(p, BoW_P, shortestPath, T, qlog, ret, 'S'); + } + } + + // Another chance for [soccer_club] (the relation embedded in nodes) + if(validCandidatePatterns.size() > 0) + { + if(n1.word.originalForm.contains("_") || n2.word.originalForm.contains("_")) + { + for (String p : validCandidatePatterns) + { + String[] BoW_P = p.split(" "); + notMatchedCount = 0; + int mappedCharacterCount = 0; + int matchedWordInArg = 0; + + boolean[] matchedFlag = new boolean[BoW_P.length]; + for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;} + int idx = 0; + for (String s : BoW_P) + { + if(n1.word.baseForm.contains(s) || n2.word.baseForm.contains(s)) // Hit nodes + matchedWordInArg++; + if(BoW_T.contains(s)) + { + mappedCharacterCount += s.length(); + matchedFlag[idx] = true; + } + idx++; + if (s.length() < 2) + continue; + if (s.startsWith("[")) + continue; + if (Globals.stopWordsList.isStopWord(s)) + continue; + if (!BoW_T.contains(s)) + notMatchedCount ++; + } + // Success if has 2 hits + if(matchedWordInArg >= 2) + { + double matched_score = ((double)(BoW_P.length-notMatchedCount))/((double)(BoW_P.length)); + if (matched_score > 0.95) + matched_score *= 10; // award for WHOLE match + + // TODO: this will make LONGER one has LARGER score, sometimes unsuitable | eg, be bear die in + matched_score = matched_score * 
Math.sqrt(mappedCharacterCount); + + SimpleRelation sr = new SimpleRelation(); + sr.arg1Word = n1.word; + sr.arg2Word = n2.word; + sr.relationParaphrase = p; + sr.matchingScore = matched_score; + sr.extractingMethod = 'X'; + + if (n1.dep_father2child.endsWith("subj")) + sr.preferredSubj = sr.arg1Word; + + sr.arg1Word.setIsCovered(); + sr.arg2Word.setIsCovered(); + + sr.setPasList(p, matched_score, matchedFlag); + sr.setPreferedSubjObjOrder(T); + + ret.add(sr); + } + } + } + } + return ret; + } + + // Core function of paraphrase matching + private void subTreeMatching (String pattern, String[] BoW_P, + ArrayList shortestPath, + DependencyTree T, QueryLogger qlog, + ArrayList ret, char extractingMethod) + { + DependencyTreeNode n1 = shortestPath.get(0); + DependencyTreeNode n2 = shortestPath.get(shortestPath.size()-1); + + ParaphraseDictionary pd = Globals.pd; + Queue queue = new LinkedList(); + queue.add(T.getRoot()); + + for(DependencyTreeNode curOuterNode: shortestPath) + { + outer: + for(String s: BoW_P) + { + if(s.equals(curOuterNode.word.baseForm)) + { + // try to match all nodes + ArrayList subTreeNodes = new ArrayList(); + Queue queue2 = new LinkedList(); + queue2.add(curOuterNode); + + int unMappedLeft = BoW_P.length; + int mappedCharacterCount = 0; + int hitPathCnt = 0; // words in pattern hit the shortest path + int hitPathBetweenTwoArgCnt = 0; //words in pattern hit the shortest path and excluding the two target nodes + double mappedCharacterCountPunishment = 0; // punishment when contains [[]] (function word) + + DependencyTreeNode curNode; + boolean[] matchedFlag = new boolean[BoW_P.length]; + for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;} + + while (unMappedLeft > 0 && (curNode=queue2.poll())!=null) + { + if (curNode.word.isIgnored) continue; + int idx = 0; + for (String ss : BoW_P) + { + // words in pattern only can be matched once + if (!matchedFlag[idx]) + { + // check word + if (ss.equals(curNode.word.baseForm)) + { + unMappedLeft --; + subTreeNodes.add(curNode); + queue2.addAll(curNode.childrenList); + matchedFlag[idx] = true; + mappedCharacterCount += ss.length(); + if(shortestPath.contains(curNode)) + { + hitPathCnt++; + if(curNode!=n1 && curNode!=n2) + hitPathBetweenTwoArgCnt++; + } + break; + } + // check POS tag + else if (ss.startsWith("[") && posSame(curNode.word.posTag, ss)) + { + unMappedLeft --; + subTreeNodes.add(curNode); + queue2.addAll(curNode.childrenList); + matchedFlag[idx] = true; + mappedCharacterCount += curNode.word.baseForm.length(); + mappedCharacterCountPunishment += 0.01; + break; + } + } + idx ++; + } + } + int unMatchedNoneStopWordCount = 0; + int matchedNoneStopWordCount = 0; + for (int idx = 0; idx < BoW_P.length; idx ++) { + if (BoW_P[idx].startsWith("[")) continue; + if (!matchedFlag[idx]) { + if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // unmatched + unMatchedNoneStopWordCount ++; + } + else { + if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // matched + matchedNoneStopWordCount ++; + } + } + + if (unMatchedNoneStopWordCount > notMatchedCountThreshold) { + if(qlog.MODE_debug) System.out.println("----But the pattern\"" + pattern + "\" is not a subtree."); + break outer; + } + + // MUST have notional words matched, non stop words > 0 + if (matchedNoneStopWordCount == 0){ + if(qlog.MODE_debug) System.out.println("----But the matching for pattern \"" + pattern + "\" does not have content words."); + break outer; + } + + // IF partial match and be covered by other pattern, give up the current pattern + if 
(unMappedLeft > 0) { + StringBuilder subpattern = new StringBuilder(); + for (int idx = 0; idx < BoW_P.length; idx ++) { + if (matchedFlag[idx]) { + subpattern.append(BoW_P[idx]); + subpattern.append(' '); + } + } + subpattern.deleteCharAt(subpattern.length()-1); + if (pd.nlPattern_2_predicateList.containsKey(subpattern)) { + if(qlog.MODE_debug) System.out.println("----But the partially matched pattern \"" + pattern + "\" is another pattern."); + break outer; + } + } + + // !Preposition | suppose only have one preposition + // TODO: consider more preposition | the first preposition may be wrong + DependencyTreeNode prep = null; + for (DependencyTreeNode dtn : subTreeNodes) { + outer2: + for (DependencyTreeNode dtn_child : dtn.childrenList) { + if(pd.prepositions.contains(dtn_child.word.baseForm)) { + prep = dtn_child; + break outer2; + } + } + } + boolean isContained = false; + for(DependencyTreeNode dtn_contain : subTreeNodes) { + if(dtn_contain == prep) isContained = true; + } + if(!isContained && prep != null) { + subTreeNodes.add(prep); + } + + // Relation extracted, set COVER flags + for (DependencyTreeNode dtn : subTreeNodes) + { + dtn.word.isCovered = true; + } + + int cnt = 0; + double matched_score = ((double)(BoW_P.length-unMappedLeft))/((double)(BoW_P.length)); + if (matched_score > 0.95) + matched_score *= 10; // Award for WHOLE match + + // The match ratio between pattern and path larger, the score higher; especially when uncovered with the two target nodes + if(hitPathCnt != 0) + { + double hitScore = 1 + (double)hitPathCnt/(double)BoW_P.length; + if(hitPathBetweenTwoArgCnt == hitPathCnt) + hitScore += 1; + else if(shortestPath.size() >= 4) // If path long enough, pattern still cover with the target nodes, punishment + { + //hitScore = 0.5; + if(hitPathBetweenTwoArgCnt == 0) // If path long enough, pattern cover with target nodes totally, punishment a lot + hitScore = 0.25; + } + matched_score *= hitScore; + } + + matched_score = matched_score * Math.sqrt(mappedCharacterCount) - mappedCharacterCountPunishment; // the longer, the better (unsuitable in some cases) + if (qlog.MODE_debug) System.out.println("☆" + pattern + ", score=" + matched_score); + + DependencyTreeNode subject = n1; + DependencyTreeNode object = n2; + if (subject != object) + { + SimpleRelation sr = new SimpleRelation(); + sr.arg1Word = subject.word; + sr.arg2Word = object.word; + sr.relationParaphrase = pattern; + sr.matchingScore = matched_score; + sr.extractingMethod = extractingMethod; + + if (subject.dep_father2child.endsWith("subj")) + sr.preferredSubj = sr.arg1Word; + + sr.arg1Word.setIsCovered(); + sr.arg2Word.setIsCovered(); + + sr.setPasList(pattern, matched_score, matchedFlag); + sr.setPreferedSubjObjOrder(T); + + ret.add(sr); + cnt ++; + //String binaryRelation = "<" + subjectString + "> <" + pattern + "> <" + objectString + ">"; + } + if (cnt == 0) break outer; + } + } + } + + } + + // [[det]], [[num]], [[adj]], [[pro]], [[prp]], [[con]], [[mod]] + public boolean posSame(String tag, String posWithBracket) { + if ( (posWithBracket.charAt(2) == 'd' && tag.equals("DT")) + || (posWithBracket.charAt(2) == 'n' && tag.equals("CD")) + || (posWithBracket.charAt(2) == 'a' && (tag.startsWith("JJ") || tag.startsWith("RB"))) + || (posWithBracket.charAt(2) == 'c' && tag.startsWith("CC"))//TODO: how about "IN: subordinating conjunction"? 
+ || (posWithBracket.charAt(2) == 'm' && tag.equals("MD"))) { + return true; + } + else if (posWithBracket.charAt(2) == 'p') { + if ( (posWithBracket.charAt(4) == 'o' && tag.startsWith("PR")) + || (posWithBracket.charAt(4) == 'p' && (tag.equals("IN") || tag.equals("TO")))) { + return true; + } + } + return false; + } + + public HashMap groupSimpleRelationsByArgsAndMapPredicate (ArrayList simpleRelations) { + System.out.println("==========Group Simple Relations========="); + + HashMap ret = new HashMap(); + HashMap> key2pasMap = new HashMap>(); + for(SimpleRelation simr : simpleRelations) + { + int key = simr.getHashCode(); + if (!ret.keySet().contains(key)) + { + ret.put(key, new SemanticRelation(simr)); + key2pasMap.put(key, new HashMap()); + } + SemanticRelation semr = ret.get(key); + HashMap pasMap = key2pasMap.get(key); + + // Just use to display. + if (simr.matchingScore > semr.LongestMatchingScore) + { + semr.LongestMatchingScore = simr.matchingScore; + semr.relationParaphrase = simr.relationParaphrase; + } + + // for pid=x, no wonder from which pattern, we only record the highest score and the related pattern. + for (int pid : simr.pasList.keySet()) { + double score = simr.pasList.get(pid); + if (!pasMap.containsKey(pid)) { + pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score)); + } + else if (score > pasMap.get(pid).score) { + pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score)); + } + } + } + + for (Integer key : key2pasMap.keySet()) { + SemanticRelation semr = ret.get(key); + HashMap pasMap = key2pasMap.get(key); + semr.predicateMappings = new ArrayList(); + //System.out.print("<"+semr.arg1Word.getFullEntityName() + "," + semr.arg2Word.getFullEntityName() + ">:"); + for (Integer pid : pasMap.keySet()) + { + semr.predicateMappings.add(new PredicateMapping(pid, pasMap.get(pid).score, pasMap.get(pid).str)); + //System.out.print("[" + Globals.pd.getPredicateById(pid) + "," + pasMap.get(pid).str + "," + pasMap.get(pid).score + "]"); + } + Collections.sort(semr.predicateMappings); + } + System.out.println("========================================="); + return ret; + } + + +} + +class StringAndDouble { + public String str; + public double score; + public StringAndDouble (String str, double score) { + this.str = str; + this.score = score; + } +} diff --git a/src/qa/extract/TypeRecognition.java b/src/qa/extract/TypeRecognition.java new file mode 100644 index 0000000..18f4496 --- /dev/null +++ b/src/qa/extract/TypeRecognition.java @@ -0,0 +1,358 @@ +package qa.extract; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; + +import nlp.ds.Word; +import nlp.tool.StopWordsList; +//import fgmt.RelationFragment; +import fgmt.TypeFragment; +import lcn.SearchInTypeShortName; +import log.QueryLogger; +import qa.Globals; +import rdf.PredicateMapping; +import rdf.SemanticRelation; +import rdf.Triple; +import rdf.TypeMapping; + +/* + * 2016-6-17 + * 1. 
Recognize types (including YAGO types) + 2. Add some type mappings manually, eg, "US State"-"yago:StatesOfTheUnitedStates" + 3. Add some extended variables (generalization of [variable with inherited type] -> [variable with inherited triples]), eg, ?canadian + * */ +public class TypeRecognition { + // dbpedia 2014 + //public static final int[] type_Person = {180,279}; + //public static final int[] type_Place = {49,228}; + //public static final int[] type_Organisation = {419,53}; + + //dbpedia 2016 + public static final int[] type_Person = {5828,15985}; + public static final int[] type_Place = {11197,2188}; + public static final int[] type_Organisation = {1335,4716}; + + public static HashMap<String, String> extendTypeMap = null; + public static HashMap<String, Triple> extendVariableMap = null; + + SearchInTypeShortName st = new SearchInTypeShortName(); + + static + { + extendTypeMap = new HashMap<String, String>(); + extendVariableMap = new HashMap<String, Triple>(); + Triple triple = null; + + //!Hand-written for convenience | TODO: approximate/semantic matching of types + extendTypeMap.put("NonprofitOrganizations", "dbo:Non-ProfitOrganisation"); + extendTypeMap.put("GivenNames", "dbo:GivenName"); + extendTypeMap.put("JamesBondMovies","yago:JamesBondFilms"); + extendTypeMap.put("TVShows", "dbo:TelevisionShow"); + extendTypeMap.put("USState", "yago:StatesOfTheUnitedStates"); + extendTypeMap.put("USStates", "yago:StatesOfTheUnitedStates"); + extendTypeMap.put("Europe", "yago:EuropeanCountries"); + extendTypeMap.put("Africa", "yago:AfricanCountries"); + + //!The following IDs are based on DBpedia 2014. + //!extend variable (embedded triples) | eg, [?E|surfers]-?uri dbo:occupation res:Surfing | canadians + //1) | [country people] [country] + triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 2112902, "Canada", null, 100); + extendVariableMap.put("canadian", triple); + triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 883747, "Germany", null, 100); + extendVariableMap.put("german", triple); + //2) ?bandleader + triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436853, "Bandleader", null, 100); + extendVariableMap.put("bandleader", triple); + triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436854, "Surfing", null, 100); + extendVariableMap.put("surfer", triple); + } + + public static void recognizeExtendVariable(Word w) + { + String key = w.baseForm; + if(extendVariableMap.containsKey(key)) + { + w.mayExtendVariable = true; + Triple triple = extendVariableMap.get(key).copy(); + if(triple.subjId == Triple.VAR_ROLE_ID && triple.subject.equals(Triple.VAR_NAME)) + triple.subject = "?" + w.originalForm; + if(triple.objId == Triple.VAR_ROLE_ID && triple.object.equals(Triple.VAR_NAME)) + triple.object = "?"
+ w.originalForm; + w.embbededTriple = triple; + } + } + + public ArrayList getExtendTypeByStr(String allUpperFormWord) + { + ArrayList tmList = new ArrayList(); + + //Do not consider SINGLE-word type (most are useless) | eg, Battle, War, Daughter + if(allUpperFormWord.length() > 1 && allUpperFormWord.substring(1).equals(allUpperFormWord.substring(1).toLowerCase())) + return null; + + //search in YAGO type + if(TypeFragment.yagoTypeList.contains(allUpperFormWord)) + { + //YAGO prefix + String typeName = "yago:"+allUpperFormWord; + TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); + tmList.add(tm); + } + else if(extendTypeMap.containsKey(allUpperFormWord)) + { + String typeName = extendTypeMap.get(allUpperFormWord); + TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); + tmList.add(tm); + } + if(tmList.size()>0) + return tmList; + else + return null; + } + + public ArrayList getTypeIDsAndNamesByStr (String baseform) + { + ArrayList tmList = new ArrayList(); + + try + { + tmList = st.searchTypeScore(baseform, 0.4, 0.8, 10); + Collections.sort(tmList); + if (tmList.size()>0) + return tmList; + else + return null; + + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + public ArrayList recognize (String baseform) { + + char c = baseform.charAt(baseform.length()-1); + if (c >= '0' && c <= '9') { + baseform = baseform.substring(0, baseform.length()-2); + } + + try { + ArrayList ret = st.searchType(baseform, 0.4, 0.8, 10); + ArrayList ret_in = new ArrayList(); + for (String s : ret) { + System.out.println("["+s+"]"); + ret_in.addAll(TypeFragment.typeShortName2IdList.get(s)); + } + if (ret_in.size()>0) return ret_in; + else return null; + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + public static void AddTypesOfWhwords (HashMap semanticRelations) { + ArrayList ret = null; + for (Integer it : semanticRelations.keySet()) + { + SemanticRelation sr = semanticRelations.get(it); + if(!sr.arg1Word.mayType) + { + ret = recognizeSpecial(sr.arg1Word.baseForm); + if (ret != null) + { + sr.arg1Word.tmList = ret; + } + } + if(!sr.arg2Word.mayType) + { + ret = recognizeSpecial(sr.arg2Word.baseForm); + if (ret != null) + { + sr.arg2Word.tmList = ret; + } + } + } + } + + public static ArrayList recognizeSpecial (String wordSpecial) + { + ArrayList tmList = new ArrayList(); + if (wordSpecial.toLowerCase().equals("who")) + { + for (Integer i : type_Person) + { + tmList.add(new TypeMapping(i,"Person",1)); + } + //"who" can also means organization + for (Integer i : type_Organisation) + { + tmList.add(new TypeMapping(i,"Organization",1)); + } + return tmList; + } + else if (wordSpecial.toLowerCase().equals("where")) + { + for (Integer i : type_Place) + { + tmList.add(new TypeMapping(i,"Place",1)); + } + for (Integer i : type_Organisation) + { + tmList.add(new TypeMapping(i,"Organization",1)); + } + return tmList; + } + //TODO: When ... + return null; + } + + /* + * 1. Priority: mayEnt(Uppercase)>mayType>mayEnt + * 2. mayEnt=1: Constant + * 3. mayType=1: + * (1)Variable, a triple will be added when evaluation. | eg, Which [books] by Kerouac were published by Viking Press? + * (2)Constant, it modify other words. | eg, Are tree frogs a type of [amphibian]? + * 4、extend variable (a variable embedded triples) + * */ + public static void constantVariableRecognition(HashMap semanticRelations, QueryLogger qlog) + { + Word[] words = qlog.s.words; + //NOTICE: modifiers(implicit relation) have not been considered. 
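+ // word.position appears to be 1-based while the words[] array is 0-based, hence the "- 1" below.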
+ for (Integer it : semanticRelations.keySet()) + { + SemanticRelation sr = semanticRelations.get(it); + int arg1WordPos = sr.arg1Word.position - 1; + int arg2WordPos = sr.arg2Word.position - 1; + + // extend variable recognition + recognizeExtendVariable(sr.arg1Word); + recognizeExtendVariable(sr.arg2Word); + + // constant or variable + if(sr.arg1Word.mayExtendVariable) + { + //eg, ?canadian (both extendVariable & type) + if(sr.arg1Word.mayType) + sr.arg1Word.mayType = false; + + if(sr.arg1Word.mayEnt) + { + //rule: [extendVaraible & ent] + noun -> ent |eg, Canadian movies -> ent:Canada + if(arg1WordPos+1 < words.length && words[arg1WordPos+1].posTag.startsWith("N")) + { + sr.arg1Word.mayExtendVariable = false; + sr.isArg1Constant = true; + } + else + { + sr.arg1Word.mayEnt = false; + } + } + } + // type + else if(sr.arg1Word.mayType) + { + //rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries + if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of")) + && !words[arg1WordPos-2].posTag.startsWith("V")) + { + sr.isArg1Constant = true; + double largerScore = 1000; + if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) + largerScore = sr.predicateMappings.get(0).score * 2; + PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); + sr.predicateMappings.add(0,nPredicate); + + //constant type should be object + sr.preferredSubj = sr.arg2Word; + } + } + //ent: constant + else if(sr.arg1Word.mayEnt) + { + sr.isArg1Constant = true; + } + + // constant or variable + if(sr.arg2Word.mayExtendVariable) + { + if(sr.arg2Word.mayType) + sr.arg2Word.mayType = false; + + if(sr.arg2Word.mayEnt) + { + if(arg2WordPos+1 < words.length && words[arg2WordPos+1].posTag.startsWith("N")) + { + sr.arg2Word.mayExtendVariable = false; + sr.isArg2Constant = true; + } + else + { + sr.arg2Word.mayEnt = false; + } + } + } + // type + else if(sr.arg2Word.mayType) + { + //rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries + if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of")) + && !words[arg2WordPos-2].posTag.startsWith("V") ) + { + sr.isArg2Constant = true; + double largerScore = 1000; + if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) + largerScore = sr.predicateMappings.get(0).score * 2; + PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); + sr.predicateMappings.add(0,nPredicate); + + sr.preferredSubj = sr.arg1Word; + } + //rule: Be ... a type? 
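+ // eg, "Are tree frogs a type of [amphibian]?" from the class comment: the + // sentence starts with "be" and the type word follows "a", so [amphibian] + // becomes a type constant with a boosted [type] predicate.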
+ if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a")) + { + sr.isArg2Constant = true; + double largerScore = 1000; + if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) + largerScore = sr.predicateMappings.get(0).score * 2; + PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); + sr.predicateMappings.add(0,nPredicate); + + sr.preferredSubj = sr.arg1Word; + } + } + else if(sr.arg2Word.mayEnt) + { + sr.isArg2Constant = true; + } + + if(sr.arg1Word != sr.preferredSubj) + sr.swapArg1Arg2(); + } + } + + public static void main (String[] args) + { + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + String type = "space mission"; + try + { + TypeFragment.load(); + Globals.stopWordsList = new StopWordsList(); + TypeRecognition tr = new TypeRecognition(); + while(true) + { + System.out.print("Input query type: "); + type = br.readLine(); + tr.recognize(type); + } + + } catch (Exception e) { + e.printStackTrace(); + } + } +} diff --git a/src/qa/mapping/CompatibilityChecker.java b/src/qa/mapping/CompatibilityChecker.java new file mode 100644 index 0000000..ef4d974 --- /dev/null +++ b/src/qa/mapping/CompatibilityChecker.java @@ -0,0 +1,690 @@ +package qa.mapping; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; + +import qa.Globals; +import rdf.Sparql; +import rdf.Triple; +import fgmt.EntityFragment; +import fgmt.RelationFragment; +import fgmt.TypeFragment; +import fgmt.VariableFragment; + +/** + * Notice: one compatiblityChecker can be only used once to check a SPARQL. + * @author husen + */ +public class CompatibilityChecker { + + static int EnumerateThreshold = 1000; + public EntityFragmentDict efd = null; + public HashMap variable_fragment = null; + + public CompatibilityChecker(EntityFragmentDict efd) { + this.efd = efd; + variable_fragment = new HashMap(); + } + + // Run this check function after pass "single triple check" (recoded) + // Recoded: variable will find suitable entities, depend on the inMemory INDEX. 
Notice when variable = literal + public boolean isSparqlCompatible3 (Sparql spq) + { + boolean[] isFixed = new boolean[spq.tripleList.size()]; // record triple's compatibility whether need check + for (int i = 0; i < spq.tripleList.size(); i ++) { + isFixed[i] = false; + } + + //System.out.println("tripleList size="+spq.tripleList.size()); + Iterator it; + boolean shouldContinue = true; + // shouldContinue when: triple with variables updates variable fragment, then use updated variable fragment check the previous triples + while (shouldContinue) + { + shouldContinue = false; + it = spq.tripleList.iterator(); + int t_cnt = 0; + while (it.hasNext()) { + Triple t = it.next(); + + switch (getTripleType(t)) { + case 1: // (1) E1, P, E2 + if (!isFixed[t_cnt]) + { + int ret = hs_check1_E1PE2(t); + if (ret == 0) + isFixed[t_cnt] = true; + else if (ret == 5) + return false; + } + break; + case 2: // (2) E, P, V + if(!isFixed[t_cnt]) + { + int ret = hs_check2_EPV(t); + if (ret == 5) + return false; + else + { + isFixed[t_cnt] = true; // Now V has set entities or literal; notice E/P->V maybe not unique, eg, xx's starring + if (ret == 1) + shouldContinue = true; + } + } + break; + case 3: // (3) E, , T + if (!isFixed[t_cnt]) + { + int ret = check3_Etype1T(t); + if (ret == -2) return false; + if (ret == 0) isFixed[t_cnt] = true; + } + break; + case 4: // (4) V, P, E + if(!isFixed[t_cnt]) + { + int ret = hs_check4_VPE(t); + if (ret == 5) + return false; + else + { + isFixed[t_cnt] = true; // Now V has set entities or literal; notice E/P->V maybe not unique, eg, xx's starring + if (ret == 1) + shouldContinue = true; + } + } + break; + case 5: // (5) V1, P, V2 (The most important and time consuming) + if(!isFixed[t_cnt]) + { + int ret = hs_check5_V1PV2(t); + if (ret == 5) + return false; + else + { + isFixed[t_cnt] = true; // Just set once and no re-check + if (ret == 1) + shouldContinue = true; + } + } + break; + case 6: // (6) V, , T + if (!isFixed[t_cnt]) + { + int ret = hs_check6_Vtype1T(t); + if (ret == -2) return false; + else + { + isFixed[t_cnt] = true; + if (ret == 1) + shouldContinue = true; + } + } + break; + case 7: + // do nothing + break; + case 8: + default: + return false; + } + t_cnt ++; + } + } + return true; + } + + /** + * Get Triple's category + * (1) E1, P, E2 + * (2) E, P, V + * (3) E, , T + * (4) V, P, E + * (5) V1, P, V2 + * (6) V, , T + * (7) E, , V + * (8) error + * + * E: Entity + * P: Predicate (exclude ) + * V: Variable + * T: Type + * + * @param t + * @return + */ + public int getTripleType (Triple t) { + if (t.predicateID == Globals.pd.typePredicateID) { + boolean s = t.subject.startsWith("?"); + boolean o = t.object.startsWith("?"); + if (s && !o) return 6; + else if (o && !s) return 7; + else if (!s && !o) return 3; + else return 8; + } + else if (t.subject.startsWith("?")) { + if (t.object.startsWith("?")) return 5; + else return 4; + } + else { + if (t.object.startsWith("?")) return 2; + else return 1; + } + } + + + public int hs_check1_E1PE2(Triple t) + { + int pid = t.predicateID; + EntityFragment E1 = efd.getEntityFragmentByEid(t.subjId); + EntityFragment E2 = efd.getEntityFragmentByEid(t.objId); + + // E2 is E1's one depth neighbor, connected with predicate "p" + if(E1.outEntMap.containsKey(E2.eId)) + { + ArrayList pList = E1.outEntMap.get(E2.eId); + if(pList.contains(pid)) + return 0; + } + + return 5; + } + + public int hs_check2_EPV(Triple t) + { + int pid = t.predicateID; + EntityFragment E = efd.getEntityFragmentByEid(t.subjId); + VariableFragment V = 
variable_fragment.get(t.object); + + // P ∈ E.outEdges + if (!E.outEdges.contains(pid)) { + return 5; + } + + // Set V, notice maybe literal + if(V == null) + { + variable_fragment.put(t.object, new VariableFragment()); + V = variable_fragment.get(t.object); + for(int vid: E.outEntMap.keySet()) + { + if(E.outEntMap.get(vid).contains(pid)) + { + V.candEntities.add(vid); + } + } + // E's outEdges contain p, but no neighbor ENT can be found through p, so V may be a literal + if(V.candEntities.size() == 0) + { + V.mayLiteral = true; + return 0; + } + } + else + { + // just okay if V is literal, because the fragments do not store literal information + if(V.mayLiteral) + return 0; + + // Update V's binding by the current neighbors of E + HashSet<Integer> newCandEntities = new HashSet<Integer>(); + if(V.candEntities.size() > 0 && V.candEntities.size() < E.outEntMap.size()) + { + for(int vid: V.candEntities) + { + if(E.outEntMap.containsKey(vid) && E.outEntMap.get(vid).contains(pid)) + { + newCandEntities.add(vid); + } + } + } + else + { + for(int vid: E.outEntMap.keySet()) + { + if(E.outEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid))) + { + newCandEntities.add(vid); + } + } + } + V.candEntities = newCandEntities; + } + + if(V.candEntities.size() > 0) + return 0; + else + return 5; + } + + public int check3_Etype1T(Triple t) { + String[] T = t.object.split("\\|"); // Note: "|" must be escaped in the regex + EntityFragment E = efd.getEntityFragmentByEid(t.subjId); + + String newTypeString = ""; + boolean contained = false; + + // check whether each type in T is proper for E + if (T.length == 0) return -2; + for (String s : T) { + contained = false; + for (Integer i : TypeFragment.typeShortName2IdList.get(s)) { + if (E.types.contains(i)) { + if (!contained) { + contained = true; + newTypeString += s; + newTypeString += "|"; + } + } + } + } + + if (newTypeString.length() > 1) { + t.object = newTypeString.substring(0, newTypeString.length()-1); + return 0; + } + else return -2; + } + + + public int hs_check4_VPE(Triple t) + { + int pid = t.predicateID; + EntityFragment E = efd.getEntityFragmentByEid(t.objId); + VariableFragment V = variable_fragment.get(t.subject); + TypeFragment subjTf = SemanticItemMapping.getTypeFragmentByWord(t.getSubjectWord()); + + // P ∈ E.inEdges + if (!E.inEdges.contains(pid)) { + return 5; + } + + // Set V, notice V cannot be literal, because now V is the subject + if(V == null) + { + variable_fragment.put(t.subject, new VariableFragment()); + V = variable_fragment.get(t.subject); + + for(int vid: E.inEntMap.keySet()) + { + if(E.inEntMap.get(vid).contains(pid) && (subjTf == null || subjTf.entSet.contains(vid))) + { + V.candEntities.add(vid); + } + } + // E's inEdges contain p, but no neighbor ENT can be found through p; V is the subject and cannot be a literal, so the match fails + if(V.candEntities.size() == 0) + { + return 5; + } + } + else + { + // if V is literal, fail because a subject cannot be a literal + if(V.mayLiteral) + return 5; + + // update V's binding by the current neighbors of E + HashSet<Integer> newCandEntities = new HashSet<Integer>(); + if(V.candEntities.size() > 0 && V.candEntities.size() < E.inEntMap.size()) + { + for(int vid: V.candEntities) + { + if(E.inEntMap.containsKey(vid) && E.inEntMap.get(vid).contains(pid)) + { + newCandEntities.add(vid); + } + } + } + else + { + for(int vid: E.inEntMap.keySet()) + { + if(E.inEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid))) + { + newCandEntities.add(vid); + } + } + } + V.candEntities = newCandEntities; + } + +
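+ // V survives only if the join above left at least one candidate entity.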
if(V.candEntities.size() > 0) + return 0; + else + return 5; + } + + public int check5_V1PV2(Triple t) { + ArrayList<Integer> pidList = new ArrayList<Integer>(); + pidList.add(t.predicateID); + VariableFragment V1 = variable_fragment.get(t.subject); + VariableFragment V2 = variable_fragment.get(t.object); + + // V1 & V2's types must equal the types of one fragment of P + Iterator<Integer> it_int = pidList.iterator(); + ArrayList<HashSet<Integer>> newCandTypes1 = new ArrayList<HashSet<Integer>>(); + ArrayList<HashSet<Integer>> newCandTypes2 = new ArrayList<HashSet<Integer>>(); + while (it_int.hasNext()) { + Integer i = it_int.next(); + ArrayList<RelationFragment> flist = RelationFragment.relFragments.get(i); + Iterator<RelationFragment> it_rln = flist.iterator(); + while (it_rln.hasNext()) { + RelationFragment rf = it_rln.next(); + if (V1 == null && V2 == null) { + newCandTypes1.add(rf.inTypes); + newCandTypes2.add(rf.outTypes); + } + else if (V1 == null && V2 != null) { + if (V2.containsAll(rf.outTypes)) { + newCandTypes1.add(rf.inTypes); + newCandTypes2.add(rf.outTypes); + } + } + else if (V2 == null && V1 != null) { + if (V1.containsAll(rf.inTypes)) { + newCandTypes1.add(rf.inTypes); + newCandTypes2.add(rf.outTypes); + } + } + else { + if (V1.containsAll(rf.inTypes) && V2.containsAll(rf.outTypes)) + { + newCandTypes1.add(rf.inTypes); + newCandTypes2.add(rf.outTypes); + } + } + } + } + + if (newCandTypes1.size() > 0 && newCandTypes2.size() > 0) { + if (V1 == null && V2 == null) { + variable_fragment.put(t.subject, new VariableFragment()); + variable_fragment.get(t.subject).candTypes = newCandTypes1; + + variable_fragment.put(t.object, new VariableFragment()); + variable_fragment.get(t.object).candTypes = newCandTypes2; + return 1; + } + else if (V1 == null && V2 != null) { + variable_fragment.put(t.subject, new VariableFragment()); + variable_fragment.get(t.subject).candTypes = newCandTypes1; + + if (V2.candTypes.size() > newCandTypes2.size()) { + V2.candTypes = newCandTypes2; + return 1; + } + else return 0; + } + else if (V2 == null && V1 != null) { + variable_fragment.put(t.object, new VariableFragment()); + variable_fragment.get(t.object).candTypes = newCandTypes2; + + if (V1.candTypes.size() > newCandTypes1.size()) { + V1.candTypes = newCandTypes1; + return 1; + } + else return 0; + } + else { + if (V1.candTypes.size() > newCandTypes1.size() || V2.candTypes.size() > newCandTypes2.size()) { + V1.candTypes = newCandTypes1; + V2.candTypes = newCandTypes2; + return 1; + } + else return 0; + } + } + else return 5; + } + + public int hs_check5_V1PV2(Triple t) + { + int pid = t.predicateID; + VariableFragment V1 = variable_fragment.get(t.subject); + VariableFragment V2 = variable_fragment.get(t.object); + + if(V1 == null && V2 == null) // The WORST case: neither variable has candidates yet, and the relation fragments alone cannot check them without type information, so this triple should be put at the end + { + return 0; // in fact this should return 1; we just expect the unchecked triples to provide candidates for V1,V2 so the check can run in the next round + } + else if(V2 == null) + { + if(V1.mayLiteral) + return 5; + + variable_fragment.put(t.object, new VariableFragment()); + V2 = variable_fragment.get(t.object); + + HashSet<Integer> newV1cands = new HashSet<Integer>(); + int cnt = 0; + for(int v1id: V1.candEntities) + { + cnt++; + if(cnt > EnumerateThreshold) + break; + EntityFragment E = efd.getEntityFragmentByEid(v1id); + if(E != null && E.outEdges.contains(pid)) + { + newV1cands.add(v1id); + for(int v2id: E.outEntMap.keySet()) + { + if(E.outEntMap.get(v2id).contains(pid)) + V2.candEntities.add(v2id); + } + } + } + V1.candEntities = newV1cands; + } + else if(V1 ==
null) + { + if(V2.mayLiteral) + return 0; + + variable_fragment.put(t.subject, new VariableFragment()); + V1 = variable_fragment.get(t.subject); + + HashSet newV2cands = new HashSet(); + int cnt = 0; + for(int v2id: V2.candEntities) + { + cnt++; + if(cnt > EnumerateThreshold) + break; + EntityFragment E = efd.getEntityFragmentByEid(v2id); + if(E != null && E.inEdges.contains(pid)) + { + newV2cands.add(v2id); + for(int v1id: E.inEntMap.keySet()) + { + if(E.inEntMap.get(v1id).contains(pid)) + V1.candEntities.add(v1id); + } + } + } + V2.candEntities = newV2cands; + } + else + { + if(V1.mayLiteral) + return 5; + if(V2.mayLiteral) + return 0; + + HashSet newV1cands = new HashSet(); + HashSet newV2cands = new HashSet(); + for(int v1id: V1.candEntities) + { + EntityFragment E1 = efd.getEntityFragmentByEid(v1id); + if(E1 != null && E1.outEdges.contains(pid)) + newV1cands.add(v1id); + } + V1.candEntities = newV1cands; + for(int v2id: V2.candEntities) + { + EntityFragment E2 = efd.getEntityFragmentByEid(v2id); + if(E2 != null && E2.inEdges.contains(pid)) + newV2cands.add(v2id); + } + V2.candEntities = newV2cands; + + newV1cands = new HashSet(); + newV2cands = new HashSet(); + for(int v1id: V1.candEntities) + { + EntityFragment E1 = efd.getEntityFragmentByEid(v1id); + for(int v2id: V2.candEntities) + { + if(E1.outEntMap.containsKey(v2id) && E1.outEntMap.get(v2id).contains(pid)) + { + newV1cands.add(v1id); + newV2cands.add(v2id); + } + } + } + V1.candEntities = newV1cands; + V2.candEntities = newV2cands; + } + + if(V1.candEntities.size() == 0 || (V2.candEntities.size() == 0 && !RelationFragment.isLiteral(pid))) + return 5; + else + return 0; + } + + public int check6_Vtype1T(Triple t) { + + String[] T = t.object.split("\\|"); // notice "|" need "\\|" + VariableFragment V = variable_fragment.get(t.subject); + + String newTypeString = ""; + boolean contained = false; + + // check whether each type in T is proper for V + if (T.length == 0) return -2; + + ArrayList> newCandTypes = new ArrayList>(); + for (String s : T) + { + contained = false; + + //YAGO type (uncoded types), just return because we have no INDEX to check it + if(!TypeFragment.typeShortName2IdList.containsKey(s)) + return 0; + + for (Integer i : TypeFragment.typeShortName2IdList.get(s)) + { + if (V == null) { + // constraint V by user given types, flag it due to possible incomplete type + HashSet set = new HashSet(); + set.add(i); + set.add(VariableFragment.magic_number); + newCandTypes.add(set); + if (!contained) { + contained = true; + newTypeString += s; + newTypeString += "|"; + } + } + else if (V.contains(i)) { + if (!contained) { + contained = true; + newTypeString += s; + newTypeString += "|"; + } + } + } + } + + // check whether each fragment in V is proper for T + // if not, delete the fragment (that means we can narrow the scope) + ArrayList> deleteCandTypes = new ArrayList>(); + if (V != null) + { + Iterator> it = V.candTypes.iterator(); + while(it.hasNext()) { + HashSet set = it.next(); + boolean isCandTypeOkay = false; + //v get [constraint types] through other triples, at least one type can reserve, otherwise delete the [constriant types] + for (String s : T) + { + for (Integer i : TypeFragment.typeShortName2IdList.get(s)) { + if (set.contains(i)) { + isCandTypeOkay = true; + break; + } + } + } + if (!isCandTypeOkay) { + deleteCandTypes.add(set); + } + } + V.candTypes.removeAll(deleteCandTypes); + } + + + if (V == null) { + variable_fragment.put(t.subject, new VariableFragment()); + variable_fragment.get(t.subject).candTypes 
= newCandTypes; + } + if (newTypeString.length() > 1) { + t.object = newTypeString.substring(0, newTypeString.length()-1); + if (deleteCandTypes.size() > 0) { + return 1; + } + else { + return 0; + } + } + else return -2; + } + + public int hs_check6_Vtype1T(Triple t) + { + String[] tList = t.object.split("\\|"); // Note: "|" must be escaped in the regex + VariableFragment V = variable_fragment.get(t.subject); + + if (tList.length == 0) return -2; + + // Simplify, only consider the first one + if(!TypeFragment.typeShortName2IdList.containsKey(tList[0])) + return 0; + + int tid = TypeFragment.typeShortName2IdList.get(tList[0]).get(0); + TypeFragment T = TypeFragment.typeFragments.get(tid); + if(V == null) + { + variable_fragment.put(t.subject, new VariableFragment()); + V = variable_fragment.get(t.subject); + V.candEntities = T.entSet; + } + else + { + if(V.mayLiteral) //a literal cannot be the subject + return -2; + + HashSet<Integer> newVcands = new HashSet<Integer>(); + for(int vid: V.candEntities) + { + EntityFragment E = efd.getEntityFragmentByEid(vid); + if(E.types.contains(tid)) + newVcands.add(vid); + } + V.candEntities = newVcands; + } + + if(V.candEntities.size() == 0) + return -2; + else + return 0; + } + + public void swapTriple (Triple t) { + String temp = t.subject; + t.subject = t.object; + t.object = temp; + } +}; \ No newline at end of file diff --git a/src/qa/mapping/DBpediaLookup.java b/src/qa/mapping/DBpediaLookup.java new file mode 100644 index 0000000..54e027b --- /dev/null +++ b/src/qa/mapping/DBpediaLookup.java @@ -0,0 +1,164 @@ +package qa.mapping; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; + +import lcn.EntityFragmentFields; +import log.QueryLogger; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.methods.GetMethod; + +import fgmt.EntityFragment; +import rdf.EntityMapping; + +public class DBpediaLookup { + //There are two websites of the DBpediaLookup online service. + //public static final String baseURL = "http://en.wikipedia.org/w/api.php?action=opensearch&format=xml&limit=10&search="; + //public static final String baseURL = "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString="; + public static final String baseURL = "http://172.31.222.72:1234/api/search/KeywordSearch?MaxHits=5&QueryString="; + + public HttpClient ctripHttpClient = null; + + //public static final String begin = ""; + //public static final String begin = "\n "; + public static final String begin = "<Label>"; + public static final int begin_length = begin.length(); + public static final String end = "</Label>"; + public static final int end_length = end.length(); + + public static HashMap<String, String> entMentionDict = null; // TODO: build the mention2ent dictionary from redirect data & wikipedia click data; for now it is built manually + + public DBpediaLookup() + { + ctripHttpClient = new HttpClient(); + ctripHttpClient.setTimeout(3000); + + entMentionDict = new HashMap<String, String>(); + entMentionDict.put("Prince_Charles", "Charles,_Prince_of_Wales"); + } + + public ArrayList<EntityMapping> getEntityMappings(String searchString, QueryLogger qlog) + { + ArrayList<String> slist = new ArrayList<String>(); + if(entMentionDict.containsKey(searchString)) + slist.add(entMentionDict.get(searchString)); + else + slist = lookForEntityNames(searchString, qlog); + + if (slist.size() == 0 && searchString.contains(". ")) + slist.addAll(lookForEntityNames(searchString.replaceAll(". 
", "."), qlog)); + + ArrayList emlist = new ArrayList(); + + // Now string use "_" as delimiter (original) + String[] sa = searchString.split("_"); + int UpperCnt = 0; + for(String str: sa) + { + if( (str.charAt(0)>='A'&&str.charAt(0)<='Z') || (str.charAt(0)>='0'&&str.charAt(0)<='9') ) + UpperCnt ++; + } + + System.out.print("DBpediaLookup find: " + slist + ", "); + + int count = 40; + for (String s : slist) + { + //consider ABBR only when all UPPER; drop when too long edit distance + if(UpperCnt < sa.length && EntityFragment.calEditDistance(s, searchString.replace("_", ""))>searchString.length()/2) + continue; + + int eid = -1; + s = s.replace(" ", "_"); + if(EntityFragmentFields.entityName2Id.containsKey(s)) + { + eid = EntityFragmentFields.entityName2Id.get(s); + emlist.add(new EntityMapping(eid, s, count)); + count -=2 ; + } + else + { + System.out.print("Drop "+s+" because it not in Entity Dictionary. "); + } + } + System.out.println("DBpediaLookup select: " + emlist); + + return emlist; + } + + public ArrayList lookForEntityNames (String searchString, QueryLogger qlog) { + // URL transition: " " -> %20 + GetMethod getMethod = new GetMethod((baseURL+searchString).replaceAll(" ", "%20")); + ArrayList ret = new ArrayList(); + int statusCode; + + try { + statusCode = ctripHttpClient.executeMethod(getMethod); + } catch (HttpException e) { + e.printStackTrace(); + return ret; + } catch (IOException e) { + e.printStackTrace(); + return ret; + } + + if (statusCode!=200) return null; + + String response = getMethod.getResponseBodyAsString(); + if (qlog != null && qlog.MODE_debug) { + System.out.println("searchString=" + searchString); + System.out.println("statusCode=" + statusCode); + System.out.println("response=" + getMethod.getResponseBodyAsString()); + } + getMethod.releaseConnection(); + + //System.out.println(response); + + if (response == null || response.isEmpty()) + return ret; + int idx1 = response.indexOf(begin); + while (idx1 != -1) { + int idx2 = response.indexOf(end, idx1+begin_length); + String ss = response.substring(idx1+begin_length, idx2); + ret.add(ss); + //System.out.println(ss); + idx1 = response.indexOf(begin, idx2 + end_length); + } + + return ret; + } + + public static void main(String argv[]){ + + DBpediaLookup dbplook = new DBpediaLookup(); + + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + try { + while (true) { + System.out.println("Test DBpediaLookup."); + System.out.print("Please input the search string: "); + String searchString = br.readLine(); + try { + long t1 = System.currentTimeMillis(); + ArrayList res = dbplook.lookForEntityNames(searchString, null); + long t2 = System.currentTimeMillis(); + System.out.println(res); + System.out.println("time=" + (t2-t1) + "ms"); + } catch (Exception e) { + e.printStackTrace(); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + + + return; + } +} diff --git a/src/qa/mapping/EntityFragmentDict.java b/src/qa/mapping/EntityFragmentDict.java new file mode 100644 index 0000000..302912f --- /dev/null +++ b/src/qa/mapping/EntityFragmentDict.java @@ -0,0 +1,44 @@ +package qa.mapping; + +import java.util.HashMap; + +//import lcn.EntityFragmentFields; +//import qa.Globals; +import fgmt.EntityFragment; + +public class EntityFragmentDict { + //public HashMap entityFragmentDictionary = new HashMap(); + public HashMap entityFragmentDictionary = new HashMap(); + + public EntityFragment getEntityFragmentByEid (Integer eid) + { + if (!entityFragmentDictionary.containsKey(eid)) + { + 
entityFragmentDictionary.put(eid, EntityFragment.getEntityFragmentByEntityId(eid)); + } + return entityFragmentDictionary.get(eid); + + } + + /* + * Old version, search by name + * */ +// public EntityFragment getEntityFragmentByName (String name) { +// if (name.startsWith("?")) { +// return null; +// } +// if (!entityFragmentDictionary.containsKey(name)) { +// String fgmt = EntityFragment.getEntityFgmtStringByName(name); +// if (fgmt != null) +// { +// int eid = EntityFragmentFields.entityName2Id.get(name); +// entityFragmentDictionary.put(name, new EntityFragment(eid, fgmt)); +// } +// else { +// entityFragmentDictionary.put(name, null); +// } +// } +// return entityFragmentDictionary.get(name); +// +// } +} diff --git a/src/qa/mapping/SemanticItemMapping.java b/src/qa/mapping/SemanticItemMapping.java new file mode 100644 index 0000000..1db1097 --- /dev/null +++ b/src/qa/mapping/SemanticItemMapping.java @@ -0,0 +1,811 @@ +package qa.mapping; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; + +import nlp.ds.Word; +import nlp.ds.Sentence.SentenceType; +import fgmt.EntityFragment; +import fgmt.RelationFragment; +import fgmt.TypeFragment; +import log.QueryLogger; +import qa.Globals; +import rdf.EntityMapping; +import rdf.PredicateMapping; +import rdf.SemanticRelation; +import rdf.Sparql; +import rdf.Triple; +import rdf.TypeMapping; + +public class SemanticItemMapping { + + public HashMap<Word, ArrayList<EntityMapping>> entityDictionary = new HashMap<Word, ArrayList<EntityMapping>>(); + public static int k = 10; // unused now + public static int t = 10; // Depth of enumerating candidates of each node/edge. O(t^n). + ArrayList<Sparql> rankedSparqls = new ArrayList<Sparql>(); + HashSet<String> checkedSparqlStrs = new HashSet<String>(); + + public ArrayList<ArrayList<EntityMapping>> entityPhrasesList = new ArrayList<ArrayList<EntityMapping>>(); + public ArrayList<Word> entityWordList = new ArrayList<Word>(); + public HashMap<Integer, EntityMapping> currentEntityMappings = new HashMap<Integer, EntityMapping>(); + + public ArrayList<ArrayList<PredicateMapping>> predicatePhraseList = new ArrayList<ArrayList<PredicateMapping>>(); + public ArrayList<SemanticRelation> predicateSrList = new ArrayList<SemanticRelation>(); + public HashMap<Integer, PredicateMapping> currentPredicateMappings = new HashMap<Integer, PredicateMapping>(); + + public HashMap<Integer, SemanticRelation> semanticRelations = null; + public QueryLogger qlog = null; + + public EntityFragmentDict efd = new EntityFragmentDict(); + + public boolean isAnswerFound = false; + public int tripleCheckCallCnt = 0; + public int sparqlCheckCallCnt = 0; + public int sparqlCheckId = 0; + + SemanticRelation firstFalseSr = null; + long tripleCheckTime = 0; + long sparqlCheckTime = 0; + + /* + * A best-first, top-down method: enumerate all possible query graphs and sort them. + * Notice, we use fragment checking to simulate graph matching and generate the TOP-k SPARQL queries, which can be executed via GStore or Virtuoso. + * */ + public void process(QueryLogger qlog, HashMap<Integer, SemanticRelation> semRltn) + { + semanticRelations = semRltn; + this.qlog = qlog; + long t1; + t = 10; // Notice, t is adjustable. + + entityPhrasesList.clear(); + entityWordList.clear(); + currentEntityMappings.clear(); + predicatePhraseList.clear(); + predicateSrList.clear(); + currentPredicateMappings.clear(); + + // 1. collect info of constant nodes (entities) + Iterator<Map.Entry<Integer, SemanticRelation>> it = semanticRelations.entrySet().iterator(); + while(it.hasNext()) + { + Map.Entry<Integer, SemanticRelation> entry = it.next(); + SemanticRelation sr = entry.getValue(); + + //We currently handle only Entity & Type constants. TODO: consider Literal. 
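+			// Illustrative sketch (hypothetical candidates): for "Who directed Titanic?", the constant node "Titanic" contributes its ranked candidate list, e.g. [Titanic_(film):90, Titanic_(ship):73, ...], to entityPhrasesList below; edges contribute ranked predicate mappings in step 2, and the top-k join in step 3 enumerates at most t candidates per node/edge, i.e. O(t^n) combinations, pruned by fragment checking.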
+ if(sr.isArg1Constant && !sr.arg1Word.mayType && !sr.arg1Word.mayEnt || sr.isArg2Constant && !sr.arg2Word.mayType && !sr.arg2Word.mayEnt) + { + it.remove(); + continue; + } + + //Type constant will be solved in ScoreAndRanking function. + if(sr.isArg1Constant && sr.arg1Word.mayEnt) + { + if(!entityDictionary.containsKey(sr.arg1Word)) + entityDictionary.put(sr.arg1Word, sr.arg1Word.emList); + entityPhrasesList.add(sr.arg1Word.emList); + entityWordList.add(sr.arg1Word); + } + if(sr.isArg2Constant && !sr.arg2Word.mayType) + { + if (!entityDictionary.containsKey(sr.arg2Word)) + entityDictionary.put(sr.arg2Word, sr.arg2Word.emList); + entityPhrasesList.add(sr.arg2Word.emList); + entityWordList.add(sr.arg2Word); + } + } + + // 2. collect info of edges(relations). + for (Integer key : semanticRelations.keySet()) + { + SemanticRelation sr = semanticRelations.get(key); + predicatePhraseList.add(sr.predicateMappings); + predicateSrList.add(sr); + + // Reduce t when structure enumeration needed. + if(Globals.evaluationMethod > 1 && !sr.isSteadyEdge) + t = 5; + } + + // 3. top-k join + t1 = System.currentTimeMillis(); + if(semanticRelations.size()>0) + topkJoin(semanticRelations); + else + System.out.println("No Valid SemanticRelations."); + + qlog.timeTable.put("TopkJoin", (int)(System.currentTimeMillis()-t1)); + qlog.timeTable.put("TripleCheck", (int)tripleCheckTime); + qlog.timeTable.put("SparqlCheck", (int)sparqlCheckTime); + + Collections.sort(rankedSparqls); + // Notice, use addAll because we may have more than one node recognition decision. + qlog.rankedSparqls.addAll(rankedSparqls); + qlog.entityDictionary = entityDictionary; + + System.out.println("Check query graph count: " + tripleCheckCallCnt + "\nPass single check: " + sparqlCheckCallCnt + "\nPass final check: " + rankedSparqls.size()); + System.out.println("TopkJoin time=" + qlog.timeTable.get("TopkJoin")); + } + + public void topkJoin (HashMap semanticRelations) + { + dfs_entityName(0); + } + + // Each level for a CERTAIN entity + public void dfs_entityName (int level_i) + { + // All entities ready. + if (level_i == entityPhrasesList.size()) + { + dfs_predicate(0); + return; + } + + ArrayList list = entityPhrasesList.get(level_i); + Word w = entityWordList.get(level_i); + int tcount = 0; + for(EntityMapping em : list) + { + if (tcount == t || isAnswerFound) break; + currentEntityMappings.put(w.hashCode(), em); + dfs_entityName(level_i+1); + currentEntityMappings.remove(w.hashCode()); + tcount ++; + } + } + + public void dfs_predicate(int level_i) + { + // All entities & predicates ready, start generate SPARQL. + if (level_i == predicatePhraseList.size()) + { + scoringAndRanking(); + return; + } + + ArrayList list = predicatePhraseList.get(level_i); + SemanticRelation sr = predicateSrList.get(level_i); + if (sr.dependOnSemanticRelation != null) + { + dfs_predicate(level_i+1); + } + else + { + int tcount=0; + for (PredicateMapping pm : list) + { + if (tcount==t || isAnswerFound) break; + currentPredicateMappings.put(sr.hashCode(), pm); + dfs_predicate(level_i+1); + currentPredicateMappings.remove(sr.hashCode()); + tcount++; + + // Pruning (If we do not change predicate of firstFalseSr, it will still false, so just return) + if(firstFalseSr != null) + { + if(firstFalseSr != sr) return; + else firstFalseSr = null; + } + } + + // "null" means we drop this edge, this is how we enumerate structure. 
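+				// Note on the enumeration size: an unsteady edge thus has up to t+1 choices during the join: t predicate mappings plus "null" (the edge is dropped). With t=5 and two unsteady edges this already yields up to 6*6=36 candidate structures, which is why t is reduced to 5 in process() when structure enumeration is needed.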
+ if(Globals.evaluationMethod == 2 && sr.isSteadyEdge == false) + { + currentPredicateMappings.put(sr.hashCode(), null); + dfs_predicate(level_i+1); + currentPredicateMappings.remove(sr.hashCode()); + tcount++; + } + } + } + + /* + * Run this function when all nodes/edges have set value (through currentEntityMappings、currentPredicateMappings) + * Generate SPARQL according current ENTs and RELATIONs, then fragment checking + * Notice: add embedded type information: + * eg, ?who ?how --add--> ?who | ?book --add--> ?book + * Notice: add constant type information: + * eg, ask: + * Notice: add embedded triple information: + * eg, ?Canadians --add--> ?Canadians + * */ + public void scoringAndRanking() + { + firstFalseSr = null; + Sparql sparql = new Sparql(semanticRelations); + + // A simple way to judge connectivity (may incorrect when nodes number >= 6) + //TODO: a standard method to judge CONNECTIVITY + HashMap count = new HashMap(); + int edgeCnt = 0; + for (Integer key : semanticRelations.keySet()) + { + SemanticRelation sr = semanticRelations.get(key); + if(currentPredicateMappings.get(sr.hashCode()) == null) + continue; + + edgeCnt++; + int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode(); + if(!count.containsKey(v1)) + count.put(v1, 1); + else + count.put(v1, count.get(v1)+1); + if(!count.containsKey(v2)) + count.put(v2, 1); + else + count.put(v2, count.get(v2)+1); + } + if(count.size() < qlog.semanticUnitList.size()) + return; + if(edgeCnt == 0) + return; + if(edgeCnt > 1) + { + for (Integer key : semanticRelations.keySet()) + { + SemanticRelation sr = semanticRelations.get(key); + if(currentPredicateMappings.get(sr.hashCode()) == null) + continue; + int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode(); + if(count.get(v1) == 1 && count.get(v2) == 1) + return; + } + } + + // Now the graph is connected, start to generate SPARQL. + HashSet typeSetFlag = new HashSet(); + for (Integer key : semanticRelations.keySet()) + { + SemanticRelation sr = semanticRelations.get(key); + String sub, obj; + int subjId = -1, objId = -1; + int pid; + double score = 1; + boolean isSubjObjOrderSameWithSemRltn = true; + +// argument1 + if(sr.isArg1Constant && (sr.arg1Word.mayEnt || sr.arg1Word.mayType) ) // Constant + { + // For subject, entity has higher priority. + if(sr.arg1Word.mayEnt) + { + EntityMapping em = currentEntityMappings.get(sr.arg1Word.hashCode()); + subjId = em.entityID; + sub = em.entityName; + score *= em.score; + } + else + { + TypeMapping tm = sr.arg1Word.tmList.get(0); + subjId = Triple.TYPE_ROLE_ID; + sub = tm.typeName; + score *= (tm.score*100); // Generalization. type score: [0,1], entity score: [0,100]. + } + } + else // Variable + { + subjId = Triple.VAR_ROLE_ID; + sub = "?" + sr.arg1Word.originalForm; + } + // Embedded Type info of argument1(variable type) | eg, ?book + // Notice, mayType & mayExtendVariable is mutual-exclusive. (see constantVariableRecognition) + // Notice, we do NOT consider types of [?who,?where...] now. 
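+			// Illustrative sketch (hypothetical types): for a variable node ?book whose tmList is [Book, WrittenWork], the code below joins the type short names with '|' and trims the trailing delimiter, yielding one embedded type triple ?book <type> Book|WrittenWork with base score 10; the '|' alternatives are resolved later by fragment checking.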
+ Triple subt = null; + if (!sr.isArg1Constant && sr.arg1Word.mayType && sr.arg1Word.tmList != null && sr.arg1Word.tmList.size() > 0 && !typeSetFlag.contains(sub)) + { + StringBuilder type = new StringBuilder(""); + for (TypeMapping tm: sr.arg1Word.tmList) + { + Integer tt = tm.typeID; + if(tt != -1) + type.append(TypeFragment.typeId2ShortName.get(tt)); + else + type.append(tm.typeName); + type.append('|'); + } + String ttt = type.substring(0, type.length()-1); + subt = new Triple(subjId, sub, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10); + subt.typeSubjectWord = sr.arg1Word; + + if(sr.arg1Word.tmList.get(0).prefferdRelation == -1) + subt = null; + } +// predicate + SemanticRelation dep = sr.dependOnSemanticRelation; + PredicateMapping pm = null; + if (dep == null) + pm = currentPredicateMappings.get(sr.hashCode()); + else + pm = currentPredicateMappings.get(dep.hashCode()); + if(pm == null) + continue; + + pid = pm.pid; + score *= pm.score; +// argument2 + if(sr.isArg2Constant && (sr.arg2Word.mayEnt || sr.arg2Word.mayType) ) + { + if(!sr.arg2Word.mayType) + { + EntityMapping em = currentEntityMappings.get(sr.arg2Word.hashCode()); + objId = em.entityID; + obj = em.entityName; + score *= em.score; + } + else + { + TypeMapping tm = sr.arg2Word.tmList.get(0); + objId = Triple.TYPE_ROLE_ID; + obj = tm.typeName; + score *= (tm.score*100); + } + } + else + { + objId = Triple.VAR_ROLE_ID; + obj = "?" + sr.arg2Word.getFullEntityName(); + } + // Type info of argument2 + Triple objt = null; + if (sr.arg2Word.tmList != null && sr.arg2Word.tmList.size() > 0 && !typeSetFlag.contains(obj) && !sr.isArg2Constant) + { + StringBuilder type = new StringBuilder(""); + for (TypeMapping tm : sr.arg2Word.tmList) + { + Integer tt = tm.typeID; + if(tt != -1) + type.append(TypeFragment.typeId2ShortName.get(tt)); + else + type.append(tm.typeName); + type.append('|'); + } + String ttt = type.substring(0, type.length()-1); + objt = new Triple(objId, obj, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10); + objt.typeSubjectWord = sr.arg2Word; + + if(sr.arg2Word.tmList.get(0).prefferdRelation == -1) + objt = null; + } + + // Prune. 
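+			// Note: if the object is a type constant but the predicate is not the type predicate (e.g. a hypothetical <?x, director, Film>), no assignment in this branch can be valid, so the whole combination is abandoned below.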
+ if(objId == Triple.TYPE_ROLE_ID && pid != Globals.pd.typePredicateID) + return; + + // Consider orders rely on LITERAL relations | at least one argument has TYPE info + if (RelationFragment.isLiteral(pid) && (subt != null || objt != null)) + { + if (sub.startsWith("?") && obj.startsWith("?")) // two variables + { + // two variables have both possibility as object literal + if (subt != null) { + subt.object += ("|" + "literal_HRZ"); + } + if (objt != null) { + objt.object += ("|" + "literal_HRZ"); + } + + if (subt==null && objt!=null) + { + // if object has type, subject has no type, more possible to change sub/obj because literal has no type in general [however maybe have yago:type] + String temp = sub; + int tmpId = subjId; + sub = obj; + subjId = objId; + obj = temp; + objId = tmpId; + isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn; + } + + } + else if (sub.startsWith("?") && !obj.startsWith("?")) { + // need change subj/obj order + if (subt != null) { + subt.object += ("|" + "literal_HRZ"); + } + String temp = sub; + int tmpId = subjId; + sub = obj; + subjId = objId; + obj = temp; + objId = tmpId; + isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn; + //System.out.println("here: "+sub+obj); + + } + else if (obj.startsWith("?") && !sub.startsWith("?")) { + if (objt != null) { + objt.object += ("|" + "literal_HRZ"); + } + } + } + + Triple t = new Triple(subjId, sub, pid, objId, obj, sr, score,isSubjObjOrderSameWithSemRltn); + //System.out.println("triple: "+t+" "+isTripleCompatibleCanSwap(t)); + + sparql.addTriple(t); + + // score of subject/object's type should correlative with the score of triple itself + if (subt != null) + { + subt.score += t.score*0.2; + sparql.addTriple(subt); + typeSetFlag.add(subt.subject); // be cautious to NOT use sub, it may has changed subj/obj order + } + if (objt != null) + { + objt.score += t.score*0.2; + sparql.addTriple(objt); + typeSetFlag.add(objt.subject); + } + + // add argument' embedded triple, eg, ?canadian + if(!sr.isArg1Constant && sr.arg1Word.mayExtendVariable && sr.arg1Word.embbededTriple != null) + { + sparql.addTriple(sr.arg1Word.embbededTriple); + } + if(!sr.isArg2Constant && sr.arg2Word.mayExtendVariable && sr.arg2Word.embbededTriple != null) + { + sparql.addTriple(sr.arg2Word.embbededTriple); + } + + sparql.adjustTriplesOrder(); + } + + // deduplicate + sparql.deduplicate(); + if(checkedSparqlStrs.contains(sparql.toStringForGStore2())) + return; + checkedSparqlStrs.add(sparql.toStringForGStore2()); + + if (!qlog.MODE_fragment) { + // Method 1: do NOT check compatibility + rankedSparqls.add(sparql); + isAnswerFound = true; + } + else { + // Method 2: check compatibility by FRAGMENT (offline index) + //1. single-triple check (a quickly prune), allow to swap subject and object. Try to adjust to the best order. + tripleCheckCallCnt++; + long t1 = System.currentTimeMillis(); + for (Triple t : sparql.tripleList) + if(t.predicateID!=Globals.pd.typePredicateID && !isTripleCompatibleCanSwap(t)) + { + firstFalseSr = t.semRltn; + return; + } + tripleCheckTime += (System.currentTimeMillis()-t1); + + //2. SPARQL check (consider the interact between all triples), allow to swap subject and object. 
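+			// Note on the cost: the final check re-enumerates subject/object order per triple; each non-type triple may also be tried swapped with its score * 0.8 (see enumerateSubjObjOrders below), so a query with n swappable triples expands into at most 2^n ordered variants, e.g. 3 triples -> up to 8 candidates to validate.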
+ t1 = System.currentTimeMillis(); + sparqlCheckCallCnt++; + enumerateSubjObjOrders(sparql, new Sparql(sparql.semanticRelations), 0); + sparqlCheckTime += (System.currentTimeMillis()-t1); + } + + } + + /* + * Notice: + * typeId=-1 then no data fragment + * */ + public static TypeFragment getTypeFragmentByWord(Word word) + { + TypeFragment tf = null; + if(word!=null && word.tmList!=null && word.tmList.size()>0) + { + int typeId = word.tmList.get(0).typeID; + if(typeId != -1) + tf = TypeFragment.typeFragments.get(typeId); + } + return tf; + } + + /* + * (Just PRE CHECK [single triple check] in this function, the final check in enumerateSubjObjOrders which utilize more INDEX) + * notice: predicate = type cannot entrance this function + * */ + public boolean isTripleCompatibleCanSwap (Triple t) { + + if (qlog.s.sentenceType==SentenceType.GeneralQuestion) + { + if (fragmentCompatible2(t.subjId, t.predicateID, t.objId) > + fragmentCompatible2(t.objId, t.predicateID, t.subjId)) + t.swapSubjObjOrder(); + + if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) + return true; + return false; + + } + else + { + //var & var + if(t.subject.startsWith("?") && t.object.startsWith("?")) + { + Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord(); + TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord); + + //based on whether the two varabile's type fragment's in/out edge contain predicate, calculate whether need change order + //just vote + int nowOrderCnt = 0, reverseOrderCnt = 0; + if(subjTf == null || subjTf.outEdges.contains(t.predicateID)) + nowOrderCnt ++; + if(objTf == null || objTf.inEdges.contains(t.predicateID)) + nowOrderCnt ++; + if(subjTf == null || subjTf.inEdges.contains(t.predicateID)) + reverseOrderCnt ++; + if(objTf == null || objTf.outEdges.contains(t.predicateID)) + reverseOrderCnt ++; + + if(nowOrderCnt<2 && reverseOrderCnt<2) + return false; + + else if(nowOrderCnt > reverseOrderCnt) + { + // do nothing + } + else if(reverseOrderCnt > nowOrderCnt) + { + t.swapSubjObjOrder(); + } + else //now order and reverse order both passed type fragment checking, need SELECT one + { + //rule1: ?inventor ?occupation || ... ?name -> more similar string will be put latter + String p = Globals.pd.getPredicateById(t.predicateID); + int ed1 = EntityFragment.calEditDistance(subjWord.baseForm, p); + int ed2 = EntityFragment.calEditDistance(objWord.baseForm, p); + if(ed1 < ed2) + { + t.swapSubjObjOrder(); + } + } + return true; + } + ///ent & ent || var & ent + else + { + boolean flag = false; + if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) { + flag = true; + } + else if (fragmentCompatible(t.objId, t.predicateID, t.subjId)) { + t.swapSubjObjOrder(); + flag = true; + } + + // Var & Ent | ?city & ?city : is invalid for City | Notice: the data often dirty and can not prune correctly. 
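+			// Illustrative sketch (hypothetical entity): for <Titanic, director, ?who>, if "director" occurs among Titanic's out-edges the current order is kept; if it only occurs among its in-edges the triple is swapped to <?who, director, Titanic>. The block below applies the same in/out-edge test to the variable side's type fragment.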
+ if(flag == true && (t.subject.startsWith("?") || t.object.startsWith("?"))) + { + Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord(); + TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord); + if(subjTf != null) + { + if(subjTf.outEdges.contains(t.predicateID)) + flag = true; + else if(subjTf.inEdges.contains(t.predicateID)) + { + t.swapSubjObjOrder(); + flag = true; + } + else + flag = false; + } + else if(objTf != null) + { + if(objTf.inEdges.contains(t.predicateID)) + flag = true; + else if(objTf.outEdges.contains(t.predicateID)) + { + t.swapSubjObjOrder(); + flag = true; + } + else + flag = false; + } + } + + return flag; + } + + } + } + + public boolean isTripleCompatibleNotSwap (Triple t) { + if (t.predicateID == Globals.pd.typePredicateID) { + return true; + } + else if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) { + return true; + } + else { + return false; + } + } + + public boolean fragmentCompatible (int id1, int pid, int id2) { + EntityFragment ef1 = efd.getEntityFragmentByEid(id1); + EntityFragment ef2 = efd.getEntityFragmentByEid(id2); + + // a valid entity MUST have a fragment + if (id1!=Triple.TYPE_ROLE_ID && id1!=Triple.VAR_ROLE_ID && ef1 == null) return false; + if (id2!=Triple.TYPE_ROLE_ID && id2!=Triple.VAR_ROLE_ID && ef2 == null) return false; + + boolean ef1_constant = (ef1==null)?false:true; + boolean ef2_constant = (ef2==null)?false:true; + int entityCnt=0,compatibleCnt=0; + if(ef1_constant) { + entityCnt++; + if (ef1.outEdges.contains(pid)) + compatibleCnt++; +// else // record the false (entity, predicate) pair +// { +// falseEntPres.add(new Pair(id1,pid)); +// } + } + + if (ef2_constant) { + entityCnt++; + if (ef2.inEdges.contains(pid)) + compatibleCnt++; +// else // record the false (predicate, entity) pair +// { +// falsePreEnts.add(new Pair(pid,id2)); +// } + } + + // for SELECT sparql, require an EXACT match between predicate and subject/object; ASK sparql can be relaxed + if (qlog.s.sentenceType==SentenceType.GeneralQuestion) + return entityCnt-compatibleCnt<=1; + else + return entityCnt==compatibleCnt; + + } + + public int fragmentCompatible2 (int id1, int pid, int id2) { + EntityFragment ef1 = efd.getEntityFragmentByEid(id1); + EntityFragment ef2 = efd.getEntityFragmentByEid(id2); + + int entityCnt=0,compatibleCnt=0; + if(id1 != Triple.VAR_ROLE_ID && id1 != Triple.TYPE_ROLE_ID) { + entityCnt++; + if (ef1!=null && ef1.outEdges.contains(pid)) + compatibleCnt++; + } + + if (id2 != Triple.VAR_ROLE_ID && id2 != Triple.TYPE_ROLE_ID) { + entityCnt++; + if (ef2!=null && ef2.inEdges.contains(pid)) + compatibleCnt++; + } + + return entityCnt-compatibleCnt; + } + + public boolean checkConstantConsistency (Sparql spql) { + HashMap<String, String> constants = new HashMap<String, String>(); + for (Triple t : spql.tripleList) { + if (!t.subject.startsWith("?")) { + String e = t.getSubjectWord().getFullEntityName(); + if (!constants.containsKey(e)) + constants.put(e, t.subject); + else { + if (!constants.get(e).equals(t.subject)) + return false; + } + } + if (!t.object.startsWith("?")) { + String e = t.getObjectWord().getFullEntityName(); + if (!constants.containsKey(e)) + constants.put(e, t.object); + else { + if (!constants.get(e).equals(t.object)) + return false; + } + } + } + return true; + } + + public void reviseScoreByTripleOrders(Sparql spq) + { + Triple shouldDel = null; + for(Triple triple: spq.tripleList) + { + // eg, a triple whose subject is ?who may need to be punished (or dropped). 
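+			// Note: this heuristic assumes that predicates like "president", "starring" or "producer" rarely take a person variable (?who) as subject, so such a triple's score is divided by 10, and the triple is deleted outright when its edge is not steady.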
+ if(triple.subject.toLowerCase().equals("?who")) + { + String rel = Globals.pd.id_2_predicate.get(triple.predicateID); + if(rel.equals("president") || rel.equals("starring") || rel.equals("producer")) + { + spq.score -= triple.score; + triple.score /= 10; + spq.score += triple.score; + if(triple.semRltn!=null && triple.semRltn.isSteadyEdge == false) + shouldDel = triple; + } + } + } + if(shouldDel != null) + spq.delTriple(shouldDel); + } + + // enumerate subject/object order, fragment check + // Modify score of "ask one triple" + public boolean enumerateSubjObjOrders (Sparql originalSpq, Sparql currentSpq, int level) + { + if (level == originalSpq.tripleList.size()) + { + if(currentSpq.tripleList.size() == 0) + return false; + + CompatibilityChecker cc = new CompatibilityChecker(efd); + + if (qlog.s.sentenceType==SentenceType.GeneralQuestion) //ask where sparql: no need for fragment check + { + if(cc.isSparqlCompatible3(currentSpq)) //reward score for "TRUE" + { + for(Triple triple: currentSpq.tripleList) + triple.addScore(triple.getScore()); + } + rankedSparqls.add(currentSpq.copy()); + return true; + } + try + { + sparqlCheckId++; + if (cc.isSparqlCompatible3(currentSpq)) + { + //eg, ?who + //When query graph contains circle, we just prune this edge + Sparql sparql = currentSpq.copy(); + reviseScoreByTripleOrders(sparql); + if(!rankedSparqls.contains(sparql)) + rankedSparqls.add(sparql); + return true; + } + } + catch (Exception e) { + System.out.println("[CompatibilityChecker ERROR]"+currentSpq); + e.printStackTrace(); + } + return false; + } + + Triple cur_t = originalSpq.tripleList.get(level); + + // first try default order + currentSpq.addTriple(cur_t); + boolean flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); + currentSpq.removeLastTriple(); + + // !deprecated: not change triple order for [literal relation] +// if (RelationFragment.isLiteral(cur_t.predicateID)) return false; + + // Enumerate reserve/drop the type info + if (cur_t.predicateID == Globals.pd.typePredicateID) + { + flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); + return flag; + } + else + { + // single triple check after swap + Triple swapped_t = cur_t.copySwap(); + swapped_t.score = swapped_t.score*0.8; + if (isTripleCompatibleNotSwap(swapped_t)) + { + currentSpq.addTriple(swapped_t); + flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); + currentSpq.removeLastTriple(); + } + return flag; + } + } + +} diff --git a/src/qa/parsing/BuildQueryGraph.java b/src/qa/parsing/BuildQueryGraph.java new file mode 100644 index 0000000..c3cf7f7 --- /dev/null +++ b/src/qa/parsing/BuildQueryGraph.java @@ -0,0 +1,1201 @@ +package qa.parsing; + +//import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.PriorityQueue; +import java.util.Queue; + +import fgmt.EntityFragment; +import fgmt.TypeFragment; +import log.QueryLogger; +import nlp.ds.*; +import nlp.ds.Sentence.SentenceType; +import qa.Globals; +import qa.extract.*; +import qa.mapping.SemanticItemMapping; +import rdf.PredicateMapping; +import rdf.SemanticQueryGraph; +import rdf.Triple; +import rdf.SemanticRelation; +import rdf.SimpleRelation; +import rdf.SemanticUnit; +//import paradict.ParaphraseDictionary; + +/* + * The core class to build query graph, i.e, to generate SPARQL queries. 
+ * */ +public class BuildQueryGraph +{ + public ArrayList semanticUnitList = new ArrayList(); + public ArrayList whList = new ArrayList(); + public ArrayList stopNodeList = new ArrayList(); + public ArrayList modifierList = new ArrayList(); + public HashSet visited = new HashSet(); + public HashMap matchedSemanticRelations = new HashMap(); + + public int aggregationType = -1; // 1:how many 2:latest/first... + + public BuildQueryGraph() + { + whList.add("what"); + whList.add("which"); + whList.add("who"); + whList.add("whom"); + whList.add("when"); + whList.add("how"); + whList.add("where"); + + // Bad words for NODE. (base form) + stopNodeList.add("list"); + stopNodeList.add("give"); + stopNodeList.add("show"); + stopNodeList.add("star"); + stopNodeList.add("theme"); + stopNodeList.add("world"); + stopNodeList.add("independence"); + stopNodeList.add("office"); + stopNodeList.add("year"); + stopNodeList.add("work"); + } + + public void fixStopWord(QueryLogger qlog, DependencyTree ds) + { + String qStr = qlog.s.plainText.toLowerCase(); + + //... [which] + for(int i=2;i process(QueryLogger qlog) + { + try + { + semanticUnitList = new ArrayList(); + + DependencyTree ds = qlog.s.dependencyTreeStanford; + if(qlog.isMaltParserUsed) + ds = qlog.s.dependencyTreeMalt; + + long t = System.currentTimeMillis(); + +/* Prepare for building query graph: + * 0)Fix stop nodes. + * 1)Detect modified node(the center of semantic unit, compose the basic structure of query graph); + * Detect modifier (include ent/type/adj, NOT appear in basic structure, may be SUPPLEMNT info of query graph, degree always be 1). + * 2)Detect the target, also the start node to build query graph. + * 3)Coreference resolution. + * */ + //0) Fix stop words + fixStopWord(qlog, ds); + + //1) Detect Modifier/Modified + //rely on sentence (rather than dependency tree) + //with some ADJUSTMENT (eg, ent+noun(noType&&noEnt) -> noun.omitNode=TRUE) + for(Word word: qlog.s.words) + getTheModifiedWordBySentence(qlog.s, word); //Find continuous modifier + for(Word word: qlog.s.words) + getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier + for(Word word: qlog.s.words) + if(word.modifiedWord == null) //Other words modify themselves. NOTICE: only can be called after detecting all modifier. + word.modifiedWord = word; + + //print log + for(Word word: qlog.s.words) + { + if(word.modifiedWord != null && word.modifiedWord != word) + { + modifierList.add(word); + qlog.SQGlog += "++++ Modify detect: "+word+" --> " + word.modifiedWord + "\n"; + } + } + + //2) Detect target & 3) Coreference resolution + DependencyTreeNode target = detectTarget(ds,qlog); + qlog.SQGlog += "++++ Target detect: "+target+"\n"; + + if(target == null) + return null; + + qlog.target = target.word; + // !target can NOT be entity. (except general question)| which [city] has most people? + if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.emList!=null) + { + //Counter example:Give me all Seven_Wonders_of_the_Ancient_World | (in fact, it not ENT, but CATEGORY, ?x subject Seve...) + target.word.mayEnt = false; + target.word.emList.clear(); + } + + //3) Coreference resolution, now we just OMIT the represented one. + //TODO: In some cases, the two node should be MERGED, instead of OMITTING directly. 
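+		// Illustrative sketch (hypothetical question): for "Which software has been developed by organizations founded in California?", the BFS below starting from the target roughly yields the units {?software, organizations, California} connected in a path; with evaluationMethod > 1 the visited set is cleared per node, so circles (extra candidate edges) are also allowed.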
+ CorefResolution cr = new CorefResolution(); + + qlog.timeTable.put("BQG_prepare", (int)(System.currentTimeMillis()-t)); +/* Prepare Done */ + + t = System.currentTimeMillis(); + DependencyTreeNode curCenterNode = target; + ArrayList expandNodeList; + Queue queue = new LinkedList(); + HashSet expandedNodes = new HashSet(); + queue.add(target); + expandedNodes.add(target); + visited.clear(); + + //step1: build the structure of query graph | notice, we allow CIRCLE and WRONG edge (for evaluation method 2) + while((curCenterNode = queue.poll()) != null) + { + if(curCenterNode.word.represent != null || cr.getRefWord(curCenterNode.word,ds,qlog) != null ) + { + if(curCenterNode != target) // if target be represented, continue will get empty semantic unit list. + //TODO: it may lose other nodes when prune the represent/coref nodes, the better way is do coref resolution after structure construction. + continue; + } + + //Notice, the following codes guarantee all possible edges (allow CIRCLE). + //Otherwise, NO CIRCLE, and the structure may be different by changing target. + if(Globals.evaluationMethod > 1) + { + visited.clear(); + } + + SemanticUnit curSU = new SemanticUnit(curCenterNode.word,true); + expandNodeList = new ArrayList(); + dfs(curCenterNode, curCenterNode, expandNodeList); // search neighbors of current node + // expand nodes + for(DependencyTreeNode expandNode: expandNodeList) + { + if(!expandedNodes.contains(expandNode)) + { + queue.add(expandNode); + expandedNodes.add(expandNode); + } + } + + semanticUnitList.add(curSU); + for(DependencyTreeNode expandNode: expandNodeList) + { + String subj = curCenterNode.word.getBaseFormEntityName(); + String obj = expandNode.word.getBaseFormEntityName(); + + //omit inner relation + if(subj.equals(obj)) + continue; + + //we just omit represented nodes now. 
+ //TODO: Co-refernce (continue may not suitable in some cases) + if(expandNode.word.represent != null) + continue; + + //expandNode is a new SemanticUnit + SemanticUnit expandSU = new SemanticUnit(expandNode.word,false); + //expandUnit is the neighbor of current unit + curSU.neighborUnitList.add(expandSU); + } + } + qlog.timeTable.put("BQG_structure", (int)(System.currentTimeMillis()-t)); + + //step2: Find relations (Notice, we regard that the coreference have been resolved now) + t = System.currentTimeMillis(); + qlog.semanticUnitList = new ArrayList(); + extractRelation(semanticUnitList, qlog); // RE for each two connected nodes + matchRelation(semanticUnitList, qlog); // Drop the nodes who cannot find relations (except implicit relation) + qlog.timeTable.put("BQG_relation", (int)(System.currentTimeMillis()-t)); + + //Prepare for item mapping + TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary + TypeRecognition.constantVariableRecognition(qlog.semanticRelations, qlog); // Constant or Variable, embedded triples + + //(just for display) + recordOriginalTriples(semanticUnitList, qlog); + + //step3: item mapping & top-k join + t = System.currentTimeMillis(); + SemanticItemMapping step5 = new SemanticItemMapping(); + step5.process(qlog, qlog.semanticRelations); //top-k join (generate SPARQL queries), disambiguation + qlog.timeTable.put("BQG_topkjoin", (int)(System.currentTimeMillis()-t)); + + //step6: implicit relation [modify word] + t = System.currentTimeMillis(); + ExtractImplicitRelation step6 = new ExtractImplicitRelation(); + step6.supplementTriplesByModifyWord(qlog); + qlog.timeTable.put("BQG_implicit", (int)(System.currentTimeMillis()-t)); + + } + catch (Exception e) { + e.printStackTrace(); + } + + return semanticUnitList; + } + + /* + * For experiment. + */ + public ArrayList getNodeList(QueryLogger qlog, DependencyTree ds) + { + semanticUnitList = new ArrayList(); + + // For ComplexQuestions or WebQuestions, only consider wh-word and at most two entities. + if(Globals.runningBenchmark.equals("CQ") || Globals.runningBenchmark.equals("WQ")) + { +// DependencyTreeNode target = ds.nodesList.get(0); +// if(Globals.runningBenchmark.equals("CQ")) +// target = detectTargetForCQ(ds, qlog); +// qlog.target = target.word; +// qlog.SQGlog += "++++ Target detect: "+target+"\n"; +// +// detectTopicConstraint(qlog); +// semanticUnitList.add(new SemanticUnit(qlog.target, false)); //Set variable to object +// if(topicEnt != null) +// { +// semanticUnitList.add(new SemanticUnit(topicEnt, true)); //Set entity to subject +// } +// if(constraintEnt != null) +// { +// semanticUnitList.add(new SemanticUnit(constraintEnt, true)); //Set entity to subject +// } + } + // For general cases (e.g, QALD), consider internal variables. + else + { + for(DependencyTreeNode dtn: ds.nodesList) + { + if(isNodeWoCorefRe(dtn)) // ! Omit the coreference resolution rules ! + { + semanticUnitList.add(new SemanticUnit(dtn.word, true)); //No prefer subject (default is true) + } + } + } + return semanticUnitList; + } + + /* + * (For Experiment) Build query graph using STATE TRANSITION method based on 4 operations (with 4 conditions). + * 1. Condition for Connect operation: do and must do | no other nodes on simple path in DS tree. + * 2. Condition for Merge operation: do and must do | heuristic rules of CoReference Resolution. + * 3. Condition for Fold operation: do or not do | no matches of low confidence of an edge. + * 4. 
Condition for Expand operation: do and must do | has corresponding information. + * */ + public ArrayList processEXP(QueryLogger qlog) + { + //0) Fix stop words + DependencyTree ds = qlog.s.dependencyTreeStanford; + if(qlog.isMaltParserUsed) + ds = qlog.s.dependencyTreeMalt; + fixStopWord(qlog, ds); + + //1) Detect Modifier/Modified + //rely on sentence (rather than dependency tree) + //with some ADJUSTMENT (eg, ent+noun(noType&&noEnt) -> noun.omitNode=TRUE) + for(Word word: qlog.s.words) + getTheModifiedWordBySentence(qlog.s, word); //Find continuous modifier + for(Word word: qlog.s.words) + getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier + for(Word word: qlog.s.words) + if(word.modifiedWord == null) //Other words modify themselves. NOTICE: only can be called after detecting all modifier. + word.modifiedWord = word; + + //print log + for(Word word: qlog.s.words) + { + if(word.modifiedWord != null && word.modifiedWord != word) + { + modifierList.add(word); + qlog.SQGlog += "++++ Modify detect: "+word+" --> " + word.modifiedWord + "\n"; + } + } + + //2) Detect target & 3) Coreference resolution + DependencyTreeNode target = detectTarget(ds,qlog); + qlog.SQGlog += "++++ Target detect: "+target+"\n"; + + if(target == null) + return null; + + qlog.target = target.word; + // !target can NOT be entity. (except general question)| which [city] has most people? + if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.emList!=null) + { + //Counter example:Give me all Seven_Wonders_of_the_Ancient_World | (in fact, it not ENT, but CATEGORY, ?x subject Seve...) + target.word.mayEnt = false; + target.word.emList.clear(); + } + + try + { + // step1: get node list + semanticUnitList = getNodeList(qlog, ds); + if(semanticUnitList == null || semanticUnitList.isEmpty()) + { + qlog.SQGlog += "ERROR: no nodes found."; + return null; + } + + // step2: extract all potential relations + long t = System.currentTimeMillis(); + System.out.println("Potential Relation Extraction start ..."); + extractPotentialSemanticRelations(semanticUnitList, qlog); + qlog.timeTable.put("BQG_relation", (int)(System.currentTimeMillis()-t)); + + // setp3: build query graph structure by 4 operations + t = System.currentTimeMillis(); + SemanticQueryGraph bestSQG = null; + if(Globals.usingOperationCondition) + { + //TODO: use operation condition + } + else + { + // for experiment, do not use conditions. + PriorityQueue QGs = new PriorityQueue(); + HashSet visited = new HashSet<>(); + //Initial state: all nodes isolated. + SemanticQueryGraph head = new SemanticQueryGraph(semanticUnitList); + QGs.add(head); + + while(!QGs.isEmpty()) + { + head = QGs.poll(); + visited.add(head); + + //Judge: is it a final state? 
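+					// Note: this is a best-first search; partial query graphs are popped in score order, successors are generated by the Connect and Merge operations below and scored against the potential semantic relations, so the first final state popped is returned as the top-1 SQG.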
+ if(head.isFinalState()) + { + bestSQG = head; + break; // now we just find the top-1 SQG + } + + //SQG generation + //Connect (enumerate) + for(SemanticUnit u: head.semanticUnitList) + for(SemanticUnit v: head.semanticUnitList) + if(!u.equals(v) && !u.neighborUnitList.contains(v) && !v.neighborUnitList.contains(u)) + { + SemanticQueryGraph tail = new SemanticQueryGraph(head); + tail.connect(u, v); + if(!QGs.contains(tail) && !visited.contains(tail)) + { + tail.calculateScore(qlog.potentialSemanticRelations); + QGs.add(tail); + } + } + + //Merge (coref resolution) + if(head.semanticUnitList.size() > 2) + for(SemanticUnit u: head.semanticUnitList) + for(SemanticUnit v: head.semanticUnitList) + if(!u.equals(v) && (!u.neighborUnitList.contains(v) && !v.neighborUnitList.contains(u)) || (u.neighborUnitList.contains(v) && v.neighborUnitList.contains(u))) + { + SemanticQueryGraph tail = new SemanticQueryGraph(head); + tail.merge(u, v); + if(!QGs.contains(tail) && !visited.contains(tail)) + { + tail.calculateScore(qlog.potentialSemanticRelations); + QGs.add(tail); + } + } + } + } + qlog.timeTable.put("BQG_structure", (int)(System.currentTimeMillis()-t)); + + //Relation Extraction by potentialSR + qlog.semanticUnitList = new ArrayList(); + qlog.semanticRelations = bestSQG.semanticRelations; + semanticUnitList = bestSQG.semanticUnitList; + matchRelation(semanticUnitList, qlog); + + //Prepare for item mapping + TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary + TypeRecognition.constantVariableRecognition(qlog.semanticRelations, qlog); // Constant or Variable, embedded triples + + //(just for display) + recordOriginalTriples(semanticUnitList, qlog); + + //step3: item mapping & top-k join + t = System.currentTimeMillis(); + SemanticItemMapping step5 = new SemanticItemMapping(); + step5.process(qlog, qlog.semanticRelations); //top-k join (generate SPARQL queries), disambiguation + qlog.timeTable.put("BQG_topkjoin", (int)(System.currentTimeMillis()-t)); + + //step6: implicit relation [modify word] + t = System.currentTimeMillis(); + ExtractImplicitRelation step6 = new ExtractImplicitRelation(); + step6.supplementTriplesByModifyWord(qlog); + qlog.timeTable.put("BQG_implicit", (int)(System.currentTimeMillis()-t)); + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return semanticUnitList; + } + + public void extractPotentialSemanticRelations(ArrayList semanticUnitList, QueryLogger qlog) + { + ExtractRelation er = new ExtractRelation(); + ArrayList simpleRelations = new ArrayList(); + for(SemanticUnit curSU: semanticUnitList) + { + for(SemanticUnit expandSU: semanticUnitList) + { + //Deduplicate + if(curSU.centerWord.position > expandSU.centerWord.position) + continue; + + ArrayList tmpRelations = null; + //get simple relations by PARAPHRASE + tmpRelations = er.findRelationsBetweenTwoUnit(curSU, expandSU, qlog); + if(tmpRelations!=null && tmpRelations.size()>0) + simpleRelations.addAll(tmpRelations); + else + { + tmpRelations = new ArrayList(); + //Copy relations (for 'and', 'as soon as'...) |eg, In which films did Julia_Roberts and Richard_Gere play? 
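+				// Illustrative sketch: in "In which films did Julia_Roberts and Richard_Gere play?", curSU = Julia_Roberts and expandSU = Richard_Gere ("and" sits between them); a relation already extracted for Julia_Roberts, say a hypothetical <Julia_Roberts, play, ?films>, is cloned below with its argument replaced, adding <Richard_Gere, play, ?films> as well.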
+ //TODO: judge by dependency tree | other way to supplement relations + if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals("and")) + { + for(SimpleRelation sr: simpleRelations) + { + if(sr.arg1Word == curSU.centerWord) + { + SimpleRelation tsr = new SimpleRelation(sr); + tsr.arg1Word = expandSU.centerWord; + tmpRelations.add(tsr); + } + else if (sr.arg2Word == curSU.centerWord) + { + SimpleRelation tsr = new SimpleRelation(sr); + tsr.arg2Word = expandSU.centerWord; + tmpRelations.add(tsr); + } + } + if(tmpRelations.size() > 0) + simpleRelations.addAll(tmpRelations); + } + } + } + } + + //get semantic relations + HashMap semanticRelations = er.groupSimpleRelationsByArgsAndMapPredicate(simpleRelations); + + if(Globals.evaluationMethod > 1) + { + //TODO: recognize unsteady edge by judging connectivity (now we just recognize all edges are unsteady when it has circle) + if(semanticRelations.size() >= semanticUnitList.size()) // has CIRCLE + for(SemanticRelation sr: semanticRelations.values()) + { + sr.isSteadyEdge = false; + } + } + + qlog.potentialSemanticRelations = semanticRelations; + } + + public void extractRelation(ArrayList semanticUnitList, QueryLogger qlog) + { + ExtractRelation er = new ExtractRelation(); + ArrayList simpleRelations = new ArrayList(); + for(SemanticUnit curSU: semanticUnitList) + { + for(SemanticUnit expandSU: curSU.neighborUnitList) + { + //Deduplicate | method 1 only can generate DIRECTED edge + if(Globals.evaluationMethod > 1 && curSU.centerWord.position > expandSU.centerWord.position) + continue; + + ArrayList tmpRelations = null; + //get simple relations by PARAPHRASE + tmpRelations = er.findRelationsBetweenTwoUnit(curSU, expandSU, qlog); + if(tmpRelations!=null && tmpRelations.size()>0) + simpleRelations.addAll(tmpRelations); + else + { + tmpRelations = new ArrayList(); + //Copy relations (for 'and', 'as soon as'...) |eg, In which films did Julia_Roberts and Richard_Gere play? 
+ //TODO: judge by dependency tree | other way to supplement relations + if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals("and")) + { + for(SimpleRelation sr: simpleRelations) + { + if(sr.arg1Word == curSU.centerWord) + { + SimpleRelation tsr = new SimpleRelation(sr); + tsr.arg1Word = expandSU.centerWord; + tmpRelations.add(tsr); + } + else if (sr.arg2Word == curSU.centerWord) + { + SimpleRelation tsr = new SimpleRelation(sr); + tsr.arg2Word = expandSU.centerWord; + tmpRelations.add(tsr); + } + } + if(tmpRelations.size() > 0) + simpleRelations.addAll(tmpRelations); + } + } + } + } + + //get semantic relations + HashMap semanticRelations = er.groupSimpleRelationsByArgsAndMapPredicate(simpleRelations); + + if(Globals.evaluationMethod > 1) + { + //TODO: recognize unsteady edge by judging connectivity (now we just recognize all edges are unsteady when it has circle) + if(semanticRelations.size() >= semanticUnitList.size()) // has CIRCLE + for(SemanticRelation sr: semanticRelations.values()) + { + sr.isSteadyEdge = false; + } + } + + qlog.semanticRelations = semanticRelations; + } + + public void matchRelation(ArrayList semanticUnitList, QueryLogger qlog) + { + //Drop the nodes who cannot find relations (except [modifier] implicit relation) + for(int relKey: qlog.semanticRelations.keySet()) + { + boolean matched = false; + SemanticRelation sr = qlog.semanticRelations.get(relKey); + for(SemanticUnit curSU: semanticUnitList) + { + for(SemanticUnit expandSU: curSU.neighborUnitList) + { + //Deduplicate | method 1 only can generate DIRECTED edge + if(Globals.evaluationMethod > 1 && curSU.centerWord.position > expandSU.centerWord.position) + continue; + + int key = curSU.centerWord.getNnHead().hashCode() ^ expandSU.centerWord.getNnHead().hashCode(); + if(relKey == key) + { + matched = true; + matchedSemanticRelations.put(relKey, sr); + if(!qlog.semanticUnitList.contains(curSU)) + qlog.semanticUnitList.add(curSU); + if(!qlog.semanticUnitList.contains(expandSU)) + qlog.semanticUnitList.add(expandSU); + + curSU.RelationList.put(expandSU.centerWord, sr); + expandSU.RelationList.put(curSU.centerWord, sr); + } + } + } + if(!matched) + { + qlog.SQGlog += "sr not found: "+sr+"\n"; + } + } + if(qlog.semanticUnitList.size() == 0) + qlog.semanticUnitList = semanticUnitList; + + // Now we regard that ONLY modified word can have implicit relations, they will be supplemented later. + // TODO: Maybe some other reasons lead to relation extraction FAILED between two nodes. (eg, .. and .. | .. in ..) + } + + // Print original structure of query graph. Notice, the relations have not been decided. 
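+	// Illustrative sketch (hypothetical output): for "Who created Wikipedia?" this typically logs a line like "++++ Triple detect: ?who <p> Wikipedia", where <p> is only the top-ranked predicate mapping (predicateMappings.get(0)); the final predicate choice is made later during item mapping.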
+ public void recordOriginalTriples(ArrayList SUList, QueryLogger qlog) + { + SemanticUnit curSU = null; + SemanticUnit neighborSU = null; + SemanticRelation sr = null; + String subj = null; + String obj = null; + int rel = 0; + + for(int i=0;i 1 && curSU.centerWord.position > neighborSU.centerWord.position) + continue; + + obj = neighborSU.centerWord.getFullEntityName(); + sr = curSU.RelationList.get(neighborSU.centerWord); + rel = 0; + if(sr != null && sr.predicateMappings.size()>0) + { + PredicateMapping pm = sr.predicateMappings.get(0); + rel = pm.pid; + if(sr.preferredSubj != null) + { + if(sr.arg1Word == sr.preferredSubj) + { + subj = sr.arg1Word.getFullEntityName(); + obj = sr.arg2Word.getFullEntityName(); + if(sr.isArg1Constant == false) + subj = "?"+subj; + if(sr.isArg2Constant == false) + obj = "?"+obj; + } + else + { + subj = sr.arg2Word.getFullEntityName(); + obj = sr.arg1Word.getFullEntityName(); + if(sr.isArg2Constant == false) + subj = "?"+subj; + if(sr.isArg1Constant == false) + obj = "?"+obj; + } + } + } + + Triple next = new Triple(-1, subj,rel, -1, obj,null,0); + qlog.SQGlog += "++++ Triple detect: "+next+"\n"; + } + // current unit's TYPE + if(curSU.prefferdType != null) + { + String type = TypeFragment.typeId2ShortName.get(curSU.prefferdType); + Triple next = new Triple(-1, curSU.centerWord.getFullEntityName(),Globals.pd.typePredicateID,Triple.TYPE_ROLE_ID, type,null,0); + qlog.SQGlog += "++++ Triple detect: "+next+"\n"; + } + // current unit's describe + for(DependencyTreeNode describeNode: curSU.describeNodeList) + { + qlog.SQGlog += "++++ Describe detect: "+describeNode.dep_father2child+"\t"+describeNode.word+"\t"+curSU.centerWord+"\n"; + } + } + } + + public void dfs(DependencyTreeNode head, DependencyTreeNode cur, ArrayList ret) + { + if(cur == null) + return; + visited.add(cur); + + if(isNode(cur) && head!=cur) + { + ret.add(cur); + return; + } + + if(cur.father!=null && !visited.contains(cur.father)) + { + dfs(head,cur.father,ret); + } + for(DependencyTreeNode child: cur.childrenList) + { + if(!visited.contains(child)) + dfs(head,child,ret); + } + return; + } + + /* + * Judge nodes strictly. + * */ + public boolean isNode(DependencyTreeNode cur) + { + if(stopNodeList.contains(cur.word.baseForm)) + return false; + + if(cur.word.omitNode || cur.word.represent!=null) + return false; + + // Modifier can NOT be node (They may be added in query graph in the end) e.g., Queen Elizabeth II,Queen(modifier) + if(modifierList.contains(cur.word)) + return false; + + // NOUN + if(cur.word.posTag.startsWith("N")) + return true; + + // Wh-word + if(whList.contains(cur.word.baseForm)) + return true; + + if(cur.word.mayEnt || cur.word.mayType || cur.word.mayCategory) + return true; + return false; + } + + /* + * Judge nodes strictly. + * For EXP, do not use COREF resolution rules. 
+ * */ + public boolean isNodeWoCorefRe(DependencyTreeNode cur) + { + if(stopNodeList.contains(cur.word.baseForm)) + return false; + + if(cur.word.omitNode) + return false; + + // Modifier can NOT be node (They may be added in query graph in the end) e.g., Queen Elizabeth II,Queen(modifier) + if(modifierList.contains(cur.word)) + return false; + + // NOUN + if(cur.word.posTag.startsWith("N")) + return true; + + // Wh-word + if(whList.contains(cur.word.baseForm)) + return true; + + if(cur.word.mayEnt || cur.word.mayType || cur.word.mayCategory) + return true; + return false; + } + + public DependencyTreeNode detectTarget(DependencyTree ds, QueryLogger qlog) + { + visited.clear(); + DependencyTreeNode target = null; + Word[] words = qlog.s.words; + + for(DependencyTreeNode cur : ds.nodesList) + { + if(isWh(cur.word)) + { + target = cur; + break; + } + } + // No Wh-Word: use the first node; NOTICE: consider MODIFIER rules. E.g, was us president Obama ..., target=obama (rather us) + if(target == null) + { + for(Word word: words) + { + Word modifiedWord = word.modifiedWord; + if(modifiedWord != null && isNodeCandidate(modifiedWord)) + { + target = ds.getNodeByIndex(modifiedWord.position); + break; + } + } + + if(target == null) + target = ds.nodesList.get(0); + + /* Are [E|tree_frogs] a type of [E|amphibian] , type + */ + for(DependencyTreeNode dtn: target.childrenList) + { + if(dtn.word.baseForm.equals("type")) + { + dtn.word.represent = target.word; + } + } + + + } + //where, NOTICE: wh target from NN may not pass the function isNode() + if(target.word.baseForm.equals("where")) + { + int curPos = target.word.position - 1; + + //!Where is the residence of + if(words[curPos+1].baseForm.equals("be") && words[curPos+2].posTag.equals("DT")) + { + for(int i=curPos+4;i had the highest budget + boolean ok = false; + for(DependencyTreeNode dtn: target.childrenList) + { + if(dtn.word.posTag.startsWith("IN")) + { + for(DependencyTreeNode chld: dtn.childrenList) + if(isNode(chld)) + { + target.word.represent = chld.word; + target = chld; + ok = true; + break; + } + } + if(ok) + break; + } + } + + } + //what + else if(target.word.baseForm.equals("what")) + { + //Detect:what is [the] sth1 prep. sth2? + //Omit: what is sth? + if(target.father != null && ds.nodesList.size()>=5) + { + DependencyTreeNode tmp1 = target.father; + if(tmp1.word.baseForm.equals("be")) + { + for(DependencyTreeNode child: tmp1.childrenList) + { + if(child == target) + continue; + if(isNode(child)) + { + //sth1 + boolean hasPrep = false; + for(DependencyTreeNode grandson: child.childrenList) + { //prep + if(grandson.dep_father2child.equals("prep")) + hasPrep = true; + } + //Detect modifier: what is the sht1's [sth2]? | what is the largest [city]? + if(hasPrep || qlog.s.hasModifier(child.word)) + { + target.word.represent = child.word; + target = child; + break; + } + } + } + } + //what sth || What airlines are (part) of the SkyTeam alliance? + else if(isNode(tmp1)) + { + target.word.represent = tmp1.word; + target = tmp1; + // Coreference resolution + int curPos = target.word.position - 1; + if(curPos+3 6) + { + words[curPos+2].represent = target.word; + } + + } + } + // by sentence + if(target.word.baseForm.equals("what")) + { + int curPos = target.word.position - 1; + // what be the [node] ... ? 
(Notice: words.length CONTAINS symbol(?),different from nodeList) + if(words.length > 5 && words[curPos+1].baseForm.equals("be") && words[curPos+2].baseForm.equals("the") && isNodeCandidate(words[curPos+3])) + { + target.word.represent = words[curPos+3]; + target = ds.getNodeByIndex(words[curPos+3].position); + } + } + + } + //who + else if(target.word.baseForm.equals("who")) + { + //Detect:who is/does [the] sth1 prep. sth2? || Who was the pope that founded the Vatican_Television ? | Who does the voice of Bart Simpson? + //Others: who is sth? who do sth? | target = who + //test case: Who is the daughter of Robert_Kennedy married to? + if(ds.nodesList.size()>=5) + { //who + for(DependencyTreeNode tmp1: ds.nodesList) + { + if(tmp1 != target.father && !target.childrenList.contains(tmp1)) + continue; + if(tmp1.word.baseForm.equals("be") || tmp1.word.baseForm.equals("do")) + { //is + for(DependencyTreeNode child: tmp1.childrenList) + { + if(child == target) + continue; + if(isNode(child)) + { //sth1 + boolean hasPrep = false; + for(DependencyTreeNode grandson: child.childrenList) + { //prep + if(grandson.dep_father2child.equals("prep")) + hasPrep = true; + } + //Detect modifier: who is the sht1's sth2? +// if(hasPrep || qlog.s.plainText.contains(child.word.originalForm + " 's")) // replaced by detect modifier directly + if(hasPrep || qlog.s.hasModifier(child.word)) + { + target.word.represent = child.word; + target = child; + break; + } + } + } + } + } + } + // by sentence + if(target.word.baseForm.equals("who")) + { + int curPos = target.word.position - 1; + // who is usually coreference when it not the first word. + if(curPos - 1 >= 0 && isNodeCandidate(words[curPos-1])) + { + target.word.represent = words[curPos-1]; + target = ds.getNodeByIndex(words[curPos-1].position); + } + } + } + //how + else if(target.word.baseForm.equals("how")) + { + //Detect:how many sth ... |eg: how many popular Chinese director are there + int curPos = target.word.position-1; + if(curPos+2 < words.length && words[curPos+1].baseForm.equals("many")) + { + Word modifiedWord = words[curPos+2].modifiedWord; + if(isNodeCandidate(modifiedWord)) + { + target.word.represent = modifiedWord; + target = ds.getNodeByIndex(modifiedWord.position); + } + } + //Detect: how big is [det] (ent)'s (var), how = var + else if(curPos+6 < words.length && words[curPos+1].baseForm.equals("big")) + { + if(words[curPos+2].baseForm.equals("be") && words[curPos+3].baseForm.equals("the") && words[curPos+4].mayEnt && words[curPos+5].baseForm.equals("'s")) + { + Word modifiedWord = words[curPos+6].modifiedWord; + if(isNodeCandidate(modifiedWord)) + { + target.word.represent = modifiedWord; + target = ds.getNodeByIndex(modifiedWord.position); + } + } + } + //Detect:how much ... 
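+		// Note: two cases are handled below: "How much carbs does peanut_butter have?" re-targets to the noun right after "much" (via its modified word), while "How much did Pulp_Fiction cost?" has no such noun, so the target falls back to the wh-word's father in the dependency tree.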
+ else if(curPos+2 < words.length && words[curPos+1].baseForm.equals("much")) + { + Word modifiedWord = words[curPos+2].modifiedWord; + // How much carbs does peanut_butter have + if(isNodeCandidate(modifiedWord)) + { + target.word.represent = modifiedWord; + target = ds.getNodeByIndex(modifiedWord.position); + } + // How much did Pulp_Fiction cost | dependency tree + else + { + if(target.father!=null && isNodeCandidate(target.father.word)) + { + target.word.represent = target.father.word; + target = target.father; + } + } + } + } + return target; + } + + /* + * There are two cases of [ent]+[type]:1、Chinese company 2、De_Beer company; + * For 1, chinese -> company,for 2, De_Beer <- company + * Return: True : ent -> type | False : type <- ent + * */ + public boolean checkModifyBetweenEntType(Word entWord, Word typeWord) + { + int eId = entWord.emList.get(0).entityID; + int tId = typeWord.tmList.get(0).typeID; + EntityFragment ef = EntityFragment.getEntityFragmentByEntityId(eId); + + if(ef == null || !ef.types.contains(tId)) + return true; + + return false; + } + + /* + * Modify:in correct dependency tree, word1(ent/type)--mod-->word2 + * eg, Chinese teacher --> Chinese (modify) teacher; the Chinese teacher Wang Wei --> Chinese & teacher (modify) Wang Wei + * Find a word modify which word (modify itself default) + * Trough sentence rather than dependency tree as the latter often incorrect + * Generally a sequencial nodes always modify the last node, an exception is test case 3. So we apply recursive search method. + * test case: + * 1) the highest Chinese mountain + * 2) the Chinese popular director + * 3) the De_Beers company (company[type]-> De_Beers[ent]) + * */ + public Word getTheModifiedWordBySentence(Sentence s, Word curWord) + { + if(curWord == null) + return null; + if(curWord.modifiedWord != null) + return curWord.modifiedWord; + // return null if it is not NODE or adjective + if(!isNodeCandidate(curWord) && !curWord.posTag.startsWith("JJ") && !curWord.posTag.startsWith("R")) + return curWord.modifiedWord = null; + + curWord.modifiedWord = curWord; //default, modify itself + Word preWord = null, nextWord = null; + int curPos = curWord.position - 1; //word's position from 1, so need -1 + if(curPos-1 >= 0) preWord = s.words[curPos-1]; + if(curPos+1 < s.words.length) nextWord = s.words[curPos+1]; + Word nextModifiedWord = getTheModifiedWordBySentence(s, nextWord); + + //External rule: ent+noun(no type|ent), then ent is not modifier and noun is not node + //eg:Does the [Isar] [flow] into a lake? 
| Who was on the [Apollo 11] [mission] | When was the [De Beers] [company] founded + if(curWord.mayEnt && nextWord != null && !nextWord.mayEnt && !nextWord.mayType && !nextWord.mayLiteral) + { + nextWord.omitNode = true; + if(nextModifiedWord == nextWord) + return curWord.modifiedWord = curWord; + } + + //modify LEFT: ent + type(cur) : De_Beer company + if(preWord != null && curWord.mayType && preWord.mayEnt) //ent + type(cur) + { + if(!checkModifyBetweenEntType(preWord, curWord)) //De_Beer <- company; note: even if another node still follows the type, we ignore it here + return curWord.modifiedWord = preWord; + } + + //modify itself: ent(cur) + type : De_Beer company + if(nextModifiedWord != null && curWord.mayEnt && nextModifiedWord.mayType) + { + if(!checkModifyBetweenEntType(curWord, nextModifiedWord)) + return curWord.modifiedWord = curWord; + } + + //generally, modify RIGHT + if(nextModifiedWord != null) + return curWord.modifiedWord = nextModifiedWord; + + //modify itself + return curWord.modifiedWord; + } + + /* + * recognize modifier/modified relation in DISCRETE nodes + * 1) [ent1] 's [ent2] + * 2) [ent1|type] by [ent2] + * Notice: run "getTheModifiedWordBySentence" first! + * */ + public Word getDiscreteModifiedWordBySentence(Sentence s, Word curWord) + { + int curPos = curWord.position - 1; + + //[ent1](cur) 's [ent2], ent1->ent2, usually does NOT appear in SPARQL | eg: Show me all books in Asimov 's Foundation_series + if(curPos+2 < s.words.length && curWord.mayEnt && s.words[curPos+1].baseForm.equals("'s") && s.words[curPos+2].mayEnt) + return curWord.modifiedWord = s.words[curPos+2]; + + //[ent1] by [ent2](cur), ent2->ent1, usually does NOT appear in SPARQL | eg: Which museum exhibits The Scream by Munch? + if(curPos-2 >=0 && (curWord.mayEnt||curWord.mayType) && s.words[curPos-1].baseForm.equals("by") && (s.words[curPos-2].mayEnt||s.words[curPos-2].mayType)) + return curWord.modifiedWord = s.words[curPos-2]; + + return curWord.modifiedWord; + } + + /* + * Judge nodes loosely (relaxed check). 
+	 * */
+	public boolean isNodeCandidate(Word word)
+	{
+		if(word == null || stopNodeList.contains(word.baseForm))
+			return false;
+
+		if(word.posTag.startsWith("N"))
+			return true;
+		if(word.mayEnt || word.mayType || word.mayLiteral || word.mayCategory)
+			return true;
+
+		return false;
+	}
+
+	public boolean isWh(Word w)
+	{
+		String tmp = w.baseForm;
+		if(whList.contains(tmp))
+			return true;
+		return false;
+	}
+}
diff --git a/src/qa/parsing/QuestionParsing.java b/src/qa/parsing/QuestionParsing.java
new file mode 100644
index 0000000..ddc0cfd
--- /dev/null
+++ b/src/qa/parsing/QuestionParsing.java
@@ -0,0 +1,208 @@
+package qa.parsing;
+
+import org.maltparser.core.exception.MaltChainedException;
+
+import log.QueryLogger;
+import nlp.ds.DependencyTree;
+import nlp.ds.DependencyTreeNode;
+import nlp.ds.Word;
+import nlp.ds.Sentence.SentenceType;
+import qa.Globals;
+import rdf.Sparql;
+import rdf.Triple;
+
+public class QuestionParsing {
+	public void process(QueryLogger qlog) {
+		getDependenciesAndNER(qlog);
+		recognizeSentenceType(qlog);
+	}
+
+	public void getDependenciesAndNER (QueryLogger qlog) {
+		long t1 = System.currentTimeMillis();
+		try {
+			qlog.s.dependencyTreeStanford = new DependencyTree(qlog.s, Globals.stanfordParser);
+		}catch(Exception e){
+			e.printStackTrace();
+		}
+
+		long t2 = System.currentTimeMillis();
+		try{
+			qlog.s.dependencyTreeMalt = new DependencyTree(qlog.s, Globals.maltParser);
+		}catch(Exception e){
+			// if errors occur, abandon the MALT tree
+			qlog.s.dependencyTreeMalt = qlog.s.dependencyTreeStanford;
+			System.err.println("MALT parser error! Using the Stanford parser instead.");
+		}
+
+		try {
+			long t3 = System.currentTimeMillis();
+			Globals.nerRecognizer.recognize(qlog.s);
+			long t4 = System.currentTimeMillis();
+			System.out.println("====StanfordDependencies("+(t2-t1)+"ms)====");
+			System.out.println(qlog.s.dependencyTreeStanford);
+			System.out.println("====MaltDependencies("+(t3-t2)+"ms)====");
+			System.out.println(qlog.s.dependencyTreeMalt);
+			System.out.println("====NameEntityRecognition("+(t4-t3)+"ms)====");
+			qlog.s.printNERResult();
+
+			qlog.timeTable.put("StanfordParser", (int)(t2-t1));
+			qlog.timeTable.put("MaltParser", (int)(t3-t2));
+			qlog.timeTable.put("NER", (int)(t4-t3));
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+	}
+
+	public void recognizeSentenceType(QueryLogger qlog)
+	{
+		boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford)||
+				recognizeImperativeSentence(qlog.s.dependencyTreeMalt);
+		if (IsImperativeSentence)
+		{
+			qlog.s.sentenceType = SentenceType.ImperativeSentence;
+			// the ignored words of the two dependency trees should agree
+			for (DependencyTreeNode sNode : qlog.s.dependencyTreeStanford.nodesList)
+				for (DependencyTreeNode mNode : qlog.s.dependencyTreeMalt.nodesList)
+					if (sNode.equals(mNode) && (sNode.word.isIgnored||mNode.word.isIgnored))
+						sNode.word.isIgnored = mNode.word.isIgnored = true;
+			return;
+		}
+
+		boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford)||
+				recognizeSpecialQuestion(qlog.s.dependencyTreeMalt);
+		if (IsSpecialQuestion)
+		{
+			qlog.s.sentenceType = SentenceType.SpecialQuestion;
+			return;
+		}
+
+		boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford)||
+				recognizeGeneralQuestion(qlog.s.dependencyTreeMalt);
+		if (IsGeneralQuestion)
+		{
+			qlog.s.sentenceType = SentenceType.GeneralQuestion;
+			return;
+		}
+
+		// default is special question
+		qlog.s.sentenceType = SentenceType.SpecialQuestion;
+	}
+
+	// If the sentence is imperative, omit the polite words
+	private boolean recognizeImperativeSentence(DependencyTree tree) {
+		if(tree.getRoot().word.posTag.startsWith("V") || tree.getRoot().word.posTag.startsWith("NN")) {
+			DependencyTreeNode dobj = null;
+			DependencyTreeNode iobj = null;
+			for (DependencyTreeNode n : tree.getRoot().childrenList) {
+				if (n.dep_father2child.equals("dobj")) {
+					dobj = n;
+				}
+				else if (n.dep_father2child.equals("iobj")) {
+					iobj = n;
+				}
+			}
+			if (dobj != null && iobj != null) {
+				tree.getRoot().word.isIgnored = true;
+				iobj.word.isIgnored = true;
+
+				// give me a list of ...
+				if (dobj.word.baseForm.equals("list"))
+				{
+					dobj.word.isIgnored = true;
+				}
+
+				return true;
+			}
+
+			// starts with "List": List all games by GMT.
+			if (dobj != null && tree.getRoot().word.baseForm.equals("list"))
+			{
+				//System.out.println("isListSentence!");
+				tree.getRoot().word.isIgnored = true;
+
+				return true;
+			}
+		}
+		return false;
+	}
+
+	private boolean recognizeSpecialQuestion(DependencyTree tree)
+	{
+		DependencyTreeNode firstNode = null;
+		for (DependencyTreeNode dtn : tree.nodesList)
+			if (dtn.word.position == 1)
+			{
+				firstNode = dtn;
+				break;
+			}
+		// e.g., In which city...
+		if (firstNode!=null &&
+			(firstNode.word.posTag.equals("IN")||firstNode.word.posTag.equals("TO"))&&
+			firstNode.dep_father2child.startsWith("prep"))
+		{
+			firstNode = null;
+			for (DependencyTreeNode dtn : tree.nodesList)
+				if (dtn.word.position == 2)
+				{
+					firstNode = dtn;
+					break;
+				}
+		}
+
+		if (firstNode != null)
+		{
+			if (firstNode.word.posTag.startsWith("W"))
+				return true;
+		}
+		return false;
+	}
+
+	private boolean recognizeGeneralQuestion(DependencyTree tree)
+	{
+		DependencyTreeNode firstNode = null;
+		for (DependencyTreeNode dtn : tree.nodesList)
+			if (dtn.word.position == 1)
+			{
+				firstNode = dtn;
+				break;
+			}
+
+		if (firstNode != null)
+		{
+			String dep = firstNode.dep_father2child;
+			String pos = firstNode.word.posTag;
+			String baseform = firstNode.word.baseForm;
+
+			if ((baseform.equals("be")||baseform.equals("do")) &&
+				pos.startsWith("VB") &&
+				(dep.equals("root")||dep.equals("cop")||dep.startsWith("aux")))
+				return true;
+		}
+		return false;
+	}
+
+	public static String detectQuestionFocus(Sparql spq) {
+		String ret = null;
+		int posi = Integer.MAX_VALUE;
+		for (Triple t : spq.tripleList) {
+
+			if (!t.isSubjConstant()) {
+				Word subj = t.getSubjectWord();
+				if (subj!=null && subj.position < posi) {
+					posi = subj.position;
+					ret = t.subject;
+				}
+			}
+			if (!t.isObjConstant()) {
+				Word obj = t.getObjectWord();
+				if (obj!=null && obj.position < posi) {
+					posi = obj.position;
+					ret = t.object;
+				}
+			}
+		}
+		if (ret != null) return ret.replace(' ', '_');
+		else return null;
+	}
+}
diff --git a/src/rdf/EntityMapping.java b/src/rdf/EntityMapping.java
new file mode 100644
index 0000000..db01ff9
--- /dev/null
+++ b/src/rdf/EntityMapping.java
@@ -0,0 +1,40 @@
+package rdf;
+
+import fgmt.EntityFragment;
+
+public class EntityMapping implements Comparable<EntityMapping> {
+	public int entityID = -1;
+	public String entityName = null;
+	public double score = 0;
+
+	public EntityFragment entityFragment = null;
+
+	public EntityMapping(int eid, String en, double sco) {
+		entityID = eid;
+		entityName = en;
+		score = sco;
+
+		// penalize entities whose name starts with "?"
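+		// e.g., a hypothetical EntityMapping(42, "?city", 0.8) ends up with score 0.4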
+		if (entityName.startsWith("?"))
+			score *= 0.5;
+	}
+
+	// In descending order: big --> small
+	public int compareTo(EntityMapping o) {
+		double diff = this.score - o.score;
+		if (diff > 0) return -1;
+		else if (diff < 0) return 1;
+		else return 0;
+	}
+
+	public int hashCode()
+	{
+		return new Integer(entityID).hashCode();
+	}
+
+	public String toString()
+	{
+		StringBuilder res = new StringBuilder(entityName+"("+score+")");
+		return res.toString();
+	}
+}
\ No newline at end of file
diff --git a/src/rdf/ImplicitRelation.java b/src/rdf/ImplicitRelation.java
new file mode 100644
index 0000000..451d632
--- /dev/null
+++ b/src/rdf/ImplicitRelation.java
@@ -0,0 +1,77 @@
+package rdf;
+
+import fgmt.TypeFragment;
+import qa.Globals;
+import lcn.EntityFragmentFields;
+
+public class ImplicitRelation {
+
+	public String subj = null;
+	public String obj = null;
+
+	public int pId = -1;
+	public double score = 0;
+
+	// Roles: entity, type constant, type variable, variable
+	public enum roleEnum {ENTITY, TYPE_CONSTANT, TYPE_VARIABLE, VARIABLE};
+	public int subjRole = -1;
+	public int objRole = -1;
+	public int subjId = -1;
+	public int objId = -1;
+
+	public ImplicitRelation(String s, String o, int pid, double sc)
+	{
+		pId = pid;
+		subj = s;
+		obj = o;
+		score = sc;
+		subjId = EntityFragmentFields.entityName2Id.get(s);
+		if(pId != Globals.pd.typePredicateID)
+			objId = EntityFragmentFields.entityName2Id.get(o);
+		else
+			objId = TypeFragment.typeShortName2IdList.get(o).get(0);
+	}
+
+	public ImplicitRelation(Integer sId, Integer oId, int pid, double sc)
+	{
+		pId = pid;
+		subjId = sId;
+		objId = oId;
+		score = sc;
+	}
+
+	public void setSubjectId(Integer s)
+	{
+		subjId = s;
+	}
+
+	public void setObjectId(Integer o)
+	{
+		objId = o;
+	}
+
+	public void setSubject(String s)
+	{
+		subj = s;
+	}
+
+	public void setObject(String o)
+	{
+		obj = o;
+	}
+
+	public int hashCode()
+	{
+		return new Integer(pId).hashCode() ^ new Integer(subjId).hashCode() ^ new Integer(objId).hashCode();
+	}
+
+	@Override
+	public boolean equals(Object ir)
+	{
+		ImplicitRelation tmpIr = (ImplicitRelation) ir;
+		if (pId == tmpIr.pId && subjId == tmpIr.subjId && objId == tmpIr.objId)
+			return true;
+		else return false;
+	}
+
+}
diff --git a/src/rdf/MergedWord.java b/src/rdf/MergedWord.java
new file mode 100644
index 0000000..e011088
--- /dev/null
+++ b/src/rdf/MergedWord.java
@@ -0,0 +1,41 @@
+package rdf;
+
+import java.util.ArrayList;
+
+import rdf.EntityMapping;
+import rdf.TypeMapping;
+
+public class MergedWord implements Comparable<MergedWord>
+{
+	// original position
+	public int st,ed;
+	// position after merging (-1 if unselected)
+	public int mergedPos = -1;
+	public String name;
+	public boolean mayCategory = false;
+	public boolean mayLiteral = false;
+	public boolean mayEnt = false;
+	public boolean mayType = false;
+	public ArrayList<EntityMapping> emList = null;
+	public ArrayList<TypeMapping> tmList = null;
+	public String category = null;
+
+	public MergedWord(int s,int e,String n)
+	{
+		st = s;
+		ed = e;
+		name = n;
+	}
+
+	@Override
+	// longer spans come first
+	public int compareTo(MergedWord o)
+	{
+		int lenDiff = (this.ed-this.st) - (o.ed-o.st);
+
+		if (lenDiff > 0) return -1;
+		else if (lenDiff < 0) return 1;
+		return 0;
+	}
+
+}
diff --git a/src/rdf/NodeSelectedWithScore.java b/src/rdf/NodeSelectedWithScore.java
new file mode 100644
index 0000000..70a79e5
--- /dev/null
+++ b/src/rdf/NodeSelectedWithScore.java
@@ -0,0 +1,24 @@
+package rdf;
+
+import java.util.ArrayList;
+
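+// Pairs one candidate node-selection scheme with its score, so that competing schemes can be ranked.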
+public class NodeSelectedWithScore implements Comparable<NodeSelectedWithScore>
+{
+	public ArrayList<Integer> selected;
+	int size;
+	// split key into st and ed
+	public double score = 0;
+
+	public NodeSelectedWithScore(ArrayList<Integer> a, double b)
+	{
+		selected = a;
+		score = b;
+	}
+
+	// In descending order: big --> small
+	public int compareTo(NodeSelectedWithScore o) {
+		double diff = this.score - o.score;
+		if (diff > 0) return -1;
+		else if (diff < 0) return 1;
+		else return 0;
+	}
+}
\ No newline at end of file
diff --git a/src/rdf/PredicateMapping.java b/src/rdf/PredicateMapping.java
new file mode 100644
index 0000000..5a5a15e
--- /dev/null
+++ b/src/rdf/PredicateMapping.java
@@ -0,0 +1,28 @@
+package rdf;
+
+public class PredicateMapping implements Comparable<PredicateMapping> {
+	public int pid = -1;
+	public double score = 0;
+	public String parapharase = null;
+
+	public PredicateMapping (int pid, double sco, String para) {
+		this.pid = pid;
+		score = sco;
+		parapharase = para;
+	}
+
+	// In descending order: big --> small
+	public int compareTo(PredicateMapping o) {
+		double diff = this.score - o.score;
+		if (diff > 0) return -1;
+		else if (diff < 0) return 1;
+		else return 0;
+	}
+
+	@Override
+	public String toString() {
+		String ret = "";
+		ret = "<"+pid+" : "+parapharase+" : "+score+">";
+		return ret;
+	}
+}
diff --git a/src/rdf/SemanticQueryGraph.java b/src/rdf/SemanticQueryGraph.java
new file mode 100644
index 0000000..de95df1
--- /dev/null
+++ b/src/rdf/SemanticQueryGraph.java
@@ -0,0 +1,180 @@
+package rdf;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import qa.Globals;
+import nlp.ds.Word;
+
+public class SemanticQueryGraph implements Comparable<SemanticQueryGraph>
+{
+	public ArrayList<SemanticUnit> semanticUnitList = null;
+	public HashMap<Integer, SemanticRelation> semanticRelations = new HashMap<>();
+	public double score = 0;
+
+	public SemanticQueryGraph(ArrayList<SemanticUnit> suList)
+	{
+		semanticUnitList = suList; //TODO: need copy?
+		// Calculate score by a reward function (TODO: use SVM-Rank)
+	}
+
+	public SemanticQueryGraph(SemanticQueryGraph head)
+	{
+		semanticUnitList = new ArrayList<>();
+		for(SemanticUnit su: head.semanticUnitList)
+			semanticUnitList.add(su.copy());
+		score = head.score;
+	}
+
+	public void connect(SemanticUnit u, SemanticUnit v)
+	{
+		if(u.equals(v))
+			return;
+
+		SemanticUnit su1 = null, su2 = null;
+		for(SemanticUnit su: this.semanticUnitList)
+			if(su.equals(u))
+				su1 = su;
+			else if(su.equals(v))
+				su2 = su;
+		if(su1 != null && su2 != null)
+			if(!su1.neighborUnitList.contains(su2) && !su2.neighborUnitList.contains(su1))
+			{
+				su1.neighborUnitList.add(su2);
+				su2.neighborUnitList.add(su1);
+			}
+	}
+
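+	// Merge unit u into unit v: every edge that pointed at u is re-pointed at v, then u is dropped.
+	// e.g., (hypothetical) merging the unit for "movie" into the unit for "film" leaves one node carrying both sets of edges.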
+	public void merge(SemanticUnit u, SemanticUnit v)
+	{
+		SemanticUnit su1 = null, su2 = null;
+		for(SemanticUnit su: this.semanticUnitList)
+			if(su.equals(u))
+				su1 = su;
+			else if(su.equals(v))
+				su2 = su;
+		if(su1 != null && su2 != null)
+		{
+			for(SemanticUnit su: this.semanticUnitList)
+				if(su != su2 && su.neighborUnitList.contains(su1) && !su.neighborUnitList.contains(su2)) //TODO: Notice, multi-edges are REJECTED for now; the hash function of SR should be modified to allow multi-edges.
+					su.neighborUnitList.add(su2);
+
+			this.semanticUnitList.remove(su1);
+			su2.neighborUnitList.remove(su1);
+		}
+	}
+
+	@Override
+	public int hashCode() {
+		int code = 0;
+		for(SemanticUnit su: this.semanticUnitList)
+			code ^= su.hashCode();
+		return code;
+	}
+
+	@Override
+	public boolean equals(Object o)
+	{
+		if (o instanceof SemanticQueryGraph)
+		{
+			int matchCnt = 0;
+			for(SemanticUnit su1: ((SemanticQueryGraph) o).semanticUnitList)
+				for(SemanticUnit su2: this.semanticUnitList)
+				{
+					if(su1.equals(su2))
+					{
+						if(su1.neighborUnitList.containsAll(su2.neighborUnitList) && su2.neighborUnitList.containsAll(su1.neighborUnitList))
+							matchCnt++;
+					}
+				}
+			if(matchCnt == ((SemanticQueryGraph) o).semanticUnitList.size() && matchCnt == this.semanticUnitList.size())
+				return true;
+		}
+		return false;
+	}
+
+	@Override
+	public int compareTo(SemanticQueryGraph o)
+	{
+		double diff = this.score - o.score;
+		if (diff > 0) return -1;
+		else if (diff < 0) return 1;
+		else return 0;
+	}
+
+	public boolean isFinalState()
+	{
+		if(semanticUnitList == null || semanticUnitList.isEmpty())
+			return false;
+
+		// Basic assumption: a final Semantic Query Graph should be connected.
+		HashSet<SemanticUnit> visited = new HashSet<>();
+		SemanticUnit start = semanticUnitList.get(0);
+		visited.add(start);
+		dfs(start, visited);
+
+		if(visited.size() == semanticUnitList.size())
+			return true;
+		return false;
+	}
+
+	private void dfs(SemanticUnit headNode, HashSet<SemanticUnit> visited)
+	{
+		for(SemanticUnit curNode: headNode.neighborUnitList)
+			if(!visited.contains(curNode))
+			{
+				visited.add(curNode);
+				dfs(curNode, visited);
+			}
+
+		for(SemanticUnit curNode: semanticUnitList)
+		{
+			if(curNode.neighborUnitList.contains(headNode) || headNode.neighborUnitList.contains(curNode))
+			{
+				if(!visited.contains(curNode))
+				{
+					visited.add(curNode);
+					dfs(curNode, visited);
+				}
+			}
+		}
+	}
+
+	public void calculateScore(HashMap<Integer, SemanticRelation> potentialSemanticRelations)
+	{
+		// 1. entity/type score
+		double entSco = 0;
+		for(SemanticUnit su: this.semanticUnitList)
+		{
+			Word w = su.centerWord;
+			if(w.mayEnt && w.emList.size()>0)
+				entSco += w.emList.get(0).score * 100;
+			if(w.mayType && w.tmList.size()>0)
+				entSco += w.tmList.get(0).score;
+		}
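+		// e.g., (hypothetical) a top entity mapping scored 0.9 contributes 90,
+		// while a top type mapping scored 0.9 contributes only 0.9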
+		// 2. relation score
+		double relSco = 0;
+		int relCnt = 0;
+		for(SemanticUnit su1: this.semanticUnitList)
+			for(SemanticUnit su2: su1.neighborUnitList)
+			{
+				// deduplicate
+				if(su1.centerWord.position > su2.centerWord.position)
+					continue;
+
+				relCnt++;
+				int key = su1.centerWord.getNnHead().hashCode() ^ su2.centerWord.getNnHead().hashCode();
+				SemanticRelation sr = potentialSemanticRelations.get(key);
+				if(sr == null)
+					System.err.println("No semantic relation for: " + su1 + " & " + su2);
+				else
+				{
+					relSco += sr.predicateMappings.get(0).score;
+					semanticRelations.put(key, sr);
+				}
+			}
+		if(relCnt > 0)
+			relSco /= relCnt; // average; guard against division by zero when the graph has no edges
+		this.score = entSco + relSco;
+	}
+}
diff --git a/src/rdf/SemanticRelation.java b/src/rdf/SemanticRelation.java
new file mode 100644
index 0000000..8625853
--- /dev/null
+++ b/src/rdf/SemanticRelation.java
@@ -0,0 +1,171 @@
+package rdf;
+
+import java.util.ArrayList;
+
+import rdf.SimpleRelation;
+
+import nlp.ds.Word;
+
+public class SemanticRelation {
+	public Word arg1Word = null;
+	public Word arg2Word = null;
+	public String relationParaphrase = null; // longest match
+	public double LongestMatchingScore = 0; // longest match score
+
+	// used to tell copies apart when a semantic relation is duplicated from a special pattern
+	public int arg1SuffixId = 0;
+	public int arg2SuffixId = 0;
+
+	public Word arg1Word_beforeCRR = null;
+	public Word arg2Word_beforeCRR = null;
+
+	public ArrayList<PredicateMapping> predicateMappings = null;
+
+	public boolean isArg1Constant = false;
+	public boolean isArg2Constant = false;
+
+	public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules
+
+	public SemanticRelation dependOnSemanticRelation = null;
+	public Word preferredSubj = null;
+
+	public boolean isSteadyEdge = true;
+
+	public SemanticRelation(SemanticRelation r2) {
+		arg1Word = r2.arg1Word;
+		arg2Word = r2.arg2Word;
+		relationParaphrase = r2.relationParaphrase;
+		LongestMatchingScore = r2.LongestMatchingScore;
+
+		arg1SuffixId = r2.arg1SuffixId;
+		arg2SuffixId = r2.arg2SuffixId;
+
+		arg1Word_beforeCRR = r2.arg1Word_beforeCRR;
+		arg2Word_beforeCRR = r2.arg2Word_beforeCRR;
+
+		arg1Word.emList = r2.arg1Word.emList;
+		arg2Word.emList = r2.arg2Word.emList;
+		predicateMappings = r2.predicateMappings;
+
+//		arg1Types = r2.arg1Types;
+//		arg2Types = r2.arg2Types;
+
+		isArg1Constant = r2.isArg1Constant;
+		isArg2Constant = r2.isArg2Constant;
+
+		extractingMethod = r2.extractingMethod;
+
+		dependOnSemanticRelation = r2.dependOnSemanticRelation;
+		preferredSubj = r2.preferredSubj;
+	}
+
+	public void swapArg1Arg2()
+	{
+		Word tmpWord = arg1Word;
+		arg1Word = arg2Word;
+		arg2Word = tmpWord;
+		int tmpSuffixId = arg1SuffixId;
+		arg1SuffixId = arg2SuffixId;
+		arg2SuffixId = tmpSuffixId;
+		tmpWord = arg1Word_beforeCRR;
+		arg1Word_beforeCRR = arg2Word_beforeCRR;
+		arg2Word_beforeCRR = tmpWord;
+		boolean tmpBool = isArg1Constant;
+		isArg1Constant = isArg2Constant;
+		isArg2Constant = tmpBool;
+	}
+
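+	// Build a SemanticRelation from a SimpleRelation, canonicalizing argument order:
+	// with no preferred subject, the word that compares smaller becomes arg1; otherwise the preferred subject does.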
+	public SemanticRelation (SimpleRelation simr) {
+		if (simr.preferredSubj == null) {
+			if (simr.arg1Word.compareTo(simr.arg2Word) < 0) {
+				this.arg1Word = simr.arg1Word;
+				this.arg2Word = simr.arg2Word;
+				this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR;
+				this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR;
+			}
+			else {
+				this.arg1Word = simr.arg2Word;
+				this.arg2Word = simr.arg1Word;
+				this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR;
+				this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR;
+			}
+			this.extractingMethod = simr.extractingMethod;
+		}
+		else {
+			if (simr.arg1Word == simr.preferredSubj) {
+				this.arg1Word = simr.arg1Word;
+				this.arg2Word = simr.arg2Word;
+				this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR;
+				this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR;
+				this.preferredSubj = simr.preferredSubj;
+			}
+			else {
+				this.arg1Word = simr.arg2Word;
+				this.arg2Word = simr.arg1Word;
+				this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR;
+				this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR;
+				this.preferredSubj = simr.preferredSubj;
+			}
+			this.extractingMethod = simr.extractingMethod;
+		}
+	}
+
+	@Override
+	public int hashCode() {
+		return arg1Word.hashCode() ^ arg2Word.hashCode() + arg1SuffixId + arg2SuffixId;
+	}
+
+	@Override
+	public boolean equals(Object o) {
+		if (o instanceof SemanticRelation) {
+			SemanticRelation sr2 = (SemanticRelation) o;
+			if (this.arg1Word.equals(sr2.arg1Word)
+				&& this.arg2Word.equals(sr2.arg2Word)
+				&& this.arg1SuffixId == sr2.arg1SuffixId
+				&& this.arg2SuffixId == sr2.arg2SuffixId
+				&& this.relationParaphrase.equals(sr2.relationParaphrase)
+				&& this.LongestMatchingScore == sr2.LongestMatchingScore) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	@Override
+	public String toString() {
+		return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]";
+//		return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]";
+	}
+
+	public void normalizeScore()
+	{
+		double maxScore;
+
+		if (arg1Word.emList!=null && !arg1Word.emList.isEmpty())
+		{
+			maxScore=0.0;
+			for (EntityMapping em : arg1Word.emList)
+				maxScore = Math.max(maxScore, em.score);
+			for (EntityMapping em : arg1Word.emList)
+				em.score = em.score/maxScore;
+		}
+
+		if (arg2Word.emList!=null && !arg2Word.emList.isEmpty())
+		{
+			maxScore=0.0;
+			for (EntityMapping em : arg2Word.emList)
+				maxScore = Math.max(maxScore, em.score);
+			for (EntityMapping em : arg2Word.emList)
+				em.score = em.score/maxScore;
+		}
+
+		if (predicateMappings!=null && !predicateMappings.isEmpty())
+		{
+			maxScore=0.0;
+			for (PredicateMapping pm : predicateMappings)
+				maxScore = Math.max(maxScore, pm.score);
+			for (PredicateMapping pm : predicateMappings)
+				pm.score = pm.score/maxScore;
+		}
+	}
+}
diff --git a/src/rdf/SemanticUnit.java b/src/rdf/SemanticUnit.java
new file mode 100644
index 0000000..668690b
--- /dev/null
+++ b/src/rdf/SemanticUnit.java
@@ -0,0 +1,61 @@
+package rdf;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import rdf.SemanticRelation;
+import nlp.ds.DependencyTreeNode;
+import nlp.ds.Word;
+
+public class SemanticUnit
+{
+	public Word centerWord = null;
+	public ArrayList<DependencyTreeNode> describeNodeList = new ArrayList<DependencyTreeNode>();
+	public ArrayList<SemanticUnit> neighborUnitList = new ArrayList<SemanticUnit>();
+	public HashMap<Integer, SemanticRelation> RelationList = new HashMap<Integer, SemanticRelation>();
+
+	public boolean isSubj = true;
+	public Integer prefferdType = null;
+
+	public SemanticUnit(Word center, boolean isSubJ)
+	{
+		centerWord = center;
+		isSubj = isSubJ;
+	}
+
+	public SemanticUnit copy()
+	{
+		SemanticUnit su = new SemanticUnit(this.centerWord, this.isSubj);
+		su.describeNodeList = (ArrayList<DependencyTreeNode>) this.describeNodeList.clone();
+		su.neighborUnitList = (ArrayList<SemanticUnit>) this.neighborUnitList.clone();
+		su.RelationList = (HashMap<Integer, SemanticRelation>) this.RelationList.clone();
+		return su;
+	}
+
+	@Override
+	public int hashCode() {
+		return centerWord.hashCode();
+	}
+
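+	// Units are identified by their center word alone; neighbors and relations do not affect equality.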
+	@Override
+	public boolean equals(Object o) {
+		if (o instanceof SemanticUnit) {
+			SemanticUnit su2 = (SemanticUnit) o;
+			if(this.centerWord.equals(su2.centerWord))
+				return true;
+		}
+		return false;
+	}
+
+	@Override
+	public String toString()
+	{
+		String ret = "<" + centerWord + ", {";
+		for(SemanticUnit su: neighborUnitList)
+			ret += su.centerWord + ", ";
+		ret += "}>";
+
+		return ret;
+	}
+
+}
diff --git a/src/rdf/SimpleRelation.java b/src/rdf/SimpleRelation.java
new file mode 100644
index 0000000..a3b5334
--- /dev/null
+++ b/src/rdf/SimpleRelation.java
@@ -0,0 +1,88 @@
+package rdf;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import paradict.PredicateIDAndSupport;
+import qa.Globals;
+
+import nlp.ds.DependencyTree;
+import nlp.ds.DependencyTreeNode;
+import nlp.ds.Word;
+
+// repetition is allowed
+public class SimpleRelation {
+	public Word arg1Word = null;
+	public Word arg2Word = null;
+	public String relationParaphrase = null;
+	public double matchingScore = 0;
+
+	public Word arg1Word_beforeCRR = null;
+	public Word arg2Word_beforeCRR = null;
+
+	public HashMap<Integer, Double> pasList = new HashMap<Integer, Double>();
+
+	public Word preferredSubj = null;
+
+	public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules
+
+	public SimpleRelation()
+	{
+
+	}
+
+	public SimpleRelation(SimpleRelation sr)
+	{
+		arg1Word = sr.arg1Word;
+		arg2Word = sr.arg2Word;
+		relationParaphrase = sr.relationParaphrase;
+		matchingScore = sr.matchingScore;
+		arg1Word_beforeCRR = sr.arg1Word_beforeCRR;
+		arg2Word_beforeCRR = sr.arg2Word_beforeCRR;
+		pasList = sr.pasList;
+		preferredSubj = sr.preferredSubj;
+		extractingMethod = 'R';
+	}
+
+	@Override
+	public String toString() {
+		return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]";
+		//return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]";
+	}
+
+	public int getHashCode() {
+		return arg1Word.hashCode() ^ arg2Word.hashCode();
+	}
+
+	public void setPasList (String pattern, double matchingScore, boolean[] matchedFlag) {
+		ArrayList<PredicateIDAndSupport> list = Globals.pd.nlPattern_2_predicateList.get(pattern);
+		for (PredicateIDAndSupport pidsup : list) {
+			double sumSelectivity = 0;
+			for (int i = 0; i < matchedFlag.length; i ++) {
+				if (matchedFlag[i]) {
+					sumSelectivity += pidsup.wordSelectivity[i];
+				}
+			}
+			sumSelectivity = matchingScore*sumSelectivity*pidsup.support;
+			int pid = pidsup.predicateID;
+			if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; // boost predicates in the dbo namespace (pid may not be in dbo)
+
+			if (!pasList.containsKey(pid))
+				pasList.put(pid, sumSelectivity);
+			else if (sumSelectivity > pasList.get(pid))
+				pasList.put(pid, sumSelectivity);
+		}
+	}
+
+	public void setPreferedSubjObjOrder(DependencyTree tree) {
+		DependencyTreeNode n1 = tree.getNodeByIndex(this.arg1Word.position).getNNTopTreeNode(tree);
+		DependencyTreeNode n2 = tree.getNodeByIndex(this.arg2Word.position).getNNTopTreeNode(tree);
+		if (n1.father != null && n1.father.word.baseForm.equals("of") && n1.dep_father2child.equals("pobj")) {
+			this.preferredSubj = this.arg1Word;
+		}
+		else if (n2.father != null && n2.father.word.baseForm.equals("of") && n2.dep_father2child.equals("pobj")) {
+			this.preferredSubj = this.arg2Word;
+		}
+	}
+
+}
diff --git a/src/rdf/Sparql.java b/src/rdf/Sparql.java
new file mode 100644
index 0000000..a139e8e
--- /dev/null
+++ b/src/rdf/Sparql.java
@@ -0,0 +1,305 @@
+package rdf;
+
+import java.util.ArrayList;
+import java.util.Collections;
+//import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import log.QueryLogger;
+import nlp.ds.Sentence;
+import nlp.ds.Sentence.SentenceType;
+import qa.Globals;
+
+public class Sparql implements Comparable<Sparql>
+{
+	public ArrayList<Triple> tripleList = new ArrayList<Triple>();
+	public boolean countTarget = false;
+	public String mostStr = null;
+	public String moreThanStr = null;
+	public double score = 0;
+
+	public String questionFocus = null; // the answer variable
+	public HashSet<String> variables = new HashSet<String>();
+
+	public enum QueryType {Select,Ask}
+	public QueryType queryType = QueryType.Select;
+
+	public HashMap<Integer, SemanticRelation> semanticRelations = null;
+
+	public void addTriple(Triple t)
+	{
+		if(!tripleList.contains(t))
+		{
+			tripleList.add(t);
+			score += t.score;
+		}
+	}
+
+	public void delTriple(Triple t)
+	{
+		if(tripleList.contains(t))
+		{
+			tripleList.remove(t);
+			score -= t.score;
+		}
+	}
+
+	@Override
+	public String toString()
+	{
+		String ret = "";
+		for (Triple t : tripleList) {
+			ret += t.toString();
+			ret += '\n';
+		}
+		return ret;
+	}
+
+	public void deduplicate()
+	{
+		HashSet<String> set = new HashSet<String>();
+		ArrayList<Triple> list = new ArrayList<Triple>();
+		for(Triple t: tripleList)
+		{
+			String st = t.toStringWithoutScore();
+			if(set.contains(st))
+				list.add(t);
+			set.add(st);
+		}
+		for(Triple t: list)
+			this.delTriple(t);
+	}
+
+	// Is it a Basic Graph Pattern without filters and aggregations?
+	public boolean isBGP()
+	{
+		if(moreThanStr != null || mostStr != null || countTarget)
+			return false;
+		return true;
+	}
+
+	// Used for display only (cannot be executed)
+	public String toStringForGStore()
+	{
+		String ret = "";
+		for (Triple t : tripleList)
+		{
+			// !omit obvious LITERALs
+			if(t.object.equals("literal_HRZ"))
+				continue;
+
+			// !omit some bad TYPEs
+			if(t.predicateID==Globals.pd.typePredicateID && Globals.pd.bannedTypes.contains(t.object))
+				continue;
+
+			ret += t.toStringForGStore();
+			ret += '\n';
+		}
+		return ret;
+	}
+
+	/**
+	 * @description:
+	 * 1. Select all variables for BGP queries, to display specific information.
+	 * 2. Do NOT select all variables under aggregations like "HAVING" or "COUNT"
+	 *    (they may involve too many results; e.g., for "Which countries have more than 1000 caves?" the caves need not be displayed).
+	 * @param: NULL.
+	 * @return: a SPARQL query executable by GStore (NO prefixes for entities/predicates).
+	 */
+	public String toStringForGStore2()
+	{
+		String ret = "";
+		variables.clear();
+		for(Triple t: tripleList)
+		{
+			if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_"));
+			if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_"));
+		}
+		if(variables.size() == 0)
+			queryType = QueryType.Ask;
+
+		// part1: select / ask ...
+		if (queryType==QueryType.Ask)
+			ret += "ask";
+		else if(countTarget)
+			ret += ("select COUNT(DISTINCT " + questionFocus + ")");
+		else
+		{
+			if(!isBGP()) // AGG: select the question focus only
+				ret += ("select DISTINCT " + questionFocus);
+			else // BGP: select all variables
+			{
+				ret += "select DISTINCT ";
+				for (String v : variables)
+					ret += v + " ";
+			}
+		}
+
+		// part2: triples
+		ret += " where\n{\n";
+		for(Triple t : tripleList)
+		{
+			if (!t.object.equals("literal_HRZ")) { // literals need not be displayed
+				ret += t.toStringForGStore();
+				ret += " .\n";
+			}
+		}
+		ret += "}\n";
+
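+		// Illustrative shape of the result for a BGP query (hypothetical names),
+		// assuming Globals.MaxAnswerNum == 100:
+		//   select DISTINCT ?film  where
+		//   {
+		//   <Quentin_Tarantino>	<director>	?film .
+		//   }
+		//   LIMIT 100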
+		// part3: order by / group by ...
+		if(moreThanStr != null)
+			ret += moreThanStr+"\n";
+		if(mostStr != null)
+			ret += mostStr+"\n";
+
+		// part4: limit
+		if(queryType != QueryType.Ask && (mostStr == null || !mostStr.contains("LIMIT")))
+			ret += "LIMIT " + Globals.MaxAnswerNum;
+
+		return ret;
+	}
+
+	// Used for execution (select all variables; format 'aggregation' and 'ask')
+	public String toStringForVirtuoso()
+	{
+		String ret = "";
+		HashSet<String> variables = new HashSet<String>();
+
+		// prefix
+		if (queryType==QueryType.Ask)
+			ret += "ask where";
+		else if(countTarget)
+			ret += ("select COUNT(DISTINCT " + questionFocus + ") where");
+		else
+		{
+			// AGG: select the question focus only
+			if(moreThanStr != null || mostStr != null)
+				ret += ("select DISTINCT " + questionFocus + " where");
+			// BGP: select all variables
+			else
+			{
+				for (Triple t: tripleList)
+				{
+					if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_"));
+					if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_"));
+				}
+
+				ret += "select ";
+				for (String v : variables)
+					ret += v + " ";
+				ret += "where";
+			}
+		}
+		ret += "\n{\n";
+		if(variables.size() == 0)
+			variables.add(questionFocus);
+
+		// triples
+		for (Triple t : tripleList)
+		{
+			if (!t.object.equals("literal_HRZ")) {
+				ret += t.toStringForGStore();
+				ret += " .\n";
+			}
+		}
+		ret += "}\n";
+
+		// suffix
+		if(moreThanStr != null)
+		{
+			ret += moreThanStr+"\n";
+		}
+		if(mostStr != null)
+		{
+			ret += mostStr+"\n";
+		}
+
+		return ret;
+	}
+
+	public int getVariableNumber()
+	{
+		int res = 0;
+		for (Triple t: tripleList)
+		{
+			if (!t.isSubjConstant()) res++;
+			if (!t.isObjConstant()) res++;
+		}
+		return res;
+	}
+
+	public void adjustTriplesOrder()
+	{
+		Collections.sort(this.tripleList);
+	}
+
+	public int compareTo(Sparql o)
+	{
+		double diff = this.score - o.score;
+		if (diff > 0)
+			return -1;
+		else if (diff < 0)
+			return 1;
+		else
+			return 0;
+	}
+
+	@Override
+	public int hashCode()
+	{
+		int key = 0;
+		for(Triple t: this.tripleList)
+			key ^= t.hashCode();
+		return key;
+	}
+
+	@Override
+	public boolean equals(Object spq)
+	{
+		Sparql tempSparql = (Sparql) spq;
+		String s1 = this.toStringForGStore2(), s2 = tempSparql.toStringForGStore2();
+		return s1.equals(s2);
+	}
+
+	public Sparql(){}
+	public Sparql(HashMap<Integer, SemanticRelation> semanticRelations)
+	{
+		this.semanticRelations = semanticRelations;
+	}
+
+	public Sparql copy()
+	{
+		Sparql spq = new Sparql(this.semanticRelations);
+		for (Triple t : this.tripleList)
+			spq.addTriple(t);
+		return spq;
+	}
+
+	public void removeLastTriple()
+	{
+		int idx = tripleList.size()-1;
+		score -= tripleList.get(idx).score;
+		tripleList.remove(idx);
+	}
+
+	public Sparql removeAllTypeInfo ()
+	{
+		score = 0;
+		ArrayList<Triple> newTripleList = new ArrayList<Triple>();
+		for (Triple t : tripleList)
+		{
+			if (t.predicateID != Globals.pd.typePredicateID)
+			{
+				newTripleList.add(t);
+				score += t.score;
+			}
+		}
+		tripleList = newTripleList;
+		return this;
+	}
+
+};
diff --git a/src/rdf/Triple.java b/src/rdf/Triple.java
new file mode 100644
index 0000000..e89b7d6
--- /dev/null
+++ b/src/rdf/Triple.java
@@ -0,0 +1,257 @@
+package rdf;
+
+import nlp.ds.Word;
+import qa.Globals;
+
+public class Triple implements Comparable<Triple>{
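+	// One (subject, predicate, object) pattern of the query graph,
+	// e.g., (hypothetical) <Pulp_Fiction>  <director>  ?who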
+	public String subject = null; // subject/object after disambiguation
+	public String object = null;
+
+	static public int TYPE_ROLE_ID = -5;
+	static public int VAR_ROLE_ID = -2;
+	static public int CAT_ROLE_ID = -8; // category
+	static public String VAR_NAME = "?xxx";
+
+	// subjId/objId: entity id | TYPE_ROLE_ID | VAR_ROLE_ID
+	public int subjId = -1;
+	public int objId = -1;
+	public int predicateID = -1;
+	public Word subjWord = null; // only used when semRltn == null
+	public Word objWord = null;
+
+	public SemanticRelation semRltn = null;
+	public double score = 0;
+	public boolean isSubjObjOrderSameWithSemRltn = true;
+	public boolean isSubjObjOrderPrefered = false;
+
+	public Word typeSubjectWord = null; // for "type" triples only
+
+	public Triple (Triple t) {
+		subject = t.subject;
+		object = t.object;
+		subjId = t.subjId;
+		objId = t.objId;
+		predicateID = t.predicateID;
+
+		semRltn = t.semRltn;
+		score = t.score;
+		isSubjObjOrderSameWithSemRltn = t.isSubjObjOrderSameWithSemRltn;
+		isSubjObjOrderPrefered = t.isSubjObjOrderPrefered;
+	}
+
+	// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); built from one word (type variable | embedded info)
+	public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco) {
+		subjId = sId;
+		objId = oId;
+		subject = s;
+		predicateID = p;
+		object = o;
+		semRltn = sr;
+		score = sco;
+	}
+
+	// A triple translated from a semantic relation (subject/object order can be changed later)
+	public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, boolean isSwap) {
+		subjId = sId;
+		objId = oId;
+		subject = s;
+		predicateID = p;
+		object = o;
+		semRltn = sr;
+		score = sco;
+		isSubjObjOrderSameWithSemRltn = isSwap;
+	}
+
+	// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); built from two words (implicit relations of modifiers)
+	public Triple(int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, Word subj, Word obj) {
+		subjId = sId;
+		objId = oId;
+		subject = s;
+		predicateID = p;
+		object = o;
+		semRltn = sr;
+		score = sco;
+		subjWord = subj;
+		objWord = obj;
+	}
+
+	public Triple copy() {
+		Triple t = new Triple(this);
+		return t;
+	}
+
+	public Triple copySwap() {
+		Triple t = new Triple(this);
+		String temp;
+		int tmpId;
+
+		tmpId = t.subjId;
+		t.subjId = t.objId;
+		t.objId = tmpId;
+
+		temp = t.subject;
+		t.subject = t.object;
+		t.object = temp;
+
+		t.isSubjObjOrderSameWithSemRltn = !this.isSubjObjOrderSameWithSemRltn;
+		t.isSubjObjOrderPrefered = !this.isSubjObjOrderPrefered;
+
+		return t;
+	}
+
+	public void addScore(double s) {
+		score += s;
+	}
+
+	public double getScore() {
+		return score;
+	}
+
+	@Override
+	public int hashCode()
+	{
+		return new Integer(subjId).hashCode() ^ new Integer(objId).hashCode() ^ new Integer(predicateID).hashCode();
+	}
+
+	@Override
+	public String toString() {
+		return subjId+":<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> "+objId+":<" + object + ">" + " : " + score;
+	}
+
+	public String toStringForGStore() {
+		StringBuilder sb = new StringBuilder("");
+
+		String _subject = subject;
+		if(_subject.startsWith("?"))
+			sb.append(_subject+"\t");
+		else
+			sb.append("<" + _subject + ">\t");
+
+		sb.append("<" + Globals.pd.getPredicateById(predicateID) + ">\t");
+
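+		// variables are emitted bare (e.g., ?uri); constants are wrapped in angle brackets,
+		// and spaces become underscores in the final string (hypothetical: <Berlin_Wall>)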
+		String _object;
+		if(predicateID == Globals.pd.typePredicateID && object.contains("|"))
+			_object = object.substring(0, object.indexOf('|'));
+		else
+			_object = object;
+		if(_object.startsWith("?"))
+			sb.append(_object);
+		else
+			sb.append("<" + _object + ">");
+
+		return sb.toString().replace(' ', '_');
+	}
+
+	public String toStringWithoutScore() {
+		return "<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> <" + object + ">";
+	}
+
+	public Word getSubjectWord () {
+		if (predicateID == Globals.pd.typePredicateID) {
+			return typeSubjectWord;
+		}
+		else if(semRltn == null)
+		{
+			return subjWord;
+		}
+		else {
+			if (isSubjObjOrderSameWithSemRltn) return semRltn.arg1Word;
+			else return semRltn.arg2Word;
+		}
+	}
+
+	public Word getObjectWord () {
+		if (predicateID == Globals.pd.typePredicateID) {
+			return typeSubjectWord;
+		}
+		else if(semRltn == null)
+		{
+			return objWord;
+		}
+		else {
+			if (isSubjObjOrderSameWithSemRltn) return semRltn.arg2Word;
+			else return semRltn.arg1Word;
+		}
+	}
+
+	public boolean isSubjConstant () {
+		if (predicateID == Globals.pd.typePredicateID) {
+			return !subject.startsWith("?");
+		}
+		else {
+			// triple from an obvious semantic relation
+			if(semRltn != null)
+			{
+				if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg1Constant;
+				else return semRltn.isArg2Constant;
+			}
+			// final triple from an implicit relation (no semantic relation)
+			else
+			{
+				if(subjId != Triple.VAR_ROLE_ID && subjId != Triple.TYPE_ROLE_ID)
+					return true;
+				else
+					return false;
+			}
+		}
+	}
+
+	public boolean isObjConstant () {
+		if (predicateID == Globals.pd.typePredicateID) {
+			return !object.startsWith("?");
+		}
+		else {
+			if(semRltn != null)
+			{
+				if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg2Constant;
+				else return semRltn.isArg1Constant;
+			}
+			else
+			{
+				if(objId != Triple.VAR_ROLE_ID && objId != Triple.TYPE_ROLE_ID)
+					return true;
+				else
+					return false;
+			}
+		}
+	}
+
+	public int compareTo(Triple o)
+	{
+		// order: Type, Ent&Ent, Ent&Var, Var&Var
+		if(this.predicateID == Globals.pd.typePredicateID)
+		{
+			if(o.predicateID == Globals.pd.typePredicateID)
+				return 0;
+			else
+				return -1;
+		}
+		int cnt1 = 0, cnt2 = 0;
+		if(!this.subject.startsWith("?"))
+			cnt1++;
+		if(!this.object.startsWith("?"))
+			cnt1++;
+		if(!o.subject.startsWith("?"))
+			cnt2++;
+		if(!o.object.startsWith("?"))
+			cnt2++;
+
+		if(cnt1 == cnt2)
+			return 0;
+		else if(cnt1 > cnt2)
+			return -1;
+		else
+			return 1;
+	}
+
+	public void swapSubjObjOrder() {
+		String temp = subject;
+		int tmpId = subjId;
+		subject = object;
+		subjId = objId;
+		object = temp;
+		objId = tmpId;
+		isSubjObjOrderSameWithSemRltn = !isSubjObjOrderSameWithSemRltn;
+	}
+};
\ No newline at end of file
diff --git a/src/rdf/TypeMapping.java b/src/rdf/TypeMapping.java
new file mode 100644
index 0000000..5c1c390
--- /dev/null
+++ b/src/rdf/TypeMapping.java
@@ -0,0 +1,53 @@
+package rdf;
+
+import qa.Globals;
+
+public class TypeMapping implements Comparable<TypeMapping>
+{
+	public Integer typeID = null;
+	public String typeName = null;
+	public double score = 0;
+
+	/*
+	 * 1) For a standard type (a DBO type in DBpedia), relation = typePredicateID (rdf:type).
+	 * 2) For a nonstandard type, typeID = -1.
+	 * 3) If the type is added into triples, a relation is needed | e.g., Which professional surfers were born in Australia?
+	 *    (?uri dbo:occupation res:Surfing), relation = dbo:occupation.
+	 * 4) If no type triple needs to be added, relation = -1 | e.g., Who was the father of [Queen] Elizabeth II.
+	 * */
+	public int prefferdRelation = Globals.pd.typePredicateID;
+
+	public TypeMapping(Integer tid, String type, double sco)
+	{
+		typeID = tid;
+		typeName = type;
+		score = sco;
+	}
+
+	public TypeMapping(Integer tid, String type, Integer relation, double sco)
+	{
+		typeID = tid;
+		typeName = type.replace("_", "");
+		score = sco;
+		prefferdRelation = relation;
+	}
+
+	// In descending order: big --> small
+	public int compareTo(TypeMapping o)
+	{
+		double diff = this.score - o.score;
+		if (diff > 0) return -1;
+		else if (diff < 0) return 1;
+		else return 0;
+	}
+
+	public int hashCode()
+	{
+		return typeID.hashCode();
+	}
+
+	public String toString()
+	{
+		StringBuilder res = new StringBuilder(typeName+"("+score+")");
+		return res.toString();
+	}
+}
\ No newline at end of file
diff --git a/src/utils/FileUtil.java b/src/utils/FileUtil.java
new file mode 100644
index 0000000..74d7e12
--- /dev/null
+++ b/src/utils/FileUtil.java
@@ -0,0 +1,91 @@
+package utils;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+public class FileUtil {
+	public static List<String> readFile(String filePath){
+		List<String> lines = new ArrayList<String>();
+		try {
+			BufferedReader br = new BufferedReader(new FileReader(filePath));
+			String line = null;
+			while( (line = br.readLine()) != null ){
+				lines.add(line);
+			}
+			br.close();
+		}catch(Exception e){
+			e.printStackTrace();
+		}finally {
+			return lines;
+		}
+	}
+
+	public static Set<String> readFileAsSet(String filePath){
+		Set<String> lines = new HashSet<String>();
+		try {
+			BufferedReader br = new BufferedReader(new FileReader(filePath));
+			String line = null;
+			while( (line = br.readLine()) != null ){
+				lines.add(line);
+			}
+			br.close();
+		}catch(Exception e){
+			e.printStackTrace();
+		}finally {
+			return lines;
+		}
+	}
+
+	public static List<String> readFile(InputStream is){
+		List<String> lines = new ArrayList<String>();
+		try {
+			BufferedReader br = new BufferedReader(new InputStreamReader(is));
+			String line = null;
+			while( (line = br.readLine()) != null ){
+				lines.add(line);
+			}
+			br.close();
+		}catch(Exception e){
+			e.printStackTrace();
+		}finally {
+			return lines;
+		}
+	}
+
+	public static String readFileAsALine(InputStream is){
+		List<String> lines = readFile(is);
+		StringBuffer buffer = new StringBuffer();
+		for(String line : lines){
+			buffer.append(line);
+		}
+		return buffer.toString();
+	}
+
+	public static void writeFile(List<String> lines, String filePath){
+		try{
+			BufferedWriter bw = new BufferedWriter(new FileWriter(filePath));
+			for(String line : lines){
+				bw.write(line+"\n");
+			}
+			bw.close();
+		}catch(Exception e){
+			e.printStackTrace();
+		}
+	}
+
+	public static void writeFile(List<String> lines, String filePath, boolean ifContinueWrite){
+		try{
+			BufferedWriter bw = new BufferedWriter(new FileWriter(filePath, ifContinueWrite));
+			for(String line : lines){
+				bw.write(line+"\n");
+			}
+			bw.close();
+		}catch(Exception e){
+			e.printStackTrace();
+		}
+	}
+}
diff --git a/src/utils/HttpRequest.java b/src/utils/HttpRequest.java
new file mode 100644
index 0000000..454e300
--- /dev/null
+++ b/src/utils/HttpRequest.java
@@ -0,0 +1,114 @@
+package utils;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.List;
+import java.util.Map;
+
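+// Minimal helpers for HTTP GET/POST over java.net.URLConnection.
+// Usage sketch (hypothetical endpoint): String rsp = HttpRequest.sendGet("http://localhost:9999/gserver", "query=...");
+// Both helpers read the whole response body into a single String.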
+public class HttpRequest
+{
+	public static String sendGet(String url, String param) {
+		String result = "";
+		BufferedReader in = null;
+		try {
+			String urlNameString = url + "?" + param;
+			URL realUrl = new URL(urlNameString);
+
+			URLConnection connection = realUrl.openConnection();
+			connection.setRequestProperty("accept", "*/*");
+			connection.setRequestProperty("connection", "Keep-Alive");
+			connection.setRequestProperty("user-agent",
+					"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
+
+			connection.connect();
+			Map<String, List<String>> map = connection.getHeaderFields();
+			for (String key : map.keySet()) {
+				System.out.println(key + "--->" + map.get(key));
+			}
+			in = new BufferedReader(new InputStreamReader(
+					connection.getInputStream()));
+			String line;
+			while ((line = in.readLine()) != null) {
+				result += line;
+			}
+		} catch (Exception e) {
+			System.out.println("Error when sending GET request: " + e);
+			e.printStackTrace();
+		}
+		finally {
+			try {
+				if (in != null) {
+					in.close();
+				}
+			} catch (Exception e2) {
+				e2.printStackTrace();
+			}
+		}
+		return result;
+	}
+
+	public static String sendPost(String url, String param) {
+		PrintWriter out = null;
+		BufferedReader in = null;
+		String result = "";
+		try {
+			URL realUrl = new URL(url);
+			URLConnection conn = realUrl.openConnection();
+			conn.setRequestProperty("accept", "*/*");
+			conn.setRequestProperty("connection", "Keep-Alive");
+			conn.setRequestProperty("user-agent",
+					"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
+			conn.setDoOutput(true);
+			conn.setDoInput(true);
+			out = new PrintWriter(conn.getOutputStream());
+			out.print(param);
+			out.flush();
+			in = new BufferedReader(
+					new InputStreamReader(conn.getInputStream()));
+			String line;
+			while ((line = in.readLine()) != null) {
+				result += line;
+			}
+		} catch (Exception e) {
+			System.out.println("Error when sending POST request: "+e);
+			e.printStackTrace();
+		}
+		finally{
+			try{
+				if(out!=null){
+					out.close();
+				}
+				if(in!=null){
+					in.close();
+				}
+			}
+			catch(IOException ex){
+				ex.printStackTrace();
+			}
+		}
+		return result;
+	}
+
+
+	public static String getPostData(InputStream in, int size, String charset) {
+		if (in != null && size > 0) {
+			byte[] buf = new byte[size];
+			try {
+				in.read(buf); // note: may read fewer than size bytes
+				if (charset == null || charset.length() == 0)
+					return new String(buf);
+				else {
+					return new String(buf, charset);
+				}
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+		}
+		return null;
+	}
+}
\ No newline at end of file