@@ -0,0 +1,238 @@
package addition;

import java.util.ArrayList;
import java.util.HashMap;

import paradict.PredicateIDAndSupport;
import log.QueryLogger;
//import nlp.ds.DependencyTree;
//import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;
import nlp.ds.Sentence.SentenceType;
import qa.Globals;
//import qa.extract.TypeRecognition;
//import qa.mapping.SemanticItemMapping;
//import rdf.EntityMapping;
import rdf.SemanticUnit;
import rdf.Sparql;
import rdf.Sparql.QueryType;
import rdf.Triple;
//import fgmt.TypeFragment;
public class AddtionalFix
{
	public HashMap<String, String> pattern2category = new HashMap<String, String>();

	public AddtionalFix()
	{
		// Some category mappings for DBpedia (keys are base forms); try automatic linking methods later.
		pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters");
		pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World");
		pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus");
		pattern2category.put("13_british_colony", "Thirteen_Colonies");
	}
	public void process(QueryLogger qlog)
	{
		fixCategory(qlog);
		oneTriple(qlog);
		oneNode(qlog);

		// aggregation
		AggregationRecognition ar = new AggregationRecognition();
		ar.recognize(qlog);

		// query type
		decideQueryType(qlog);
	}

	public void decideQueryType(QueryLogger qlog)
	{
		for(Sparql spq: qlog.rankedSparqls)
			if(qlog.s.sentenceType == SentenceType.GeneralQuestion)
				spq.queryType = QueryType.Ask;
	}
	public void fixCategory(QueryLogger qlog)
	{
		if(qlog == null || qlog.semanticUnitList == null)
			return;

		String var = null, category = null;
		for(SemanticUnit su: qlog.semanticUnitList)
		{
			if(su.centerWord.mayCategory)
			{
				var = "?"+su.centerWord.originalForm;
				category = su.centerWord.category;
			}
		}

		if(category != null && var != null)
			for(Sparql spq: qlog.rankedSparqls)
			{
				boolean occurred = false;
				for(Triple tri: spq.tripleList)
				{
					if(tri.subject.equals(var))
					{
						occurred = true;
						break;
					}
				}

				String oName = category;
				String pName = "subject";
				int pid = Globals.pd.predicate_2_id.get(pName);
				Triple triple = new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100);
				spq.addTriple(triple);
			}
	}
	/* Recognize one-node queries.
	 * Two cases: (1) special question / imperative sentence; (2) general question.
	 * 1-1: how many [], highest [] ... | single variable: add a constraint (aggregation)
	 * 1-2: What is backgammon? | What is a bipolar syndrome? | search an entity (return itself or its type/description ...)
	 * 1-3: Give me all Seven Wonders of the Ancient World. | Note: "Seven Wonders of the Ancient World" should have been recognized as ENT earlier (in fact it is a CATEGORY in DBpedia).
	 * 2-1: Are there any [castles_in_the_United_States] (yago:type)
	 * 2-2: Was Sigmund Freud married? | lacks a variable node.
	 * 2-3: Are penguins endangered? | no suitable relation match; a transition is needed.
	 */
	public void oneNode(QueryLogger qlog)
	{
		if(qlog == null || qlog.semanticUnitList == null || qlog.semanticUnitList.size() > 1)
			return;

		Word target = qlog.target;
		Word[] words = qlog.s.words;

		if(qlog.s.sentenceType != SentenceType.GeneralQuestion)
		{
			// 1-1: how many [type] are there | List all [type]
			if(target.mayType && target.tmList != null)
			{
				String subName = "?"+target.originalForm;
				String typeName = target.tmList.get(0).typeName;
				Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100);
				Sparql sparql = new Sparql();
				sparql.addTriple(triple);
				qlog.rankedSparqls.add(sparql);
			}
			// 1-2: What is [ent]?
			else if(target.mayEnt && target.emList != null)
			{
				if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be"))
				{
					int eid = target.emList.get(0).entityID;
					String subName = target.emList.get(0).entityName;
					Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+target.originalForm, null, target.emList.get(0).score);
					Sparql sparql = new Sparql();
					sparql.addTriple(triple);
					qlog.rankedSparqls.add(sparql);
				}
			}
			// 1-3: Give me all Seven Wonders of the Ancient World.
			else if(target.mayCategory && target.category != null)
			{
				String oName = target.category;
				String pName = "subject";
				int pid = Globals.pd.predicate_2_id.get(pName);
				Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100);
				Sparql sparql = new Sparql();
				sparql.addTriple(triple);
				qlog.rankedSparqls.add(sparql);
			}
		}
		else
		{
			if(target.mayEnt && target.emList != null)
			{
				// 2-2: Was Sigmund Freud married?
				String relMention = "";
				for(Word word: words)
					if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?"))
						relMention += word.baseForm+" ";
				if(relMention.length() > 1)
					relMention = relMention.substring(0, relMention.length()-1);

				ArrayList<PredicateIDAndSupport> pmList = null;
				if(Globals.pd.nlPattern_2_predicateList.containsKey(relMention))
					pmList = Globals.pd.nlPattern_2_predicateList.get(relMention);

				if(pmList != null && pmList.size() > 0)
				{
					int pid = pmList.get(0).predicateID;
					int eid = target.emList.get(0).entityID;
					String subName = target.emList.get(0).entityName;
					Triple triple = new Triple(eid, subName, pid, Triple.VAR_ROLE_ID, "?x", null, 100);
					Sparql sparql = new Sparql();
					sparql.addTriple(triple);
					qlog.rankedSparqls.add(sparql);
				}
				// 2-3: Are penguins endangered?
				else
				{
					if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm))
					{
						String oName = pattern2category.get(words[target.position].baseForm);
						String pName = "subject";
						int pid = Globals.pd.predicate_2_id.get(pName);
						int eid = target.emList.get(0).entityID;
						String subName = target.emList.get(0).entityName;
						Triple triple = new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100);
						Sparql sparql = new Sparql();
						sparql.addTriple(triple);
						qlog.rankedSparqls.add(sparql);
					}
				}
			}
			// 2-1: Are there any [castles_in_the_United_States] (yago:type)
			else if(target.mayType && target.tmList != null)
			{
				String typeName = target.tmList.get(0).typeName;
				String subName = "?" + target.originalForm;
				//System.out.println("typeName="+typeName+" subName="+subName);
				Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100);
				Sparql sparql = new Sparql();
				sparql.addTriple(triple);
				qlog.rankedSparqls.add(sparql);
			}
		}
	}
	/*
	 * One triple recognized but no suitable relation.
	 */
	public void oneTriple(QueryLogger qlog)
	{
		if(qlog == null || qlog.semanticUnitList == null)
			return;

		if(qlog.s.sentenceType == SentenceType.SpecialQuestion)
		{
			Word[] words = qlog.s.words;
			if(qlog.semanticUnitList.size() == 2)
			{
				Word entWord = null, whWord = null;
				for(int i=0;i<qlog.semanticUnitList.size();i++)
				{
					if(qlog.semanticUnitList.get(i).centerWord.baseForm.startsWith("wh"))
						whWord = qlog.semanticUnitList.get(i).centerWord;
					if(qlog.semanticUnitList.get(i).centerWord.mayEnt)
						entWord = qlog.semanticUnitList.get(i).centerWord;
				}

				// 1-1: (what) is [ent] | we guess users may want the type of ent.
				if(entWord != null && whWord != null && words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be"))
				{
					int eid = entWord.emList.get(0).entityID;
					String subName = entWord.emList.get(0).entityName;
					Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+whWord.originalForm, null, entWord.emList.get(0).score);
					Sparql sparql = new Sparql();
					sparql.addTriple(triple);
					qlog.rankedSparqls.add(sparql);
				}
			}
		}
	}
}
@@ -0,0 +1,155 @@
package addition;

import nlp.ds.DependencyTree;
import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;
import qa.Globals;
import rdf.SemanticRelation;
import rdf.Sparql;
import rdf.Triple;
import log.QueryLogger;

public class AggregationRecognition {
	// Number words
	static String x[] = {"zero","one","two","three","four","five","six","seven","eight","nine"};
	static String y[] = {"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen"};
	static String z[] = {"twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"};

	public static Integer translateNumbers(String str) // 1~100
	{
		str = str.toLowerCase(); // normalize case so inputs like "Twelve" still match
		try {
			return Integer.valueOf(str);
		}
		catch (Exception e) {
			// not a digit string; fall through and match number words
		}
		int i, j;
		for(i=0;i<8;i++) // 20~99
		{
			for(j=0;j<10;j++)
			{
				String str1 = z[i], str2 = x[j];
				if(str.equals(str1)){
					return i*10+20; // tens word alone, e.g. "forty"
				}
				else if(str.equals(str1+" "+str2)){
					return i*10+j+20;
				}
			}
		}
		for(i=0;i<10;i++){ // 0~19
			if(str.equals(x[i])){
				return i;
			}
			else if(str.equals(y[i])){
				return 10+i;
			}
		}
		System.out.println("Warning: Can not Translate Number: " + str);
		return 1;
	}
	public void recognize(QueryLogger qlog)
	{
		DependencyTree ds = qlog.s.dependencyTreeStanford;
		if(qlog.isMaltParserUsed)
			ds = qlog.s.dependencyTreeMalt;
		Word[] words = qlog.s.words;

		// how often | how many
		String plain = qlog.s.plainText.toLowerCase();
		if(plain.contains("how many") || plain.contains("how often"))
		{
			for(Sparql sp: qlog.rankedSparqls)
			{
				sp.countTarget = true;
				// How many pages does War and Peace have? --> res:War_and_Peace dbo:numberOfPages ?n .
				// ?uri dbo:populationTotal ?inhabitants .
				for(Triple triple: sp.tripleList)
				{
					String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase();
					if(p.contains("number") || p.contains("total") || p.contains("calories") || p.contains("satellites"))
					{
						sp.countTarget = false;
					}
				}
			}
		}
		// more than [num] [node]
		for(DependencyTreeNode dtn: ds.nodesList)
		{
			if(dtn.word.baseForm.equals("more"))
			{
				if(dtn.father != null && dtn.father.word.baseForm.equals("than"))
				{
					DependencyTreeNode tmp = dtn.father;
					if(tmp.father != null && tmp.father.word.posTag.equals("CD") && tmp.father.father != null && tmp.father.father.word.posTag.startsWith("N"))
					{
						DependencyTreeNode target = tmp.father.father;
						int num = translateNumbers(tmp.father.word.baseForm);
						// Which caves have more than 3 entrances | entranceCount | filter
						for(Sparql sp: qlog.rankedSparqls)
						{
							if(target.father != null && target.father.word.baseForm.equals("have"))
							{
								sp.moreThanStr = "GROUP BY ?" + qlog.target.originalForm + "\nHAVING (COUNT(?" + target.word.originalForm + ") > " + num + ")";
							}
							else
							{
								sp.moreThanStr = "FILTER (?" + target.word.originalForm + " > " + num + ")";
							}
						}
					}
				}
			}
		}
		// most
		for(Word word: words)
		{
			if(word.baseForm.equals("most"))
			{
				Word modifiedWord = word.modifiedWord;
				if(modifiedWord != null)
				{
					for(Sparql sp: qlog.rankedSparqls)
					{
						// Which Indian company has the most employees? --> ... dbo:numberOfEmployees ?n . || ?employees dbo:company ...
						sp.mostStr = "ORDER BY DESC(COUNT(?" + modifiedWord.originalForm + "))\nOFFSET 0 LIMIT 1";
						for(Triple triple: sp.tripleList)
						{
							String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase();
							if(p.contains("number") || p.contains("total"))
							{
								sp.mostStr = "ORDER BY DESC(?" + modifiedWord.originalForm + ")\nOFFSET 0 LIMIT 1";
							}
						}
					}
				}
			}
		}
	}

	public static void main(String[] args) {
		System.out.println(translateNumbers("Twelve"));
		System.out.println(translateNumbers("thirty two"));
	}
}
@@ -0,0 +1,312 @@
package fgmt;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;

import rdf.EntityMapping;
import lcn.EntityFragmentFields;
import lcn.EntityNameAndScore;
import lcn.SearchInEntityFragments;

public class EntityFragment extends Fragment {
	public int eId;
	public HashSet<Integer> inEdges = new HashSet<Integer>();
	public HashSet<Integer> outEdges = new HashSet<Integer>();
	public HashSet<Integer> types = new HashSet<Integer>();

	// In/out entities and the connecting edges. E.g., given <eId><director><tom> and <eId><star><tom>,
	// the outEntMap of eId contains <tom, <director, star>>.
	public HashMap<Integer, ArrayList<Integer>> inEntMap = new HashMap<Integer, ArrayList<Integer>>(); // note: the input file should contain no redundant triples.
	public HashMap<Integer, ArrayList<Integer>> outEntMap = new HashMap<Integer, ArrayList<Integer>>();

	static double thres1 = 0.4;
	static double thres2 = 0.8;
	static int thres3 = 3;
	static int k = 50;
	/**
	 * Map a mention to entities using the Lucene index.
	 *
	 * Rule:
	 * select the top-k results for each phrase;
	 * (1) if the current lowest score < thres1, drop results with score < thres1;
	 * (2) if the current lowest score > thres2, also keep results with score > thres2.
	 *
	 * Exact match:
	 * (1) Lucene score = 1;
	 * (2) string match (lowercase): edit distance <= thres3.
	 *
	 * Score:
	 * use the Lucene score directly.
	 *
	 * @param phrase
	 * @return
	 */
	public static HashMap<Integer, Double> getCandEntityNames2(String phrase) {
		HashMap<Integer, Double> ret = new HashMap<Integer, Double>();
		ArrayList<EntityNameAndScore> list1 = getCandEntityNames_subject(phrase, thres1, thres2, k);
		if(list1 == null)
			return ret;

		int iter_size = 0;
		if (list1.size() <= k) {
			iter_size = list1.size();
		}
		else {
			if (list1.get(k-1).score >= thres2) {
				iter_size = list1.size();
			}
			else {
				iter_size = k;
			}
		}

		for(int i = 0; i < iter_size; i ++) {
			if (i < k) {
				ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score));
			}
			else if (list1.get(i).score >= thres2) {
				ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score));
			}
			else {
				break;
			}
		}
		return ret;
	}
	public static ArrayList<EntityMapping> getEntityMappingList(String n)
	{
		HashMap<Integer, Double> map = getCandEntityNames2(n);
		ArrayList<EntityMapping> ret = new ArrayList<EntityMapping>();
		for (int eid : map.keySet())
		{
			String s = EntityFragmentFields.entityId2Name.get(eid);
			ret.add(new EntityMapping(eid, s, map.get(eid)));
		}
		Collections.sort(ret);
		return ret;
	}

	public static double getScore(String s1, String s2, double luceneScore) {
		double ret = luceneScore*100.0/(Math.log(calEditDistance(s1, s2)*1.5+1)+1);
		return ret;
	}
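	// Worked example: for an exact lowercase match the edit distance is 0, so the
	// denominator is Math.log(0*1.5+1)+1 = 1 and the score is simply luceneScore*100;
	// each extra unit of edit distance shrinks the score logarithmically.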
	/**
	 * Edit distance (all lowercase)
	 * @param s1
	 * @param s2
	 * @return
	 */
	public static int calEditDistance(String s1, String s2) {
		s1 = s1.toLowerCase();
		s2 = s2.toLowerCase();

		int d[][];
		int n = s1.length();
		int m = s2.length();
		int i, j, temp;
		char ch1, ch2;

		if(n == 0) {
			return m;
		}
		if(m == 0) {
			return n;
		}

		d = new int[n+1][m+1];
		for(i=0; i<=n; i++) {
			d[i][0] = i;
		}
		for(j=0; j<=m; j++) {
			d[0][j] = j;
		}
		for(i=1; i<=n; i++) {
			ch1 = s1.charAt(i-1);
			for(j=1; j<=m; j++) {
				ch2 = s2.charAt(j-1);
				if(ch1 == ch2) {
					temp = 0;
				} else {
					temp = 1;
				}
				d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp);
			}
		}
		return d[n][m];
	}

	private static int min(int a, int b, int c) {
		int ab = a<b?a:b;
		return ab<c?ab:c;
	}
	public static ArrayList<EntityNameAndScore> getCandEntityNames_subject(String phrase, double thres1, double thres2, int k) {
		SearchInEntityFragments sf = new SearchInEntityFragments();
		//System.out.println("EntityFragment.getCandEntityNames_subject() ...");
		ArrayList<EntityNameAndScore> ret_sf = null;
		try {
			ret_sf = sf.searchName(phrase, thres1, thres2, k);
		} catch (IOException e) {
			//e.printStackTrace();
			System.err.println("Error reading Lucene (lcn) index.");
		}
		return ret_sf;
	}
	public static EntityFragment getEntityFragmentByEntityId(Integer entityId)
	{
		if(!EntityFragmentFields.entityFragmentString.containsKey(entityId))
			return null;
		String fgmt = EntityFragmentFields.entityFragmentString.get(entityId);
		EntityFragment ef = new EntityFragment(entityId, fgmt);
		return ef;
	}

	public static String getEntityFgmtStringByName(String entityName)
	{
		int id = EntityFragmentFields.entityName2Id.get(entityName);
		String fgmt = EntityFragmentFields.entityFragmentString.get(id);
		return fgmt;
	}
	public EntityFragment(int eid, String fgmt)
	{
		eId = eid;
		fragmentType = typeEnum.ENTITY_FRAGMENT;
		// e.g.: 11	|3961112:2881;410;,4641020:2330;,
		fgmt = fgmt.replace('|', '#');
		String[] fields = fgmt.split("#");

		// field 0: in-entities and their connecting edges
		if(fields.length > 0 && fields[0].length() > 0)
		{
			String[] entEdgesArr = fields[0].split(",");
			for(int i = 0; i < entEdgesArr.length; i ++)
			{
				String[] nums = entEdgesArr[i].split(":");
				if(nums.length != 2)
					continue;
				int inEntId = Integer.valueOf(nums[0]);
				String[] inEdgesOfEnt = nums[1].split(";");
				ArrayList<Integer> inEdgeList = new ArrayList<Integer>();
				for(String inEdge: inEdgesOfEnt)
				{
					inEdgeList.add(Integer.valueOf(inEdge));
				}
				if(inEdgeList.size() > 0)
					inEntMap.put(inEntId, inEdgeList);
			}
		}
		// field 1: out-entities and their connecting edges
		if(fields.length > 1 && fields[1].length() > 0)
		{
			String[] entEdgesArr = fields[1].split(",");
			for(int i = 0; i < entEdgesArr.length; i ++)
			{
				String[] nums = entEdgesArr[i].split(":");
				if(nums.length != 2)
					continue;
				int outEntId = Integer.valueOf(nums[0]);
				String[] outEdges = nums[1].split(";");
				ArrayList<Integer> outEdgeList = new ArrayList<Integer>();
				for(String outEdge: outEdges)
				{
					outEdgeList.add(Integer.valueOf(outEdge));
				}
				if(outEdgeList.size() > 0)
					outEntMap.put(outEntId, outEdgeList);
			}
		}
		// field 2: in-edge ids
		if(fields.length > 2 && fields[2].length() > 0) {
			String[] nums = fields[2].split(",");
			for(int i = 0; i < nums.length; i ++) {
				if (nums[i].length() > 0) {
					inEdges.add(Integer.parseInt(nums[i]));
				}
			}
		}
		// field 3: out-edge ids
		if(fields.length > 3 && fields[3].length() > 0) {
			String[] nums = fields[3].split(",");
			for(int i = 0; i < nums.length; i ++) {
				if (nums[i].length() > 0) {
					outEdges.add(Integer.parseInt(nums[i]));
				}
			}
		}
		// field 4: type ids
		if(fields.length > 4 && fields[4].length() > 0) {
			String[] nums = fields[4].split(",");
			for(int i = 0; i < nums.length; i ++) {
				if (nums[i].length() > 0) {
					types.add(Integer.parseInt(nums[i]));
				}
			}
		}

		//TODO: patch data for DBpedia 2014 (should be removed when the dataset is updated)
		if(eid == 2640237) // Barack_Obama
		{
			inEdges.add(8432); // spouse
			outEdges.add(8432);
			ArrayList<Integer> outEdgeList = new ArrayList<Integer>();
			outEdgeList.add(8432);
			inEntMap.put(4953443, outEdgeList);
			outEntMap.put(4953443, outEdgeList);
		}
	}
	@Override
	public String toString()
	{
		StringBuilder ret = new StringBuilder("");
		for(Integer inEnt: inEntMap.keySet())
		{
			ArrayList<Integer> inEdgeList = inEntMap.get(inEnt);
			if(inEdgeList == null || inEdgeList.size() == 0)
				continue;
			ret.append(inEnt+":");
			for(int inEdge: inEdgeList)
				ret.append(inEdge+";");
			ret.append(",");
		}
		ret.append('|');
		for(Integer outEnt: outEntMap.keySet())
		{
			ArrayList<Integer> outEdgeList = outEntMap.get(outEnt);
			if(outEdgeList == null || outEdgeList.size() == 0)
				continue;
			ret.append(outEnt+":");
			for(int outEdge: outEdgeList)
				ret.append(outEdge+";");
			ret.append(",");
		}
		ret.append('|');
		for(Integer p : inEdges) {
			ret.append(p);
			ret.append(',');
		}
		ret.append('|');
		for(Integer p : outEdges) {
			ret.append(p);
			ret.append(',');
		}
		ret.append('|');
		for(Integer t : types) {
			ret.append(t);
			ret.append(',');
		}
		return ret.toString();
	}
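	// Illustrative only (not part of the original class): shows how a fragment string
	// round-trips through the constructor and toString(). All ids below (11, 100, 200,
	// 8432, 5) are made up for the example.
	public static void main(String[] args)
	{
		EntityFragment ef = new EntityFragment(11, "100:8432;,|200:8432;,|8432,|8432,|5,");
		System.out.println(ef); // prints the same five '|'-separated fields back
	}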
}
@@ -0,0 +1,8 @@
package fgmt;

public abstract class Fragment {
	public enum typeEnum {ENTITY_FRAGMENT, RELATION_FRAGMENT, TYPE_FRAGMENT, VAR_FRAGMENT};

	public typeEnum fragmentType;
	public int fragmentId;
}
@@ -0,0 +1,105 @@
package fgmt;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import qa.Globals;
import utils.FileUtil;

public class RelationFragment extends Fragment
{
	public static HashMap<Integer, ArrayList<RelationFragment>> relFragments = null;
	public static HashMap<String, ArrayList<Integer>> relationShortName2IdList = null;
	public static HashSet<Integer> literalRelationSet = null;

	public HashSet<Integer> inTypes = new HashSet<Integer>();
	public HashSet<Integer> outTypes = new HashSet<Integer>();
	public static final int literalTypeId = -176;

	public RelationFragment(String inFgmt, String outFgmt, int fid)
	{
		fragmentId = fid;
		fragmentType = typeEnum.RELATION_FRAGMENT;
		String[] nums;

		// in types
		nums = inFgmt.split(",");
		for(String s: nums)
			if(s.length() > 0)
				inTypes.add(Integer.parseInt(s));

		// out types
		if(outFgmt.equals("itera"))
			outTypes.add(literalTypeId);
		else
		{
			nums = outFgmt.split(",");
			for(String s: nums)
				if(s.length() > 0)
					outTypes.add(Integer.parseInt(s));
		}
	}
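	// Expected input line format for load() below (an assumption inferred from the
	// parsing code): three tab-separated fields per line,
	//   <in type ids, bracketed, comma-separated> \t <predicate id> \t <out type ids>
	// where the first and last character of fields 1 and 3 are stripped, and a third
	// field of "literal" marks a relation whose objects are literals.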
	public static void load() throws Exception
	{
		String filename = Globals.localPath + "data/DBpedia2016/fragments/predicate_RDF_fragment/predicate_fragment.txt";
		List<String> inputs = FileUtil.readFile(filename);
		relFragments = new HashMap<Integer, ArrayList<RelationFragment>>();
		literalRelationSet = new HashSet<Integer>();
		for(String line: inputs)
		{
			String[] lines = line.split("\t");
			String inString = lines[0].substring(1, lines[0].length()-1);
			int pid = Integer.parseInt(lines[1]);
			String outString = lines[2].substring(1, lines[2].length()-1);

			// Record which relations can connect LITERAL objects.
			if(outString.equals("itera")) // "literal".substring(1, length()-1)
				literalRelationSet.add(pid);

			if(!relFragments.containsKey(pid))
				relFragments.put(pid, new ArrayList<RelationFragment>());
			relFragments.get(pid).add(new RelationFragment(inString, outString, pid));
		}
		loadId();
	}

	public static void loadId() throws IOException
	{
		String filename = Globals.localPath + "data/DBpedia2016/fragments/id_mappings/16predicate_id.txt";
		List<String> inputs = FileUtil.readFile(filename);
		relationShortName2IdList = new HashMap<String, ArrayList<Integer>>();
		for(String line: inputs)
		{
			String[] lines = line.split("\t");
			String rlnShortName = lines[0];
			if (!relationShortName2IdList.containsKey(rlnShortName))
				relationShortName2IdList.put(rlnShortName, new ArrayList<Integer>());
			relationShortName2IdList.get(rlnShortName).add(Integer.parseInt(lines[1]));
		}
	}

	public static boolean isLiteral(String p)
	{
		for (Integer i : relationShortName2IdList.get(p))
			if (literalRelationSet.contains(i))
				return true;
		return false;
	}

	public static boolean isLiteral(int pid)
	{
		return literalRelationSet.contains(pid);
	}
}
@@ -0,0 +1,179 @@
package fgmt;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import qa.Globals;

public class TypeFragment extends Fragment {
	public static HashMap<Integer, TypeFragment> typeFragments = null;
	public static HashMap<String, ArrayList<Integer>> typeShortName2IdList = null;
	public static HashMap<Integer, String> typeId2ShortName = null;
	public static final int NO_RELATION = -24232;
	public static HashSet<String> yagoTypeList = null;

	public HashSet<Integer> inEdges = new HashSet<Integer>();
	public HashSet<Integer> outEdges = new HashSet<Integer>();
	public HashSet<Integer> entSet = new HashSet<Integer>();

	/*
	 * Eliminate some bad YAGO types which conflict with:
	 * 1. ENT: amazon, earth, the_hunger_game, sparkling_wine
	 * 2. TYPE: type
	 * 3. RELATION: flow, owner, series, shot, part, care
	 * 4. others: peace, vice
	 */
	public static ArrayList<String> stopYagoTypeList = null;
	static void loadStopYagoTypeList()
	{
		stopYagoTypeList = new ArrayList<String>();
		stopYagoTypeList.add("Amazon");
		stopYagoTypeList.add("Earth");
		stopYagoTypeList.add("TheHungerGames");
		stopYagoTypeList.add("SparklingWine");
		stopYagoTypeList.add("Type");
		stopYagoTypeList.add("Flow");
		stopYagoTypeList.add("Owner");
		stopYagoTypeList.add("Series");
		stopYagoTypeList.add("Shot");
		stopYagoTypeList.add("Part");
		stopYagoTypeList.add("Care");
		stopYagoTypeList.add("Peace");
		stopYagoTypeList.add("Vice");
		stopYagoTypeList.add("Dodo");
		stopYagoTypeList.add("CzechFilms");
		stopYagoTypeList.add("ChineseFilms");
	}
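	// The fgmt string parsed below has the format (inferred from the code):
	//   "<in-edge ids, comma-separated>|<out-edge ids>|<entity ids>"
	// An empty in-edge or out-edge section is recorded as the NO_RELATION marker.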
	public TypeFragment(String fgmt, int fid)
	{
		fragmentId = fid;
		fragmentType = typeEnum.TYPE_FRAGMENT;
		fgmt = fgmt.replace('|', '#');
		String[] ss = fgmt.split("#");
		String[] nums;

		if (ss[0].length() > 0) {
			nums = ss[0].split(",");
			for(int i = 0; i < nums.length; i ++) {
				if (nums[i].length() > 0) {
					inEdges.add(Integer.parseInt(nums[i]));
				}
			}
		}
		else {
			inEdges.add(NO_RELATION);
		}

		if (ss.length > 1 && ss[1].length() > 0) {
			nums = ss[1].split(",");
			for(int i = 0; i < nums.length; i ++) {
				if (nums[i].length() > 0) {
					outEdges.add(Integer.parseInt(nums[i]));
				}
			}
		}
		else {
			outEdges.add(NO_RELATION);
		}

		if(ss.length > 2 && ss[2].length() > 0)
		{
			nums = ss[2].split(",");
			for(int i = 0; i < nums.length; i ++) {
				if (nums[i].length() > 0) {
					entSet.add(Integer.parseInt(nums[i]));
				}
			}
		}
	}
	public static void load() throws Exception
	{
		String filename = Globals.localPath+"data/DBpedia2016/fragments/class_RDF_fragment/16type_fragment.txt";
		File file = new File(filename);
		InputStreamReader in = new InputStreamReader(new FileInputStream(file), "utf-8");
		BufferedReader br = new BufferedReader(in);
		typeFragments = new HashMap<Integer, TypeFragment>();

		System.out.println("Loading type IDs and fragments ...");
		String line;
		while((line = br.readLine()) != null) {
			String[] lines = line.split("\t");
			TypeFragment tfgmt = null;
			if(lines[0].length() > 0 && !lines[0].equals("literal")) {
				int tid = Integer.parseInt(lines[0]);
				try { tfgmt = new TypeFragment(lines[1], tid); }
				catch(Exception e) {} // tfgmt stays null if the fragment string is malformed
				typeFragments.put(tid, tfgmt);
			}
		}
		br.close();
		// data fixes could be applied here

		// load type ids
		loadId();
		System.out.println("Loaded "+typeId2ShortName.size()+" basic types and "+yagoTypeList.size()+" yago types.");
	}
	public static void loadId() throws IOException
	{
		String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16basic_types_id.txt";
		String yagoFileName = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16yago_types_list.txt";
		File file = new File(filename);
		InputStreamReader in = new InputStreamReader(new FileInputStream(file), "utf-8");
		BufferedReader br = new BufferedReader(in);
		typeShortName2IdList = new HashMap<String, ArrayList<Integer>>();
		typeId2ShortName = new HashMap<Integer, String>();
		String line;
		while((line = br.readLine()) != null) {
			String[] lines = line.split("\t");
			String typeShortName = lines[0];
			// preserve typeShortName's capitalization
			if (!typeShortName2IdList.containsKey(typeShortName)) {
				typeShortName2IdList.put(typeShortName, new ArrayList<Integer>());
			}
			typeShortName2IdList.get(typeShortName).add(Integer.parseInt(lines[1]));
			typeId2ShortName.put(Integer.parseInt(lines[1]), typeShortName);
		}
		// literal type
		typeShortName2IdList.put("literal_HRZ", new ArrayList<Integer>());
		typeShortName2IdList.get("literal_HRZ").add(RelationFragment.literalTypeId);
		typeId2ShortName.put(RelationFragment.literalTypeId, "literal_HRZ");
		br.close();

		// load YAGO types
		in = new InputStreamReader(new FileInputStream(yagoFileName), "utf-8");
		br = new BufferedReader(in);
		yagoTypeList = new HashSet<String>();
		while((line = br.readLine()) != null)
		{
			String[] lines = line.split("\t");
			String typeName = lines[0];
			yagoTypeList.add(typeName);
		}
		loadStopYagoTypeList();
		yagoTypeList.removeAll(stopYagoTypeList);
	}
}
@@ -0,0 +1,56 @@
package fgmt;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;

public class VariableFragment extends Fragment {
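	// magic_number is a sentinel meaning "any type": a candidate type set containing
	// it matches a query set in containsAll() as soon as the two sets intersect.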
	public static final int magic_number = -265;
	public ArrayList<HashSet<Integer>> candTypes = null;
	public HashSet<Integer> candEntities = null;
	public boolean mayLiteral = false;

	public VariableFragment()
	{
		fragmentType = typeEnum.VAR_FRAGMENT;
		candTypes = new ArrayList<HashSet<Integer>>();
		candEntities = new HashSet<Integer>();
	}

	@Override
	public String toString()
	{
		return "(" + candEntities.size() + ")";
	}

	public boolean containsAll(HashSet<Integer> s1) {
		Iterator<HashSet<Integer>> it = candTypes.iterator();
		while(it.hasNext()) {
			HashSet<Integer> s2 = it.next();
			if (s2.contains(magic_number)) {
				if (!Collections.disjoint(s1, s2)) {
					return true;
				}
			}
			else {
				if (s1.containsAll(s2) && s2.containsAll(s1)) {
					return true;
				}
			}
		}
		return false;
	}

	public boolean contains(Integer i) {
		Iterator<HashSet<Integer>> it = candTypes.iterator();
		while(it.hasNext()) {
			HashSet<Integer> s = it.next();
			if (s.contains(i)) {
				return true;
			}
		}
		return false;
	}
}
@@ -0,0 +1,489 @@
package jgsc;

import java.io.*;
import java.net.*;
import java.util.List;
import java.util.Map;

public class GstoreConnector {
	public static final String defaultServerIP = "127.0.0.1";
	public static final int defaultServerPort = 9000;

	private String serverIP;
	private int serverPort;
	//private Socket socket = null;

	public GstoreConnector() {
		this.serverIP = GstoreConnector.defaultServerIP;
		this.serverPort = GstoreConnector.defaultServerPort;
	}

	public GstoreConnector(int _port) {
		this.serverIP = GstoreConnector.defaultServerIP;
		this.serverPort = _port;
	}

	public GstoreConnector(String _ip, int _port) {
		this.serverIP = _ip;
		this.serverPort = _port;
	}

	//PERFORMANCE: if the query result may be too large, receive it and save it to a file directly at once.
	//In addition, set -Xmx larger (maybe several GBs) if the query result can be very large;
	//this may help reduce the GC cost.
	public String sendGet(String param) {
		String url = "http://" + this.serverIP + ":" + this.serverPort;
		StringBuffer result = new StringBuffer();
		BufferedReader in = null;
		System.out.println("parameter: " + param);

		try {
			param = URLEncoder.encode(param, "UTF-8");
		}
		catch (UnsupportedEncodingException ex) {
			throw new RuntimeException("Broken VM does not support UTF-8");
		}

		try {
			String urlNameString = url + "/" + param;
			System.out.println("request: " + urlNameString);
			URL realUrl = new URL(urlNameString);
			// open the connection to the URL
			URLConnection connection = realUrl.openConnection();
			// set common request properties
			connection.setRequestProperty("accept", "*/*");
			connection.setRequestProperty("connection", "Keep-Alive");
			//set an agent to avoid being speed-limited when the server thinks the client is not a browser
			connection.setRequestProperty("user-agent",
					"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
			// establish the actual connection
			connection.connect();
			long t0 = System.currentTimeMillis(); //ms
			// get all response header fields
			Map<String, List<String>> map = connection.getHeaderFields();
			// iterate over the response header fields
			//for (String key : map.keySet()) {
			//    System.out.println(key + "--->" + map.get(key));
			//}
			long t1 = System.currentTimeMillis(); //ms
			//System.out.println("Time to get header: "+(t1 - t0)+" ms");
			//System.out.println("============================================");

			// define a BufferedReader to read the response of the URL
			in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8"));
			String line;
			while ((line = in.readLine()) != null) {
				//PERFORMANCE: this can be very costly if the result is very large, because many temporary Strings are produced.
				//In that case, just printing the line directly will be much faster.
				result.append(line + "\n");
				//System.out.println("get data size: " + line.length());
				//System.out.println(line);
			}
			long t2 = System.currentTimeMillis(); //ms
			//System.out.println("Time to get data: "+(t2 - t1)+" ms");
		} catch (Exception e) {
			System.out.println("error in get request: " + e);
			e.printStackTrace();
		}
		// use a finally block to close the input stream
		finally {
			try {
				if (in != null) {
					in.close();
				}
			} catch (Exception e2) {
				e2.printStackTrace();
			}
		}
		return result.toString();
	}
	public void sendGet(String param, String filename) {
		String url = "http://" + this.serverIP + ":" + this.serverPort;
		BufferedReader in = null;
		System.out.println("parameter: " + param);

		if (filename == null)
			return;
		FileWriter fw = null;
		try {
			fw = new FileWriter(filename);
		} catch (IOException e) {
			System.out.println("can not open " + filename + "!");
		}

		try {
			param = URLEncoder.encode(param, "UTF-8");
		} catch (UnsupportedEncodingException ex) {
			throw new RuntimeException("Broken VM does not support UTF-8");
		}

		try {
			String urlNameString = url + "/" + param;
			System.out.println("request: " + urlNameString);
			URL realUrl = new URL(urlNameString);
			// open the connection to the URL
			URLConnection connection = realUrl.openConnection();
			// set common request properties
			connection.setRequestProperty("accept", "*/*");
			connection.setRequestProperty("connection", "Keep-Alive");
			//set an agent to avoid being speed-limited when the server thinks the client is not a browser
			connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
			// establish the actual connection
			connection.connect();
			long t0 = System.currentTimeMillis(); //ms
			// get all response header fields
			Map<String, List<String>> map = connection.getHeaderFields();
			// iterate over the response header fields
			//for (String key : map.keySet()) {
			//    System.out.println(key + "--->" + map.get(key));
			//}
			long t1 = System.currentTimeMillis(); //ms
			//System.out.println("Time to get header: "+(t1 - t0)+" ms");

			// define a BufferedReader to read the response of the URL
			in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8"));
			char chars[] = new char[2048];
			int b;
			while ((b = in.read(chars, 0, 2048)) != -1) {
				if (fw != null)
					fw.write(chars, 0, b); // only write the characters actually read
			}
			long t2 = System.currentTimeMillis(); //ms
			//System.out.println("Time to get data: "+(t2 - t1)+" ms");
		} catch (Exception e) {
			//System.out.println("error in get request: " + e);
			e.printStackTrace();
		}
		// use a finally block to close the input stream and the file writer
		finally {
			try {
				if (in != null) {
					in.close();
				}
				if (fw != null) {
					fw.close();
				}
			} catch (Exception e2) {
				e2.printStackTrace();
			}
		}
		return;
	}
	//NOTICE: no need to connect now, the HTTP connection is kept alive by default
	public boolean load(String _db_name, String _username, String _password) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.load");
			return false;
		}

		String cmd = "?operation=load&db_name=" + _db_name + "&username=" + _username + "&password=" + _password;
		String msg = this.sendGet(cmd);
		//if (!send_return) {
		//    System.err.println("send load command error. @GstoreConnector.load");
		//    return false;
		//}
		this.disconnect();

		System.out.println(msg);
		if (msg.equals("load database done.")) {
			return true;
		}
		return false;
	}
	public boolean unload(String _db_name, String _username, String _password) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.unload");
			return false;
		}

		String cmd = "?operation=unload&db_name=" + _db_name + "&username=" + _username + "&password=" + _password;
		String msg = this.sendGet(cmd);
		this.disconnect();

		System.out.println(msg);
		if (msg.equals("unload database done.")) {
			return true;
		}
		return false;
	}

	public boolean build(String _db_name, String _rdf_file_path, String _username, String _password) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.build");
			return false;
		}

		//TODO: also use encode to support spaces?
		//Consider changing the format into ?name=DBname
		String cmd = "?operation=build&db_name=" + _db_name + "&ds_path=" + _rdf_file_path + "&username=" + _username + "&password=" + _password;
		String msg = this.sendGet(cmd);
		this.disconnect();

		System.out.println(msg);
		if (msg.equals("import RDF file to database done.")) {
			return true;
		}
		return false;
	}

	//TODO: not implemented
	public boolean drop(String _db_name) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.drop");
			return false;
		}

		String cmd = "drop/" + _db_name;
		String msg = this.sendGet(cmd);
		this.disconnect();

		System.out.println(msg);
		return msg.equals("drop database done.");
	}
	public String query(String _username, String _password, String _db_name, String _sparql) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.query");
			return "connect to server error.";
		}

		//URL encoding should be used here
		//try {
		//    _sparql = URLEncoder.encode("\""+_sparql+"\"", "UTF-8");
		//}
		//catch (UnsupportedEncodingException ex) {
		//    throw new RuntimeException("Broken VM does not support UTF-8");
		//}

		String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=txt&sparql=" + _sparql;
		//String cmd = "query/\"" + _sparql + "\"";
		String msg = this.sendGet(cmd);
		this.disconnect();
		return msg;
	}

	public void query(String _username, String _password, String _db_name, String _sparql, String _filename) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.query");
		}

		String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=json&sparql=" + _sparql;
		this.sendGet(cmd, _filename);
		this.disconnect();
		return;
	}
	// public String show() {
	//     return this.show(false);
	// }

	//show all databases
	public String show() {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.show");
			return "connect to server error.";
		}

		String cmd = "?operation=show";
		String msg = this.sendGet(cmd);
		this.disconnect();
		return msg;
	}

	public String user(String type, String username1, String password1, String username2, String addtion) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.user");
			return "connect to server error.";
		}

		String cmd = "?operation=user&type=" + type + "&username1=" + username1 + "&password1=" + password1 + "&username2=" + username2 + "&addtion=" + addtion;
		String msg = this.sendGet(cmd);
		this.disconnect();
		return msg;
	}

	public String showUser() {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.showUser");
			return "connect to server error.";
		}

		String cmd = "?operation=showUser";
		String msg = this.sendGet(cmd);
		this.disconnect();
		return msg;
	}

	public String monitor(String db_name) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.monitor");
			return "connect to server error.";
		}

		String cmd = "?operation=monitor&db_name=" + db_name;
		String msg = this.sendGet(cmd);
		this.disconnect();
		return msg;
	}

	public String checkpoint(String db_name) {
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.checkpoint");
			return "connect to server error.";
		}

		String cmd = "?operation=checkpoint&db_name=" + db_name;
		String msg = this.sendGet(cmd);
		this.disconnect();
		return msg;
	}

	public String test_download(String filepath)
	{
		boolean connect_return = this.connect();
		if (!connect_return) {
			System.err.println("connect to server error. @GstoreConnector.test_download");
			return "connect to server error.";
		}

		//TEST: a small file, a large file
		String cmd = "?operation=delete&download=true&filepath=" + filepath;
		String msg = this.sendGet(cmd);
		this.disconnect();
		return msg;
	}
	private boolean connect() {
		return true;
	}

	private boolean disconnect() {
		return true;
	}

	private static byte[] packageMsgData(String _msg) {
		//byte[] data_context = _msg.getBytes();
		byte[] data_context = null;
		try {
			data_context = _msg.getBytes("utf-8");
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
			System.err.println("utf-8 charset is unsupported.");
			data_context = _msg.getBytes();
		}
		int context_len = data_context.length + 1; // 1 byte for '\0' at the end of the context.
		int data_len = context_len + 4; // 4 bytes for one int (data_len at the data's head).
		byte[] data = new byte[data_len];

		// padding head (context_len).
		byte[] head = GstoreConnector.intToByte4(context_len);
		for (int i = 0; i < 4; i++) {
			data[i] = head[i];
		}
		// padding context.
		for (int i = 0; i < data_context.length; i++) {
			data[i + 4] = data_context[i];
		}
		// in C, a char array needs '\0' as a terminator, so we add '\0' at the end of the message.
		data[data_len - 1] = 0;
		return data;
	}
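	// Resulting layout (little-endian length prefix): [4-byte context_len][utf-8 bytes of _msg]['\0'],
	// where context_len counts the message bytes plus the trailing '\0'.
	// E.g. packageMsgData("hi") -> {3, 0, 0, 0, 'h', 'i', 0}.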
	private static byte[] intToByte4(int _x) // little-endian format.
	{
		byte[] ret = new byte[4];
		ret[0] = (byte) (_x);
		ret[1] = (byte) (_x >>> 8);
		ret[2] = (byte) (_x >>> 16);
		ret[3] = (byte) (_x >>> 24);
		return ret;
	}

	private static int byte4ToInt(byte[] _b) // little-endian format.
	{
		int byte0 = _b[0] & 0xFF, byte1 = _b[1] & 0xFF, byte2 = _b[2] & 0xFF, byte3 = _b[3] & 0xFF;
		int ret = (byte0) | (byte1 << 8) | (byte2 << 16) | (byte3 << 24);
		return ret;
	}
	public static void main(String[] args) {
		// initialize the gStore server's IP address and port.
		GstoreConnector gc = new GstoreConnector("172.31.222.90", 9001);

		// build a new database from an RDF file.
		// note that the relative path is relative to gserver.
		//gc.build("db_LUBM10", "example/rdf_triple/LUBM_10_GStore.n3");
		String sparql = "select ?x where {"
				+ "<Area_51> <location> ?x"
				+ "}";
		sparql = "select ?countries where { ?countries <type> <Country> . ?caves <type> <Cave> . ?caves <location> ?countries . } "
				+ "GROUP BY ?countries HAVING(COUNT(?caves) > 1000)";
		sparql = "ASK where { <Proinsulin> <type> <Protein> .}";
		sparql = "select DISTINCT ?film ?budget where { ?film <type> <Film> . ?film <director> <Paul_W._S._Anderson> . ?film <budget> ?budget . }";

		//boolean flag = gc.load("dbpedia16", "root", "123456");
		//System.out.println(flag);
		String answer = gc.query("root", "123456", "dbpedia16", sparql);
		System.out.println(answer);

		// To count the time cost:
		//long startTime = System.currentTimeMillis(); //ms
		//doSomeThing(); // the operation to be timed
		//long endTime = System.currentTimeMillis(); // get the end time
		//System.out.println("Run time: " + (endTime - startTime) + "ms");
	}
}
@@ -0,0 +1,133 @@
package lcn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import qa.Globals;
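// Note: this class targets the legacy pre-3.x Lucene API (Field.Index.TOKENIZED,
// IndexWriter.setMergeFactor, optimize()); it will not compile against modern Lucene versions.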
/**
 * Lucene's basic indexing unit is the Document; fields can be added to it as needed.
 *
 * A Document is one record/entry, comparable to a row in a database, and is the unit
 * over which the inverted index is built. E.g., to search files on your own computer,
 * you would create fields (like database columns), combine them into Documents, and
 * write those into the index. This Document is not the same concept as a file-system
 * document.
 *
 * StandardAnalyzer is Lucene's built-in "standard analyzer". It:
 * 1. tokenizes the original sentence on whitespace;
 * 2. lowercases all uppercase letters;
 * 3. removes useless stop words such as "is", "the", "are", as well as all punctuation.
 */
public class BuildIndexForEntityFragments {
	public void indexforentity() throws Exception
	{
		if(EntityFragmentFields.entityId2Name == null)
			EntityFragmentFields.load();

		long startTime = new Date().getTime();

		//Try update KB index to DBpedia2015. by husen 2016-04-08
		//Try update KB index to DBpedia2016. by husen 2018-8-22
		File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index");
		File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt");

		Analyzer luceneAnalyzer_en = new StandardAnalyzer();
		IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en, true);

		int mergeFactor = 100000;            //default 10
		int maxBufferedDoc = 1000;           //default 10
		int maxMergeDoc = Integer.MAX_VALUE; //INF
		//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor;
		indexWriter_en.setMergeFactor(mergeFactor);
		indexWriter_en.setMaxBufferedDocs(maxBufferedDoc);
		indexWriter_en.setMaxMergeDocs(maxMergeDoc);

		FileInputStream file = new FileInputStream(sourceDir_en);
		InputStreamReader in = new InputStreamReader(file, "UTF-8");
		BufferedReader br = new BufferedReader(in);

		int count = 0;
		while(true)
		{
			String line = br.readLine();
			if(line == null)
				break;
			count++;
			if(count % 100000 == 0)
				System.out.println(count);

			String temp[] = line.split("\t");
			if(temp.length != 2)
				continue;
			else
			{
				int entity_id = Integer.parseInt(temp[0]);
				if(!EntityFragmentFields.entityId2Name.containsKey(entity_id))
					continue;

				String entity_name = EntityFragmentFields.entityId2Name.get(entity_id);
				String entity_fragment = temp[1];
				entity_name = entity_name.replace("____", " ");
				entity_name = entity_name.replace("__", " ");
				entity_name = entity_name.replace("_", " ");

				Document document = new Document();
				Field EntityName = new Field("EntityName", entity_name, Field.Store.YES,
						Field.Index.TOKENIZED,
						Field.TermVector.WITH_POSITIONS_OFFSETS);
				Field EntityId = new Field("EntityId", String.valueOf(entity_id),
						Field.Store.YES, Field.Index.NO);
				Field EntityFragment = new Field("EntityFragment", entity_fragment,
						Field.Store.YES, Field.Index.NO);
				document.add(EntityName);
				document.add(EntityId);
				document.add(EntityFragment);
				indexWriter_en.addDocument(document);
			}
		}
		indexWriter_en.optimize();
		indexWriter_en.close();
		br.close();

		// print the index build time
		long endTime = new Date().getTime();
		System.out.println("entity_name index has been built -> " + count + " Time: " + (endTime - startTime));
	}
	public static void main(String[] args)
	{
		BuildIndexForEntityFragments bef = new BuildIndexForEntityFragments();
		try
		{
			Globals.localPath = "D:/husen/gAnswer/";
			bef.indexforentity();
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}
}
@@ -0,0 +1,107 @@
package lcn;

import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import qa.Globals;
import fgmt.TypeFragment;
public class BuildIndexForTypeShortName { | |||||
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception | |||||
{ | |||||
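// Index every type short name under two fields: "SplittedTypeShortName" (tokenized, searchable)
// and "TypeShortName" (stored verbatim for retrieval).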
long startTime = new Date().getTime(); | |||||
File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index"); | |||||
Analyzer luceneAnalyzer_li = new StandardAnalyzer(); | |||||
IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true); | |||||
int mergeFactor = 100000; | |||||
int maxBufferedDoc = 1000; | |||||
int maxMergeDoc = Integer.MAX_VALUE; | |||||
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; | |||||
indexWriter_li.setMergeFactor(mergeFactor); | |||||
indexWriter_li.setMaxBufferedDocs(maxBufferedDoc); | |||||
indexWriter_li.setMaxMergeDocs(maxMergeDoc); | |||||
int count = 0; | |||||
Iterator<String> it = typeShortName2IdList.keySet().iterator(); | |||||
while (it.hasNext()) | |||||
{ | |||||
String sn = it.next(); | |||||
if (sn.length() == 0) { | |||||
continue; | |||||
} | |||||
count ++; | |||||
StringBuilder splittedSn = new StringBuilder(""); | |||||
if(sn.contains("_")) | |||||
{ | |||||
String nsn = sn.replace("_", " "); | |||||
splittedSn.append(nsn.toLowerCase()); | |||||
} | |||||
else | |||||
{ | |||||
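// Split camelCase into lowercase tokens, e.g. "MusicalArtist" -> "musical artist" (illustrative example).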
int last = 0, i = 0; | |||||
for(i = 0; i < sn.length(); i ++) | |||||
{ | |||||
// split before any character that is not a lowercase letter (a camelCase boundary)
if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z')) | |||||
{ | |||||
splittedSn.append(sn.substring(last, i).toLowerCase()); | |||||
splittedSn.append(' '); | |||||
last = i; | |||||
} | |||||
} | |||||
splittedSn.append(sn.substring(last, i).toLowerCase()); | |||||
while(splittedSn.charAt(0) == ' ') { | |||||
splittedSn.deleteCharAt(0); | |||||
} | |||||
} | |||||
System.out.println("SplitttedType: "+splittedSn); | |||||
Document document = new Document(); | |||||
Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(), | |||||
Field.Store.YES, | |||||
Field.Index.TOKENIZED, | |||||
Field.TermVector.WITH_POSITIONS_OFFSETS); | |||||
Field TypeShortName = new Field("TypeShortName", sn, | |||||
Field.Store.YES, Field.Index.NO); | |||||
document.add(SplittedTypeShortName); | |||||
document.add(TypeShortName); | |||||
indexWriter_li.addDocument(document); | |||||
} | |||||
indexWriter_li.optimize(); | |||||
indexWriter_li.close(); | |||||
// report how long building the index took
long endTime = new Date().getTime();
System.out.println("TypeShortName index has been built -> " + count + " Time:" + (endTime - startTime));
} | |||||
public static void main (String[] args) { | |||||
try { | |||||
Globals.localPath="D:/husen/gAnswer/"; | |||||
TypeFragment.load(); | |||||
BuildIndexForTypeShortName.buildIndex(TypeFragment.typeShortName2IdList); | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,64 @@ | |||||
package lcn; | |||||
import java.io.BufferedReader; | |||||
import java.io.File; | |||||
import java.io.FileInputStream; | |||||
import java.io.IOException; | |||||
import java.io.InputStreamReader; | |||||
import java.util.HashMap; | |||||
import qa.Globals; | |||||
public class EntityFragmentFields { | |||||
// entity dictionary | |||||
public static HashMap<String, Integer> entityName2Id = null; | |||||
public static HashMap<Integer, String> entityId2Name = null; | |||||
public static HashMap<Integer, String> entityFragmentString = null; | |||||
public static void load() throws IOException | |||||
{ | |||||
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16entity_id.txt"; | |||||
String fragmentFileName = Globals.localPath+"data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"; | |||||
File file = new File(filename); | |||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8")); | |||||
entityName2Id = new HashMap<String, Integer>(); | |||||
entityId2Name = new HashMap<Integer, String>(); | |||||
long t1, t2, t3; | |||||
t1 = System.currentTimeMillis(); | |||||
// load entity id | |||||
System.out.println("Loading entity id ..."); | |||||
String line; | |||||
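// Each line of the id-mapping file: <EntityName>\tEntityId; the name is wrapped in angle brackets, which are stripped below.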
while((line = br.readLine()) != null) | |||||
{ | |||||
String[] lines = line.split("\t"); | |||||
String entName = lines[0].substring(1, lines[0].length()-1); | |||||
entityName2Id.put(entName, Integer.parseInt(lines[1])); | |||||
entityId2Name.put(Integer.parseInt(lines[1]), entName); | |||||
} | |||||
br.close(); | |||||
t2 = System.currentTimeMillis(); | |||||
System.out.println("Load "+entityId2Name.size()+" entity ids in "+ (t2-t1) + "ms."); | |||||
// load entity fragment | |||||
System.out.println("Loading entity fragments ..."); | |||||
br = new BufferedReader(new InputStreamReader(new FileInputStream(fragmentFileName),"utf-8")); | |||||
entityFragmentString = new HashMap<Integer, String>(); | |||||
while((line = br.readLine()) != null) | |||||
{ | |||||
String[] lines = line.split("\t"); | |||||
if(lines.length != 2) | |||||
continue; | |||||
int eId = Integer.parseInt(lines[0]); | |||||
entityFragmentString.put(eId, lines[1]); | |||||
} | |||||
t3 = System.currentTimeMillis(); | |||||
System.out.println("Load "+entityFragmentString.size()+" entity fragments in "+ (t3-t2) + "ms."); | |||||
br.close(); | |||||
} | |||||
} |
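/* A minimal usage sketch (illustrative; "Neymar" is an assumed entity name):
*   Globals.localPath = "D:/husen/gAnswer/";
*   EntityFragmentFields.load();
*   Integer id = EntityFragmentFields.entityName2Id.get("Neymar");
*   String fragment = EntityFragmentFields.entityFragmentString.get(id);
*/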
@@ -0,0 +1,31 @@ | |||||
package lcn; | |||||
public class EntityNameAndScore implements Comparable<EntityNameAndScore> { | |||||
public int entityID; | |||||
public String entityName; | |||||
public double score; | |||||
public EntityNameAndScore(int id, String n, double s) { | |||||
entityID = id; | |||||
entityName = n; | |||||
score = s; | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
return entityID + ":<" + entityName + ">\t" + score; | |||||
} | |||||
public int compareTo(EntityNameAndScore o) { | |||||
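// Sort in descending order of score, so the best-scoring entities come first.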
if(this.score < o.score) { | |||||
return 1; | |||||
} | |||||
else if (this.score > o.score) { | |||||
return -1; | |||||
} | |||||
else { | |||||
return 0; | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,58 @@ | |||||
package lcn; | |||||
//import java.io.IOException; | |||||
//import java.util.ArrayList; | |||||
import java.util.ArrayList; | |||||
import java.util.Scanner; | |||||
import fgmt.EntityFragment; | |||||
import qa.Globals; | |||||
import qa.mapping.EntityFragmentDict; | |||||
public class Main { | |||||
//Test: searching Entities and Types through Lucene Index. | |||||
public static void main(String[] aStrings) throws Exception{ | |||||
//SearchInLiteralSubset se = new SearchInLiteralSubset(); | |||||
SearchInTypeShortName st = new SearchInTypeShortName(); | |||||
SearchInEntityFragments sf = new SearchInEntityFragments(); | |||||
EntityFragmentDict efd = new EntityFragmentDict(); | |||||
EntityFragmentFields eff = null; | |||||
Globals.localPath = "D:/husen/gAnswer/"; | |||||
Scanner sc = new Scanner(System.in); | |||||
System.out.print("input name: "); | |||||
while(sc.hasNextLine()) | |||||
{ | |||||
String literal = sc.nextLine(); | |||||
System.out.println(literal); | |||||
//literal = cnlp.getBaseFormOfPattern(literal); | |||||
//search Type | |||||
ArrayList<String> result = st.searchType(literal, 0.4, 0.8, 10); | |||||
System.out.println("TypeShortName-->RESULT:"); | |||||
for (String s : result) { | |||||
System.out.println("<"+s + ">"); | |||||
} | |||||
//search Ent Fragment | |||||
// int eId = EntityFragmentFields.entityName2Id.get(literal); | |||||
// EntityFragment ef = EntityFragment.getEntityFragmentByEntityId(eId); | |||||
// System.out.println(ef); | |||||
//search Ent Name | |||||
// ArrayList<EntityNameAndScore> result = sf.searchName(literal, 0.4, 0.8, 50); | |||||
// System.out.println("EntityName-->RESULT:"); | |||||
// for(EntityNameAndScore enas: result) | |||||
// { | |||||
// System.out.println(enas); | |||||
// } | |||||
System.out.print("input name: "); | |||||
} | |||||
sc.close(); | |||||
} | |||||
} |
@@ -0,0 +1,84 @@ | |||||
package lcn; | |||||
import java.io.IOException; | |||||
import java.util.ArrayList; | |||||
import org.apache.lucene.analysis.Analyzer; | |||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||||
import org.apache.lucene.queryParser.ParseException; | |||||
import org.apache.lucene.queryParser.QueryParser; | |||||
import org.apache.lucene.search.Hits; | |||||
import org.apache.lucene.search.IndexSearcher; | |||||
import org.apache.lucene.search.Query; | |||||
import qa.Globals; | |||||
public class SearchInEntityFragments { | |||||
/*
* Search entity names in the Lucene entity-fragment index.
*/
public ArrayList<EntityNameAndScore> searchName(String literal, double thres1, double thres2, int k) throws IOException { | |||||
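// Keep a hit if it is within the top k and scores >= thres1, or beyond the top k and scores >= thres2;
// stop at the first hit below the applicable threshold (hits come back sorted by score).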
Hits hits = null; | |||||
String queryString = null; | |||||
Query query = null; | |||||
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/entity_fragment_index"); | |||||
ArrayList<EntityNameAndScore> result = new ArrayList<EntityNameAndScore>(); | |||||
queryString = literal; | |||||
Analyzer analyzer = new StandardAnalyzer(); | |||||
try | |||||
{ | |||||
QueryParser qp = new QueryParser("EntityName", analyzer); | |||||
query = qp.parse(queryString); | |||||
} catch (ParseException e) | |||||
{ | |||||
e.printStackTrace(); | |||||
} | |||||
if (searcher != null) | |||||
{ | |||||
hits = searcher.search(query); | |||||
//System.out.println("search for entity fragment hits.length=" + hits.length()); | |||||
if (hits.length() > 0) | |||||
{ | |||||
//System.out.println("find " + hits.length() + " result!"); | |||||
for (int i=0; i<hits.length(); i++) { | |||||
//System.out.println(i+": <"+hits.doc(i).get("EntityName") +">;" | |||||
// +hits.doc(i).get("EntityFragment") | |||||
// + "; Score: " + hits.score(i) | |||||
// + "; Score2: " + hits.score(i)*(literalLength/hits.doc(i).get("EntityName").length())); | |||||
if(i<k) { | |||||
if (hits.score(i) >= thres1) { | |||||
String en = hits.doc(i).get("EntityName"); | |||||
int id = Integer.parseInt(hits.doc(i).get("EntityId")); | |||||
result.add(new EntityNameAndScore(id, en, hits.score(i))); | |||||
} | |||||
else { | |||||
break; | |||||
} | |||||
} | |||||
else { | |||||
if (hits.score(i) >= thres2) { | |||||
String en = hits.doc(i).get("EntityName"); | |||||
int id = Integer.parseInt(hits.doc(i).get("EntityId")); | |||||
result.add(new EntityNameAndScore(id, en, hits.score(i))); | |||||
} | |||||
else { | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
//Collections.sort(result); | |||||
return result; | |||||
} | |||||
} |
@@ -0,0 +1,176 @@ | |||||
package lcn; | |||||
import java.util.ArrayList; | |||||
import org.apache.lucene.analysis.Analyzer; | |||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||||
import org.apache.lucene.queryParser.ParseException; | |||||
import org.apache.lucene.queryParser.QueryParser; | |||||
import org.apache.lucene.search.Hits; | |||||
import org.apache.lucene.search.IndexSearcher; | |||||
import org.apache.lucene.search.Query; | |||||
import fgmt.TypeFragment; | |||||
import qa.Globals; | |||||
import rdf.TypeMapping; | |||||
public class SearchInTypeShortName { | |||||
// get id and score -- husen | |||||
public ArrayList<TypeMapping> searchTypeScore(String s, double thres1, double thres2, int k) throws Exception | |||||
{ | |||||
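// Top-k hits must score >= thres1; hits beyond the top k must score >= thres2
// (callers typically pass a stricter thres2, e.g. lcn.Main uses searchType(literal, 0.4, 0.8, 10)).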
Hits hits = null; | |||||
String queryString = s; | |||||
Query query = null; | |||||
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index"); | |||||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||||
Analyzer analyzer = new StandardAnalyzer(); | |||||
try { | |||||
QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer); | |||||
query = qp.parse(queryString); | |||||
} catch (ParseException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
if (searcher != null) { | |||||
hits = searcher.search(query); | |||||
//System.out.println("find " + hits.length() + " matched type."); | |||||
if (hits.length() > 0) { | |||||
for (int i=0; i<hits.length(); i++) { | |||||
if (i < k) { | |||||
//System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||||
if(hits.score(i) >= thres1) | |||||
{ | |||||
//System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||||
String type = hits.doc(i).get("TypeShortName"); | |||||
System.out.println("Matched type: " + type + " : " + hits.score(i)); | |||||
ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(type); | |||||
if(ret_in!=null) | |||||
{ | |||||
for(Integer tid: ret_in) | |||||
{ | |||||
TypeMapping typeMapping = new TypeMapping(tid, hits.doc(i).get("TypeShortName"), hits.score(i)); | |||||
tmList.add(typeMapping); | |||||
} | |||||
} | |||||
} | |||||
else { | |||||
break; | |||||
} | |||||
} | |||||
else { | |||||
if(hits.score(i) >= thres2) | |||||
{ | |||||
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||||
// look up the ids of the matched type short name
ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(hits.doc(i).get("TypeShortName"));
if(ret_in!=null) | |||||
{ | |||||
for(Integer tid: ret_in) | |||||
{ | |||||
TypeMapping typeMapping = new TypeMapping(tid, hits.doc(i).get("TypeShortName"), hits.score(i)); | |||||
tmList.add(typeMapping); | |||||
} | |||||
} | |||||
} | |||||
else { | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
return tmList; | |||||
} | |||||
public ArrayList<String> searchType(String s, double thres1, double thres2, int k) throws Exception | |||||
{ | |||||
Hits hits = null; | |||||
String queryString = null; | |||||
Query query = null; | |||||
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index"); | |||||
ArrayList<String> typeNames = new ArrayList<String>(); | |||||
//String[] array = s.split(" "); | |||||
//queryString = array[array.length-1]; | |||||
queryString = s; | |||||
Analyzer analyzer = new StandardAnalyzer(); | |||||
try { | |||||
QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer); | |||||
query = qp.parse(queryString); | |||||
} catch (ParseException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
if (searcher != null) { | |||||
hits = searcher.search(query); | |||||
System.out.println("find " + hits.length() + " answars!"); | |||||
if (hits.length() > 0) { | |||||
for (int i=0; i<hits.length(); i++) { | |||||
if (i < k) { | |||||
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||||
if(hits.score(i) >= thres1){ | |||||
System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||||
typeNames.add(hits.doc(i).get("TypeShortName")); | |||||
//if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName")); | |||||
} | |||||
else { | |||||
//break; | |||||
} | |||||
} | |||||
else { | |||||
if(hits.score(i) >= thres2){ | |||||
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||||
typeNames.add(hits.doc(i).get("TypeShortName")); | |||||
//if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName")); | |||||
} | |||||
else { | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
return typeNames; | |||||
} | |||||
private boolean satisfiedStrictly (String splittedTypeShortName, String queryString) | |||||
{ | |||||
String[] tnames = splittedTypeShortName.toLowerCase().split(" "); | |||||
String[] qnames = queryString.toLowerCase().split(" "); | |||||
for (int i = 0; i < tnames.length; i ++) { | |||||
if (tnames[i].length() == 0) continue; | |||||
boolean matched = false; | |||||
for (int j = 0; j < qnames.length; j ++) { | |||||
if (tnames[i].equals(qnames[j])) { | |||||
matched = true; | |||||
break; | |||||
} | |||||
} | |||||
if (!matched && !Globals.stopWordsList.isStopWord(tnames[i])) { | |||||
return false; | |||||
} | |||||
} | |||||
String qlast = qnames[qnames.length-1]; | |||||
boolean flag = false; | |||||
for (int i = 0; i < tnames.length; i ++) { | |||||
if (tnames[i].length() == 0) continue; | |||||
if (tnames[i].equals(qlast)) { | |||||
flag = true; | |||||
break; | |||||
} | |||||
} | |||||
if (flag) return true; | |||||
else return false; | |||||
} | |||||
} |
@@ -0,0 +1,116 @@ | |||||
package log; | |||||
//import java.io.File; | |||||
//import java.io.FileNotFoundException; | |||||
//import java.io.FileOutputStream; | |||||
//import java.io.OutputStreamWriter; | |||||
//import java.io.UnsupportedEncodingException; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import javax.servlet.http.HttpServletRequest; | |||||
//import qa.Globals; | |||||
import qa.Matches; | |||||
import qa.Query; | |||||
import rdf.EntityMapping; | |||||
import rdf.SemanticRelation; | |||||
import rdf.Sparql; | |||||
import rdf.MergedWord; | |||||
import rdf.SemanticUnit; | |||||
import qa.Answer; | |||||
import nlp.ds.Sentence; | |||||
import nlp.ds.Word; | |||||
public class QueryLogger { | |||||
public Sentence s = null; | |||||
public String ipAdress = null; | |||||
public Word target = null; | |||||
public Sparql sparql = null; | |||||
public Matches match = null; | |||||
public ArrayList<Answer> answers = null; | |||||
public boolean MODE_debug = false; | |||||
public boolean MODE_log = true; | |||||
public boolean MODE_fragment = true; | |||||
public boolean isMaltParserUsed = true; // Notice: we use MaltParser as the default parser, unlike the older version. TODO: some coreference rules need to be changed to fit MaltParser.
public HashMap<String, Integer> timeTable = null; | |||||
public ArrayList<MergedWord> mWordList = null; | |||||
public ArrayList<SemanticUnit> semanticUnitList = null; | |||||
public HashMap<Integer, SemanticRelation> semanticRelations = null; | |||||
public HashMap<Integer, SemanticRelation> potentialSemanticRelations = null; | |||||
public HashMap<Word, ArrayList<EntityMapping>> entityDictionary = null; | |||||
public ArrayList<Sparql> rankedSparqls = null; | |||||
public String NRlog = ""; | |||||
public String SQGlog = ""; | |||||
public int gStoreCallTimes = 0; | |||||
public QueryLogger (Query query) | |||||
{ | |||||
timeTable = new HashMap<String, Integer>(); | |||||
rankedSparqls = new ArrayList<Sparql>(); | |||||
mWordList = query.mWordList; | |||||
} | |||||
public void reloadSentence(Sentence sentence) | |||||
{ | |||||
this.s = sentence; | |||||
if(this.semanticUnitList != null) | |||||
this.semanticUnitList.clear(); | |||||
if(this.semanticRelations != null) | |||||
this.semanticRelations.clear(); | |||||
if(this.rankedSparqls != null) | |||||
this.rankedSparqls.clear(); | |||||
} | |||||
// Source code: http://edu.21cn.com/java/g_189_755584-1.htm | |||||
public static String getIpAddr(HttpServletRequest request) { | |||||
String ip = request.getHeader("x-forwarded-for"); | |||||
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { | |||||
ip = request.getHeader("Proxy-Client-IP"); | |||||
} | |||||
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { | |||||
ip = request.getHeader("WL-Proxy-Client-IP"); | |||||
} | |||||
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { | |||||
ip = request.getRemoteAddr(); | |||||
} | |||||
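// "x-forwarded-for" may carry a comma-separated proxy chain ("client, proxy1, proxy2"); keep the first (client) address.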
int idx; | |||||
if((idx = ip.indexOf(',')) != -1) { | |||||
ip = ip.substring(0, idx); | |||||
} | |||||
return ip; | |||||
} | |||||
public void reviseAnswers() | |||||
{ | |||||
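// Deduplicate the raw matches and drop any answer whose value already appears verbatim in the
// SPARQL string (such a value is a query constant, not a real answer), then sort the rest.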
System.out.println("Revise Answers:"); | |||||
answers = new ArrayList<Answer>(); | |||||
if (match == null || sparql == null || match.answers == null || sparql.questionFocus == null) | |||||
return; | |||||
HashSet<Answer> answerSet = new HashSet<Answer>(); | |||||
String questionFocus = sparql.questionFocus; | |||||
String sparqlString = sparql.toStringForGStore(); | |||||
//System.out.println("mal="+match.answers.length); | |||||
for (int i=0;i<match.answers.length;i++) | |||||
{ | |||||
Answer ans = new Answer(questionFocus, match.answers[i]); | |||||
if (!sparqlString.contains(ans.questionFocusValue)) | |||||
answerSet.add(ans); | |||||
} | |||||
for (Answer ans : answerSet) | |||||
answers.add(ans); | |||||
Collections.sort(answers); | |||||
} | |||||
} |
@@ -0,0 +1,402 @@ | |||||
package nlp.ds; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashMap; | |||||
import java.util.List; | |||||
import java.util.Stack; | |||||
import nlp.tool.CoreNLP; | |||||
import nlp.tool.MaltParser; | |||||
import nlp.tool.StanfordParser; | |||||
import org.maltparser.core.exception.MaltChainedException; | |||||
import org.maltparser.core.syntaxgraph.DependencyStructure; | |||||
import org.maltparser.core.syntaxgraph.node.DependencyNode; | |||||
import edu.stanford.nlp.ling.IndexedWord; | |||||
import edu.stanford.nlp.trees.GrammaticalStructure; | |||||
import edu.stanford.nlp.trees.TypedDependency; | |||||
import edu.stanford.nlp.trees.semgraph.SemanticGraph; | |||||
public class DependencyTree { | |||||
public DependencyTreeNode root = null; | |||||
public ArrayList<DependencyTreeNode> nodesList = null; | |||||
public SemanticGraph dependencies = null; // Method 1: CoreNLP (discarded) | |||||
public GrammaticalStructure gs = null; // Method 2: Stanford Parser | |||||
public DependencyStructure maltGraph = null; // Method 3: MaltParser | |||||
public HashMap<String, ArrayList<DependencyTreeNode>> wordBaseFormIndex = null; | |||||
public DependencyTree (Sentence sentence, CoreNLP coreNLPparser) { | |||||
SemanticGraph dependencies = coreNLPparser.getBasicDependencies(sentence.plainText); | |||||
this.dependencies = dependencies; | |||||
Stack<IndexedWord> stack = new Stack<IndexedWord>(); | |||||
IndexedWord iwRoot = dependencies.getFirstRoot(); | |||||
HashMap<IndexedWord, DependencyTreeNode> map = new HashMap<IndexedWord, DependencyTreeNode>(); | |||||
nodesList = new ArrayList<DependencyTreeNode>(); | |||||
stack.push(iwRoot); | |||||
root = this.setRoot(sentence.getWordByIndex(iwRoot.index())); | |||||
map.put(iwRoot, root); | |||||
while (!stack.empty()) | |||||
{ | |||||
IndexedWord curIWNode = stack.pop(); | |||||
DependencyTreeNode curDTNode = map.get(curIWNode); | |||||
for (IndexedWord iwChild : dependencies.getChildList(curIWNode)) { | |||||
Word w = sentence.getWordByIndex(iwChild.index()); | |||||
DependencyTreeNode newDTNode = this.insert( | |||||
curDTNode, | |||||
w, | |||||
dependencies.reln(curIWNode, iwChild).getShortName()); | |||||
map.put(iwChild, newDTNode); | |||||
stack.push(iwChild); | |||||
} | |||||
curDTNode.sortChildrenList(); | |||||
nodesList.add(curDTNode); | |||||
} | |||||
} | |||||
public DependencyTree (Sentence sentence, StanfordParser stanfordParser) { | |||||
this.gs = stanfordParser.getGrammaticalStructure(sentence.plainText); | |||||
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>(); | |||||
nodesList = new ArrayList<DependencyTreeNode>(); | |||||
List<TypedDependency> tdl = gs.typedDependencies(false); | |||||
// 1. generate all nodes. | |||||
for (TypedDependency td : tdl) { | |||||
// gov | |||||
if (!map.containsKey(td.gov().index()) && !td.reln().getShortName().equals("root")) { | |||||
Word w = sentence.getWordByIndex(td.gov().index()); | |||||
DependencyTreeNode newNode = new DependencyTreeNode(w); | |||||
map.put(td.gov().index(), newNode); | |||||
nodesList.add(newNode); | |||||
} | |||||
// dep | |||||
if (!map.containsKey(td.dep().index())) { | |||||
Word w = sentence.getWordByIndex(td.dep().index()); | |||||
DependencyTreeNode newNode = new DependencyTreeNode(w); | |||||
map.put(td.dep().index(), newNode); | |||||
nodesList.add(newNode); | |||||
} | |||||
} | |||||
// 2. add edges. | |||||
for (TypedDependency td : tdl) { | |||||
if (td.reln().getShortName().equals("root")) { | |||||
this.root = map.get(td.dep().index()); | |||||
this.root.levelInTree = 0; | |||||
this.root.dep_father2child = "root"; | |||||
} | |||||
else { | |||||
DependencyTreeNode gov = map.get(td.gov().index()); | |||||
DependencyTreeNode dep = map.get(td.dep().index()); | |||||
dep.father = gov; | |||||
gov.childrenList.add(dep); | |||||
dep.dep_father2child = td.reln().getShortName(); | |||||
} | |||||
} | |||||
// add levelInTree, sort childrenList & nodesList | |||||
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>(); | |||||
stack.push(this.root); | |||||
while (!stack.empty()) { | |||||
DependencyTreeNode dtn = stack.pop(); | |||||
if (dtn.father != null) { | |||||
dtn.levelInTree = dtn.father.levelInTree + 1; | |||||
dtn.sortChildrenList(); | |||||
} | |||||
for (DependencyTreeNode chd : dtn.childrenList) { | |||||
stack.push(chd); | |||||
} | |||||
} | |||||
Collections.sort(nodesList, new DependencyTreeNodeComparator()); | |||||
for (DependencyTreeNode dtn : nodesList) { | |||||
dtn.linkNN(this); | |||||
} | |||||
} | |||||
public DependencyTree (Sentence sentence, MaltParser maltParser)throws MaltChainedException { | |||||
try { | |||||
// the tokens are parsed in the following line | |||||
DependencyStructure graph = maltParser.getDependencyStructure(sentence); | |||||
this.maltGraph = graph; | |||||
//System.out.println(graph); | |||||
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>(); | |||||
ArrayList<DependencyTreeNode> list = new ArrayList<DependencyTreeNode>(); | |||||
Stack<DependencyNode> stack = new Stack<DependencyNode>(); | |||||
DependencyNode nroot = graph.getDependencyRoot(); | |||||
stack.add(nroot); | |||||
// 1. generate all nodes. | |||||
while (!stack.isEmpty()) { | |||||
DependencyNode n = stack.pop(); | |||||
DependencyNode sib = n.getRightmostDependent(); | |||||
int key = n.getIndex(); | |||||
//System.out.println("[current node][key="+key+"] "+n+" <"+n.getHeadEdge()+">"); | |||||
boolean flag = true; | |||||
while (sib != null) { | |||||
flag = false; | |||||
stack.push(sib); | |||||
sib = sib.getLeftSibling(); | |||||
} | |||||
if (flag) { | |||||
sib = n.getLeftmostDependent(); | |||||
while (sib != null) { | |||||
stack.push(sib); | |||||
sib = sib.getRightSibling(); | |||||
} | |||||
} | |||||
if (n.hasHead() && !map.containsKey(key)) { | |||||
//String snode = n.toString(); | |||||
String sedge = n.getHeadEdge().toString(); | |||||
//System.out.println("[" + snode + "] <" + sedge + ">"); | |||||
/*int position = 0; | |||||
String wordOriginal = null; | |||||
String wordBase; | |||||
String postag = null;*/ | |||||
String dep = null; | |||||
int idx1, idx2; | |||||
/*// position | |||||
idx1 = snode.indexOf("ID:")+3; | |||||
idx2 = snode.indexOf(' ', idx1); | |||||
position = Integer.parseInt(snode.substring(idx1, idx2)); | |||||
// word | |||||
idx1 = snode.indexOf("FORM:", idx2)+5; | |||||
idx2 = snode.indexOf(' ', idx1); | |||||
wordOriginal = snode.substring(idx1, idx2); | |||||
wordBase = Globals.coreNLP.getBaseFormOfPattern(wordOriginal.toLowerCase()); | |||||
// postag | |||||
idx1 = snode.indexOf("POSTAG:", idx2)+7; | |||||
idx2 = snode.indexOf(' ', idx1); | |||||
postag = snode.substring(idx1, idx2);*/ | |||||
// dep | |||||
idx1 = sedge.lastIndexOf(':')+1; | |||||
idx2 = sedge.lastIndexOf(' '); | |||||
dep = sedge.substring(idx1, idx2); | |||||
if (dep.equals("null")) { | |||||
dep = null; | |||||
} | |||||
else if (dep.equals("punct")) { // ignore punctuation edges
continue; | |||||
} | |||||
DependencyTreeNode newNode = new DependencyTreeNode(sentence.getWordByIndex(key)); | |||||
newNode.dep_father2child = dep; | |||||
map.put(key, newNode); | |||||
list.add(newNode); | |||||
} | |||||
} | |||||
// 2. add edges | |||||
for (Integer k : map.keySet()) { | |||||
DependencyNode n = graph.getDependencyNode(k); | |||||
DependencyTreeNode dtn = map.get(k); | |||||
if (dtn.dep_father2child == null) { | |||||
this.setRoot(dtn); | |||||
this.root.levelInTree = 0; | |||||
this.root.dep_father2child = "root"; | |||||
} | |||||
else { | |||||
DependencyTreeNode father = map.get(n.getHead().getIndex()); | |||||
DependencyTreeNode child = map.get(n.getIndex()); | |||||
child.father = father; | |||||
father.childrenList.add(child); | |||||
} | |||||
} | |||||
// Ad-hoc fix for a known mis-parse: if "star" got attached under "be" while the sentence mentions a film/movie, reattach "star" under the film word.
if(list.size() > 11) | |||||
{ | |||||
DependencyTreeNode dt1 = list.get(11), dt2 = list.get(5); | |||||
if(dt1!=null && dt2!=null && dt1.word.baseForm.equals("star") && dt1.father.word.baseForm.equals("be")) | |||||
{ | |||||
if (dt2.word.baseForm.equals("film") || dt2.word.baseForm.equals("movie")) | |||||
{ | |||||
dt1.father.childrenList.remove(dt1); | |||||
dt1.father = dt2; | |||||
dt2.childrenList.add(dt1); | |||||
} | |||||
} | |||||
} | |||||
// add levelInTree, sort childrenList & nodesList | |||||
for (DependencyTreeNode dtn : list) { | |||||
if (dtn.father != null) { | |||||
dtn.levelInTree = dtn.father.levelInTree + 1; | |||||
dtn.sortChildrenList(); | |||||
} | |||||
} | |||||
nodesList = list; | |||||
Collections.sort(nodesList, new DependencyTreeNodeComparator()); | |||||
for (DependencyTreeNode dtn : nodesList) { | |||||
dtn.linkNN(this); | |||||
} | |||||
} catch (MaltChainedException e) { | |||||
//e.printStackTrace(); | |||||
//System.err.println("MaltParser exception: " + e.getMessage()); | |||||
throw e; | |||||
} | |||||
} | |||||
public DependencyTreeNode setRoot(Word w) { | |||||
root = new DependencyTreeNode(w, "root", null); | |||||
return root; | |||||
} | |||||
public DependencyTreeNode setRoot(DependencyTreeNode root) { | |||||
this.root = root; | |||||
return this.root; | |||||
} | |||||
public void buildWordBaseFormIndex () { | |||||
wordBaseFormIndex = new HashMap<String, ArrayList<DependencyTreeNode>>(); | |||||
for (DependencyTreeNode dtn: nodesList) { | |||||
String w = dtn.word.baseForm; | |||||
if (!wordBaseFormIndex.keySet().contains(w)) | |||||
wordBaseFormIndex.put(w, new ArrayList<DependencyTreeNode>()); | |||||
wordBaseFormIndex.get(w).add(dtn); | |||||
} | |||||
} | |||||
public DependencyTreeNode insert(DependencyTreeNode father, Word w, String dep_father2child) { | |||||
if (father == null || w == null) | |||||
return null; | |||||
DependencyTreeNode newNode = new DependencyTreeNode(w, dep_father2child, father); | |||||
father.childrenList.add(newNode); | |||||
return newNode; | |||||
} | |||||
public DependencyTreeNode getRoot() { | |||||
return root; | |||||
} | |||||
public ArrayList<DependencyTreeNode> getNodesList(){ | |||||
return nodesList; | |||||
} | |||||
public ArrayList<DependencyTreeNode> getShortestNodePathBetween(DependencyTreeNode n1, DependencyTreeNode n2) | |||||
{ | |||||
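// Walk both root paths top-down until they diverge, then concatenate n1's path up to the
// divergence with n2's path back down through the lowest common ancestor.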
if(n1 == n2) { | |||||
return new ArrayList<DependencyTreeNode>(); | |||||
} | |||||
ArrayList<DependencyTreeNode> path1 = getPath2Root(n1); | |||||
ArrayList<DependencyTreeNode> path2 = getPath2Root(n2); | |||||
int idx1 = path1.size()-1; | |||||
int idx2 = path2.size()-1; | |||||
DependencyTreeNode curNode1 = path1.get(idx1); | |||||
DependencyTreeNode curNode2 = path2.get(idx2); | |||||
while (curNode1 == curNode2) { | |||||
idx1 --; | |||||
idx2 --; | |||||
if(idx1 < 0 || idx2 < 0) break; | |||||
curNode1 = path1.get(idx1); | |||||
curNode2 = path2.get(idx2); | |||||
} | |||||
ArrayList<DependencyTreeNode> shortestPath = new ArrayList<DependencyTreeNode>(); | |||||
for (int i = 0; i <= idx1; i ++) { | |||||
shortestPath.add(path1.get(i)); | |||||
} | |||||
for (int i = idx2+1; i >= 0; i --) { | |||||
shortestPath.add(path2.get(i)); | |||||
} | |||||
System.out.println("Shortest Path between <" + n1 + "> and <" + n2 + ">:"); | |||||
System.out.print("\t-"); | |||||
for (DependencyTreeNode dtn : shortestPath) { | |||||
System.out.print("<" + dtn + ">-"); | |||||
} | |||||
System.out.println(); | |||||
return shortestPath; | |||||
} | |||||
public ArrayList<DependencyTreeNode> getPath2Root(DependencyTreeNode n1) { | |||||
ArrayList<DependencyTreeNode> path = new ArrayList<DependencyTreeNode>(); | |||||
DependencyTreeNode curNode = n1; | |||||
path.add(curNode); | |||||
while (curNode.father != null) { | |||||
curNode = curNode.father; | |||||
path.add(curNode); | |||||
} | |||||
return path; | |||||
} | |||||
public ArrayList<DependencyTreeNode> getTreeNodesListContainsWords(String words) { | |||||
ArrayList<DependencyTreeNode> ret = new ArrayList<DependencyTreeNode>(); | |||||
for (DependencyTreeNode dtn : nodesList) { | |||||
if (dtn.word.originalForm.equalsIgnoreCase(words) | |||||
|| dtn.word.baseForm.equalsIgnoreCase(words) | |||||
|| words.contains(dtn.word.originalForm) | |||||
|| words.contains(dtn.word.baseForm)) | |||||
ret.add(dtn); | |||||
} | |||||
return ret; | |||||
} | |||||
public DependencyTreeNode getNodeByIndex (int posi) { | |||||
for (DependencyTreeNode dt : nodesList) { | |||||
if (dt.word.position == posi) { | |||||
return dt; | |||||
} | |||||
} | |||||
return null; | |||||
} | |||||
public DependencyTreeNode getFirstPositionNodeInList(ArrayList<DependencyTreeNode> list) { | |||||
int firstPosi = Integer.MAX_VALUE; | |||||
DependencyTreeNode firstNode = null; | |||||
for (DependencyTreeNode dtn : list) { | |||||
if (dtn.word.position < firstPosi) { | |||||
firstPosi = dtn.word.position; | |||||
firstNode = dtn; | |||||
} | |||||
} | |||||
return firstNode; | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
String ret = ""; | |||||
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>(); | |||||
stack.push(root); | |||||
while(!stack.empty()) { | |||||
DependencyTreeNode curNode = stack.pop(); | |||||
for (int i = 0; i <= curNode.levelInTree; i ++) | |||||
ret += " "; | |||||
ret += "-> "; | |||||
ret += curNode.word.baseForm; | |||||
ret += "-"; | |||||
ret += curNode.word.posTag; | |||||
ret += " ("; | |||||
ret += curNode.dep_father2child; | |||||
ret += ")"; | |||||
ret += "[" + curNode.word.position + "]\n"; | |||||
for (DependencyTreeNode child : curNode.childrenList) { | |||||
stack.push(child); | |||||
} | |||||
} | |||||
return ret; | |||||
} | |||||
} |
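/* A minimal usage sketch (illustrative; mirrors nlp.tool.Main, the question text is an assumption):
*   Sentence s = new Sentence("Who directed Titanic?");
*   DependencyTree dt = new DependencyTree(s, Globals.maltParser);
*   System.out.println(dt); // prints the indented tree with POS tags and dependency labels
*/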
@@ -0,0 +1,150 @@ | |||||
package nlp.ds; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.Comparator; | |||||
import java.util.Stack; | |||||
public class DependencyTreeNode { | |||||
public Word word = null; | |||||
public String dep_father2child = null; | |||||
public DependencyTreeNode father = null; | |||||
public ArrayList<DependencyTreeNode> childrenList = null; | |||||
public int levelInTree = -1; | |||||
/**
* Constructor used when the father node is known.
*
* @param w
* @param dep_father2child
* @param father
*/
public DependencyTreeNode(Word w, String dep_father2child, DependencyTreeNode father) | |||||
{ | |||||
word = w; | |||||
this.dep_father2child = dep_father2child; | |||||
this.father = father; | |||||
this.childrenList = new ArrayList<DependencyTreeNode>(); | |||||
if(father==null) levelInTree = 0; | |||||
else levelInTree = father.levelInTree+1; | |||||
} | |||||
/**
* Constructor used when the father node is not yet known.
*
* @param w
*/
public DependencyTreeNode(Word w) | |||||
{ | |||||
this.word = w; | |||||
this.childrenList = new ArrayList<DependencyTreeNode>(); | |||||
} | |||||
public void sortChildrenList () { | |||||
childrenList.trimToSize(); | |||||
Collections.sort(childrenList, new DependencyTreeNodeComparator()); | |||||
} | |||||
@Override | |||||
public String toString(){ | |||||
return word.originalForm + "-" + word.posTag + "(" + dep_father2child + ")[" + word.position + "]"; | |||||
} | |||||
public static void sortArrayList(ArrayList<DependencyTreeNode> list) { | |||||
Collections.sort(list, new DependencyTreeNodeComparator()); | |||||
} | |||||
public DependencyTreeNode containDependencyWithChildren (String dep) { | |||||
for (DependencyTreeNode son : childrenList) { | |||||
if (son.dep_father2child.equals(dep)) return son; | |||||
} | |||||
return null; | |||||
} | |||||
/** | |||||
* equal_or_startWith = true: equal | |||||
* equal_or_startWith = false: startWith | |||||
* | |||||
* @param posChild | |||||
* @param equal_or_startWith | |||||
* @return | |||||
*/ | |||||
public DependencyTreeNode containPosInChildren (String posChild, boolean equal_or_startWith) { | |||||
for (DependencyTreeNode son : childrenList) { | |||||
if (equal_or_startWith) { | |||||
if (son.word.posTag.equals(posChild)) return son; | |||||
} | |||||
else { | |||||
if (son.word.posTag.startsWith(posChild)) return son; | |||||
} | |||||
} | |||||
return null; | |||||
} | |||||
public DependencyTreeNode containWordBaseFormInChildren (String wordBaseFormChild) { | |||||
for (DependencyTreeNode son : childrenList) { | |||||
if (son.word.baseForm.equals(wordBaseFormChild)) return son; | |||||
} | |||||
return null; | |||||
} | |||||
public DependencyTreeNode getNNTopTreeNode (DependencyTree T) { | |||||
if(this.father != null && (this.dep_father2child.equals("nn") || (this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep")))) { | |||||
return this.father.getNNTopTreeNode(T); | |||||
} | |||||
else return this; | |||||
} | |||||
public Word linkNN(DependencyTree T) { | |||||
// (Now useless) backtracking the NN connections. | |||||
ArrayList<DependencyTreeNode> nn = new ArrayList<DependencyTreeNode>(); | |||||
nn.add(this); | |||||
if(this.father != null && (this.dep_father2child.equals("nn") | |||||
|| (this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep") && this.father.word.posTag.startsWith("NN")))) { | |||||
nn.add(this.father); | |||||
for(DependencyTreeNode son : this.father.childrenList) { | |||||
if (son != this && son.dep_father2child.equals("nn")) { | |||||
nn.add(son); | |||||
} | |||||
} | |||||
} | |||||
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>(); | |||||
stack.push(this); | |||||
while (!stack.empty()) { | |||||
DependencyTreeNode curNode = stack.pop(); | |||||
for(DependencyTreeNode son : curNode.childrenList) { | |||||
if (son.dep_father2child.equals("nn") | |||||
|| (son.word.posTag.startsWith("NN") && son.dep_father2child.equals("dep") && son.father.word.posTag.startsWith("NN"))) { | |||||
nn.add(son); | |||||
stack.push(son); | |||||
} | |||||
} | |||||
} | |||||
DependencyTreeNode.sortArrayList(nn); | |||||
int size = nn.size() - 1; | |||||
for (int i = 0; i < size; i ++) { | |||||
nn.get(i).word.nnNext = nn.get(i+1).word; | |||||
nn.get(i+1).word.nnPrev = nn.get(i).word; | |||||
} | |||||
return this.word.getNnHead(); | |||||
} | |||||
}
class DependencyTreeNodeComparator implements Comparator<DependencyTreeNode> { | |||||
public int compare(DependencyTreeNode n1, DependencyTreeNode n2) { | |||||
return n1.word.position - n2.word.position; | |||||
} | |||||
} |
@@ -0,0 +1,88 @@ | |||||
package nlp.ds; | |||||
import java.util.ArrayList; | |||||
import java.util.HashMap; | |||||
import qa.Globals; | |||||
import qa.Query; | |||||
import rdf.MergedWord; | |||||
public class Sentence { | |||||
public String plainText = null; | |||||
public Word[] words = null; | |||||
public HashMap<String, Word> map = null; | |||||
public DependencyTree dependencyTreeStanford = null; | |||||
public DependencyTree dependencyTreeMalt = null; | |||||
public enum SentenceType {SpecialQuestion,GeneralQuestion,ImperativeSentence} | |||||
public SentenceType sentenceType = SentenceType.SpecialQuestion; | |||||
public Sentence (String s) | |||||
{ | |||||
plainText = s; | |||||
words = Globals.coreNLP.getTaggedWords(plainText); | |||||
map = new HashMap<String, Word>(); | |||||
for (Word w : words) | |||||
map.put(w.key, w); | |||||
} | |||||
public Sentence (Query query, String s) | |||||
{ | |||||
plainText = s; | |||||
words = Globals.coreNLP.getTaggedWords(plainText); | |||||
// inherit NodeRecognition's information | |||||
for(Word word: words) | |||||
{ | |||||
for(MergedWord mWord: query.mWordList) | |||||
{ | |||||
if(word.originalForm.equals(mWord.name)) | |||||
{ | |||||
word.mayLiteral = mWord.mayLiteral; | |||||
word.mayEnt = mWord.mayEnt; | |||||
word.mayType = mWord.mayType; | |||||
word.mayCategory = mWord.mayCategory; | |||||
word.tmList = mWord.tmList; | |||||
word.emList = mWord.emList; | |||||
word.category = mWord.category; | |||||
} | |||||
} | |||||
} | |||||
map = new HashMap<String, Word>(); | |||||
for (Word w : words) | |||||
map.put(w.key, w); | |||||
} | |||||
public ArrayList<Word> getWordsByString (String w) { | |||||
ArrayList<Word> ret = new ArrayList<Word>(); | |||||
for (Word wo: words) { | |||||
if (wo.originalForm.equals(w)) ret.add(wo); | |||||
} | |||||
return ret; | |||||
} | |||||
public Word getWordByIndex (int idx) { | |||||
return words[idx-1]; | |||||
} | |||||
public Word getWordByKey (String k) { | |||||
return map.get(k); | |||||
} | |||||
public boolean hasModifier(Word w) | |||||
{ | |||||
for(Word word: words) | |||||
if(word!=w && word.modifiedWord==w) | |||||
return true; | |||||
return false; | |||||
} | |||||
public void printNERResult () { | |||||
for (Word word : words) { | |||||
System.out.print(word + " "); | |||||
System.out.println("ner=" + word.ner); | |||||
} | |||||
} | |||||
} | |||||
@@ -0,0 +1,126 @@ | |||||
package nlp.ds; | |||||
import java.util.ArrayList; | |||||
import rdf.EntityMapping; | |||||
import rdf.Triple; | |||||
import rdf.TypeMapping; | |||||
public class Word implements Comparable<Word> | |||||
{ | |||||
public boolean mayCategory = false; | |||||
public boolean mayLiteral = false; | |||||
public boolean mayEnt = false; | |||||
public boolean mayType = false; | |||||
public boolean mayExtendVariable = false; | |||||
public String category = null; | |||||
public ArrayList<EntityMapping> emList = null; | |||||
public ArrayList<TypeMapping> tmList = null; | |||||
public Triple embbededTriple = null; | |||||
public String baseForm = null; | |||||
public String originalForm = null; | |||||
public String posTag = null; | |||||
public int position = -1; // Notice the first word's position = 1 | |||||
public String key = null; | |||||
public boolean isCovered = false; | |||||
public boolean isIgnored = false; | |||||
// Notice: the variables below are unused now, because a phrase that forms a node is merged into a single word.
public String ner = null; // record NER result | |||||
public Word nnNext = null; | |||||
public Word nnPrev = null; | |||||
public Word crr = null; // coreference resolution result | |||||
public Word represent = null; // This word is represented by another one, e.g., "which" in "which book is ..."
public boolean omitNode = false; // This word cannot be a node
public Word modifiedWord = null; // The word this word modifies (a word that modifies nothing modifies itself)
public Word (String base, String original, String pos, int posi) { | |||||
baseForm = base; | |||||
originalForm = original; | |||||
posTag = pos; | |||||
position = posi; | |||||
key = new String(originalForm+"["+position+"]"); | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
return key; | |||||
} | |||||
public int compareTo(Word another) { | |||||
return this.position-another.position; | |||||
} | |||||
@Override | |||||
public int hashCode() { | |||||
return key.hashCode(); | |||||
} | |||||
@Override | |||||
public boolean equals(Object o) { | |||||
return (o instanceof Word) | |||||
&& originalForm.equals(((Word)o).originalForm) | |||||
&& position == ((Word)o).position; | |||||
} | |||||
// We now discard all NN information and return the word itself. | husen 2016 | |||||
public Word getNnHead() { | |||||
Word w = this; | |||||
return w; | |||||
// if(w.mayEnt || w.mayType) | |||||
// return w; | |||||
// | |||||
// while (w.nnPrev != null) { | |||||
// w = w.nnPrev; | |||||
// } | |||||
// return w; | |||||
} | |||||
public String getFullEntityName() { | |||||
Word w = this.getNnHead(); | |||||
return w.originalForm; | |||||
// if(w.mayEnt || w.mayType) | |||||
// return w.originalForm; | |||||
// | |||||
// StringBuilder sb = new StringBuilder(""); | |||||
// while (w != null) { | |||||
// sb.append(w.originalForm); | |||||
// sb.append(' '); | |||||
// w = w.nnNext; | |||||
// } | |||||
// sb.deleteCharAt(sb.length()-1); | |||||
// return sb.toString(); | |||||
} | |||||
public String getBaseFormEntityName() { | |||||
Word w = this.getNnHead(); | |||||
if(w.mayEnt || w.mayType) | |||||
return w.baseForm; | |||||
StringBuilder sb = new StringBuilder(""); | |||||
while (w != null) { | |||||
sb.append(w.baseForm); | |||||
sb.append(' '); | |||||
w = w.nnNext; | |||||
} | |||||
sb.deleteCharAt(sb.length()-1); | |||||
return sb.toString(); | |||||
} | |||||
public String isNER () { | |||||
return this.getNnHead().ner; | |||||
} | |||||
public void setIsCovered () { | |||||
Word w = this.getNnHead(); | |||||
while (w != null) { | |||||
w.isCovered = true; | |||||
w = w.nnNext; | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,202 @@ | |||||
package nlp.tool; | |||||
import java.util.List; | |||||
import java.util.Properties; | |||||
import nlp.ds.Word; | |||||
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; | |||||
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; | |||||
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; | |||||
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; | |||||
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; | |||||
import edu.stanford.nlp.ling.CoreLabel; | |||||
import edu.stanford.nlp.pipeline.Annotation; | |||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP; | |||||
import edu.stanford.nlp.trees.Tree; | |||||
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; | |||||
import edu.stanford.nlp.trees.semgraph.SemanticGraph; | |||||
import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; | |||||
import edu.stanford.nlp.util.CoreMap; | |||||
public class CoreNLP { | |||||
// CoreNLP can also recognize TIME and NUMBER (see SUTime) | |||||
private StanfordCoreNLP pipeline_lemma; | |||||
public CoreNLP () { | |||||
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution | |||||
/*Properties props_all = new Properties(); | |||||
props_all.put("annotators", "tokenize, ssplit, pos, lemma, parse"); // full list: "tokenize, ssplit, pos, lemma, ner, parse, dcoref" | |||||
pipeline_all = new StanfordCoreNLP(props_all);*/ | |||||
Properties props_lemma = new Properties(); | |||||
props_lemma.put("annotators", "tokenize, ssplit, pos, lemma"); | |||||
pipeline_lemma = new StanfordCoreNLP(props_lemma); | |||||
} | |||||
// For more efficient usage, refer to "http://www.jarvana.com/jarvana/view/edu/stanford/nlp/stanford-corenlp/1.2.0/stanford-corenlp-1.2.0-javadoc.jar!/edu/stanford/nlp/process/Morphology.html" | |||||
public String getBaseFormOfPattern (String text) { | |||||
String ret = new String(""); | |||||
// create an empty Annotation just with the given text | |||||
Annotation document = new Annotation(text); | |||||
// run all Annotators on this text | |||||
pipeline_lemma.annotate(document); | |||||
// these are all the sentences in this document | |||||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||||
int count = 0; | |||||
for(CoreMap sentence: sentences) { | |||||
// traversing the words in the current sentence | |||||
// a CoreLabel is a CoreMap with additional token-specific methods | |||||
for (CoreLabel token: sentence.get(TokensAnnotation.class)) { | |||||
// this is the base form (lemma) of the token | |||||
String lemma = token.getString(LemmaAnnotation.class); | |||||
ret += lemma; | |||||
ret += " "; | |||||
} | |||||
count ++; | |||||
if (count % 100 == 0) { | |||||
System.out.println(count); | |||||
} | |||||
} | |||||
return ret.substring(0, ret.length()-1); | |||||
} | |||||
public SemanticGraph getBasicDependencies (String s) { | |||||
// create an empty Annotation just with the given text | |||||
Annotation document = new Annotation(s); | |||||
// run all Annotators on this text | |||||
pipeline_lemma.annotate(document); | |||||
// these are all the sentences in this document | |||||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||||
for(CoreMap sentence: sentences) { | |||||
// this is the Stanford dependency graph of the current sentence | |||||
SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class); | |||||
return dependencies; | |||||
} | |||||
return null; | |||||
} | |||||
public Tree getParseTree (String text) { | |||||
// create an empty Annotation just with the given text | |||||
Annotation document = new Annotation(text); | |||||
// run all Annotators on this text | |||||
pipeline_lemma.annotate(document); | |||||
// these are all the sentences in this document | |||||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||||
for(CoreMap sentence: sentences) { | |||||
// this is the parse tree of the current sentence | |||||
return sentence.get(TreeAnnotation.class); | |||||
} | |||||
return null; | |||||
} | |||||
/** | |||||
* How to use: | |||||
* for (CoreLabel token : sentence.get(TokensAnnotation.class)) { | |||||
* // this is the text of the token | |||||
* String word = token.get(TextAnnotation.class); | |||||
* // this is the POS tag of the token | |||||
* String pos = token.get(PartOfSpeechAnnotation.class); | |||||
* } | |||||
* @param s | |||||
* @return | |||||
*/ | |||||
public CoreMap getPOS (String s) { | |||||
// create an empty Annotation just with the given text | |||||
Annotation document = new Annotation(s); | |||||
// run all Annotators on this text | |||||
pipeline_lemma.annotate(document); | |||||
// these are all the sentences in this document | |||||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||||
for(CoreMap sentence: sentences) { | |||||
// this is the sentence with POS Tags | |||||
return sentence; | |||||
} | |||||
return null; | |||||
} | |||||
public Word[] getTaggedWords (String sentence) { | |||||
CoreMap taggedSentence = getPOS(sentence); | |||||
Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()]; | |||||
int count = 0; | |||||
for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) { | |||||
// this is the text of the token | |||||
String word = token.get(TextAnnotation.class); | |||||
// this is the POS tag of the token | |||||
String pos = token.get(PartOfSpeechAnnotation.class); | |||||
//System.out.println(word+"["+pos+"]"); | |||||
ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1); | |||||
count ++; | |||||
} | |||||
return ret; | |||||
} | |||||
/*public void demo () { | |||||
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution | |||||
Properties props = new Properties(); | |||||
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); | |||||
StanfordCoreNLP pipeline = new StanfordCoreNLP(props); | |||||
// read some text in the text variable | |||||
String text = ... // Add your text here! | |||||
// create an empty Annotation just with the given text | |||||
Annotation document = new Annotation(text); | |||||
// run all Annotators on this text | |||||
pipeline.annotate(document); | |||||
// these are all the sentences in this document | |||||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||||
for(CoreMap sentence: sentences) { | |||||
// traversing the words in the current sentence | |||||
// a CoreLabel is a CoreMap with additional token-specific methods | |||||
for (CoreLabel token: sentence.get(TokensAnnotation.class)) { | |||||
// this is the text of the token | |||||
String word = token.get(TextAnnotation.class); | |||||
// this is the POS tag of the token | |||||
String pos = token.get(PartOfSpeechAnnotation.class); | |||||
// this is the NER label of the token | |||||
String ne = token.get(NamedEntityTagAnnotation.class); | |||||
} | |||||
// this is the parse tree of the current sentence | |||||
Tree tree = sentence.get(TreeAnnotation.class); | |||||
// this is the Stanford dependency graph of the current sentence | |||||
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); | |||||
} | |||||
// This is the coreference link graph | |||||
// Each chain stores a set of mentions that link to each other, | |||||
// along with a method for getting the most representative mention | |||||
// Both sentence and token offsets start at 1! | |||||
Map<Integer, CorefChain> graph = | |||||
document.get(CorefChainAnnotation.class); | |||||
}*/ | |||||
} |
@@ -0,0 +1,42 @@ | |||||
package nlp.tool; | |||||
import java.io.BufferedReader; | |||||
import java.io.IOException; | |||||
import java.io.InputStreamReader; | |||||
import nlp.ds.DependencyTree; | |||||
import nlp.ds.Sentence; | |||||
import qa.Globals; | |||||
public class Main { | |||||
public static void main (String[] args) { | |||||
Globals.init(); | |||||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||||
try { | |||||
while (true) { | |||||
System.out.println("Test maltparser."); | |||||
System.out.print("Please input the NL question: "); | |||||
String question = br.readLine(); | |||||
if (question.length() <= 3) | |||||
break; | |||||
try { | |||||
long t1 = System.currentTimeMillis(); | |||||
Sentence s = new Sentence(question); | |||||
DependencyTree dt = new DependencyTree(s, Globals.stanfordParser); | |||||
System.out.println("====StanfordDependencies===="); | |||||
System.out.println(dt); | |||||
DependencyTree dt2 = new DependencyTree(s, Globals.maltParser); | |||||
System.out.println("====MaltDependencies===="); | |||||
System.out.println(dt2); | |||||
long t2 = System.currentTimeMillis(); | |||||
System.out.println("time=" + (t2-t1) + "ms"); | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} catch (IOException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,70 @@ | |||||
package nlp.tool; | |||||
import nlp.ds.Sentence; | |||||
import nlp.ds.Word; | |||||
import org.maltparser.MaltParserService; | |||||
import org.maltparser.core.exception.MaltChainedException; | |||||
import org.maltparser.core.syntaxgraph.DependencyStructure; | |||||
import qa.Globals; | |||||
public class MaltParser { | |||||
private MaltParserService service = null; | |||||
public MaltParser() { | |||||
try | |||||
{ | |||||
System.out.print("Loading MaltParser ..."); | |||||
service = new MaltParserService(); | |||||
// Initialize the parser model, set the working directory, and log to 'parser.log'
//service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w . -lfi parser.log"); | |||||
service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w "+Globals.localPath+"lib/maltparser-1.9.1 -lfi parser.log"); | |||||
firstParse(); | |||||
System.out.println("ok!"); | |||||
} catch (MaltChainedException e) { | |||||
e.printStackTrace(); | |||||
System.err.println("MaltParser exception: " + e.getMessage()); | |||||
} | |||||
} | |||||
private void firstParse() { | |||||
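// Warm-up parse on a fixed sentence so the model is fully loaded before real queries arrive.
// Each token row is CoNLL-style: ID \t FORM \t LEMMA \t CPOSTAG \t POSTAG \t FEATS ("_" = unspecified).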
String[] tokens = new String[12]; | |||||
tokens[0] = "1\tIn\t_\tIN\tIN\t_"; | |||||
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; | |||||
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; | |||||
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; | |||||
tokens[4] = "5\tby\t_\tIN\tIN\t_"; | |||||
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; | |||||
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; | |||||
tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; | |||||
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; | |||||
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; | |||||
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; | |||||
tokens[11] = "12\t?\t_\t.\t.\t_"; | |||||
try { | |||||
service.parse(tokens); | |||||
} catch (MaltChainedException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
public DependencyStructure getDependencyStructure (Sentence sentence) { | |||||
try { | |||||
return service.parse(getTaggedTokens(sentence)); | |||||
} catch (MaltChainedException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
return null; | |||||
} | |||||
private String[] getTaggedTokens (Sentence sentence) { | |||||
String[] ret = new String[sentence.words.length]; | |||||
int count = 0; | |||||
for (Word w : sentence.words) { | |||||
ret[count] = w.position + "\t" + w.originalForm + "\t_\t" + w.posTag + "\t" + w.posTag + "\t_";
count ++; | |||||
} | |||||
return ret; | |||||
} | |||||
} |
@@ -0,0 +1,73 @@ | |||||
package nlp.tool; | |||||
import java.io.File; | |||||
import java.net.URL; | |||||
import nlp.ds.Sentence; | |||||
import nlp.ds.Word; | |||||
import org.maltparser.concurrent.ConcurrentMaltParserModel; | |||||
import org.maltparser.concurrent.ConcurrentMaltParserService; | |||||
import org.maltparser.concurrent.graph.ConcurrentDependencyGraph; | |||||
import org.maltparser.core.exception.MaltChainedException; | |||||
//import org.maltparser.core.syntaxgraph.DependencyStructure; | |||||
public class MaltParserCon { | |||||
private ConcurrentMaltParserModel model = null; | |||||
public ConcurrentDependencyGraph outputGraph = null; | |||||
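// Concurrent variant of MaltParser: ConcurrentMaltParserService loads the .mco
// model once, and the resulting model is safe to share across threads.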
public MaltParserCon(){ | |||||
try{ | |||||
System.out.println("Loading Maltparser...\n"); | |||||
URL modelURL = new File("output/engmalt.linear-1.7.mco").toURI().toURL();
model = ConcurrentMaltParserService.initializeParserModel(modelURL);
firstTest(); | |||||
System.out.println("ok!\n"); | |||||
}catch(Exception e){ | |||||
e.printStackTrace(); | |||||
System.err.println("MaltParser exception: " + e.getMessage()); | |||||
} | |||||
} | |||||
private void firstTest(){ | |||||
String[] tokens = new String[12]; | |||||
tokens[0] = "1\tIn\t_\tIN\tIN\t_"; | |||||
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; | |||||
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; | |||||
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; | |||||
tokens[4] = "5\tby\t_\tIN\tIN\t_"; | |||||
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; | |||||
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; | |||||
tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; | |||||
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; | |||||
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; | |||||
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; | |||||
tokens[11] = "12\t?\t_\t.\t.\t_"; | |||||
try { | |||||
outputGraph = model.parse(tokens); | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
System.out.println(outputGraph); | |||||
} | |||||
public ConcurrentDependencyGraph getDependencyStructure (Sentence sentence) { | |||||
try { | |||||
return model.parse(getTaggedTokens(sentence)); | |||||
} catch (MaltChainedException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
return null; | |||||
} | |||||
private String[] getTaggedTokens (Sentence sentence) { | |||||
String[] ret = new String[sentence.words.length]; | |||||
int count = 0; | |||||
for (Word w : sentence.words) { | |||||
ret[count] = w.position + "\t" + w.originalForm + "\t_\t" + w.posTag + "\t" + w.posTag + "\t_";
count ++; | |||||
} | |||||
return ret; | |||||
} | |||||
} |
@@ -0,0 +1,53 @@ | |||||
package nlp.tool; | |||||
import java.util.List; | |||||
import qa.Globals; | |||||
import nlp.ds.Sentence; | |||||
import nlp.ds.Word; | |||||
import edu.stanford.nlp.ie.AbstractSequenceClassifier; | |||||
import edu.stanford.nlp.ie.crf.CRFClassifier; | |||||
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation; | |||||
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation; | |||||
import edu.stanford.nlp.ling.CoreLabel; | |||||
public class NERecognizer { | |||||
static String serializedClassifier; | |||||
static AbstractSequenceClassifier<CoreLabel> classifier; | |||||
//public static String localPath="E:\\Hanshuo\\gAnswer\\"; | |||||
public NERecognizer() { | |||||
serializedClassifier = Globals.localPath+"lib/stanford-ner-2012-11-11/classifiers/english.all.3class.distsim.crf.ser.gz"; | |||||
classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier); | |||||
} | |||||
/*public NERecognizer(String basePath, boolean flag) { | |||||
serializedClassifier = "WEB-INF\\lib\\stanford-ner-2012-11-11\\stanford-ner-2012-11-11\\classifiers\\english.all.3class.distsim.crf.ser.gz"; | |||||
}*/ | |||||
public void recognize(Sentence sentence) { | |||||
List<CoreLabel> lcl = classifier.classify(sentence.plainText).get(0); | |||||
for (CoreLabel cl : lcl) { | |||||
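// PositionAnnotation is 0-based while Word.position is assumed to be 1-based,
// hence the +1 offset below.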
int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1; | |||||
Word w = sentence.getWordByIndex(position); | |||||
String ner = cl.get(AnswerAnnotation.class); | |||||
if (ner.equals("O")) w.ner = null; | |||||
else w.ner = ner; | |||||
} | |||||
} | |||||
public static void main(String[] args) { | |||||
System.out.println("Test NER"); | |||||
Globals.init(); | |||||
Sentence s = new Sentence("I go to school at Stanford University, which is located in California.");//"Which states of Germany are governed by the Social Democratic Party?" | |||||
Globals.nerRecognizer.recognize(s); | |||||
for (Word word : s.words) { | |||||
System.out.print(word + " "); | |||||
System.out.println("ner=" + word.ner); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,51 @@ | |||||
package nlp.tool; | |||||
import java.io.StringReader; | |||||
import java.util.List; | |||||
import edu.stanford.nlp.ling.CoreLabel; | |||||
import edu.stanford.nlp.objectbank.TokenizerFactory; | |||||
import edu.stanford.nlp.parser.lexparser.LexicalizedParser; | |||||
import edu.stanford.nlp.process.CoreLabelTokenFactory; | |||||
import edu.stanford.nlp.process.PTBTokenizer; | |||||
import edu.stanford.nlp.trees.GrammaticalStructure; | |||||
import edu.stanford.nlp.trees.GrammaticalStructureFactory; | |||||
import edu.stanford.nlp.trees.PennTreebankLanguagePack; | |||||
import edu.stanford.nlp.trees.Tree; | |||||
import edu.stanford.nlp.trees.TreebankLanguagePack; | |||||
public class StanfordParser { | |||||
private LexicalizedParser lp; | |||||
private TokenizerFactory<CoreLabel> tokenizerFactory; | |||||
private TreebankLanguagePack tlp; | |||||
private GrammaticalStructureFactory gsf; | |||||
public StanfordParser() { | |||||
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); | |||||
tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); | |||||
tlp = new PennTreebankLanguagePack(); | |||||
gsf = tlp.grammaticalStructureFactory(); | |||||
} | |||||
public GrammaticalStructure getGrammaticalStructure (String sentence) { | |||||
List<CoreLabel> rawWords2 = | |||||
tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize(); | |||||
// Converts a Sentence/List/String into a Tree. | |||||
// In all circumstances, the input will be treated as a single sentence to be parsed. | |||||
Tree parse = lp.apply(rawWords2); | |||||
return gsf.newGrammaticalStructure(parse); | |||||
/*List<TypedDependency> tdl = gs.typedDependencies(false); | |||||
for (TypedDependency td : tdl) { | |||||
System.out.println(td.reln().getShortName()+"("+td.gov()+","+td.dep()+")"); | |||||
System.out.println("gov="+td.gov() | |||||
+"\tgov.index=" | |||||
+td.gov().index() | |||||
+"\tgov.value=" | |||||
+td.gov().value() | |||||
+"\tgov.pos=" | |||||
+((TreeGraphNode)td.gov().parent()).value()); | |||||
}*/ | |||||
//System.out.println(tdl); | |||||
} | |||||
} |
@@ -0,0 +1,614 @@ | |||||
package nlp.tool; | |||||
import java.util.HashSet; | |||||
import java.util.Arrays; | |||||
public class StopWordsList { | |||||
public static HashSet<String> sw_list = new HashSet<String>(); | |||||
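// Note: the set is static but only filled when a StopWordsList instance is
// constructed, so create one before calling isStopWord().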
public StopWordsList() { | |||||
initiate(); | |||||
} | |||||
public void initiate() { | |||||
sw_list.addAll(Arrays.asList(sw_array)); | |||||
// some commas | |||||
/*sw_list.add("."); | |||||
sw_list.add(","); | |||||
sw_list.add(";"); | |||||
sw_list.add("?"); | |||||
sw_list.add("!"); | |||||
sw_list.add(":"); | |||||
sw_list.add("("); | |||||
sw_list.add(")"); | |||||
sw_list.add("-");*/ | |||||
} | |||||
/**
* Judge whether a word is a stop word.
* @param word_lowercase the word, expected in lower case
* @return true if the word is a stop word; false otherwise
*/
public boolean isStopWord(String word_lowercase) {
return sw_list.contains(word_lowercase);
}
private static final String sw_array[] = new String[]{ | |||||
"a", | |||||
"able", | |||||
"about", | |||||
"across", | |||||
"after", | |||||
"all", | |||||
"almost", | |||||
"also", | |||||
"am", | |||||
"among", | |||||
"an", | |||||
"and", | |||||
"any", | |||||
"are", | |||||
"as", | |||||
"at", | |||||
//"be", | |||||
"because", | |||||
"been", | |||||
"but", | |||||
"by", | |||||
"can", | |||||
"cannot", | |||||
"could", | |||||
"dear", | |||||
"did", | |||||
"do", | |||||
"does", | |||||
"either", | |||||
"else", | |||||
"ever", | |||||
"every", | |||||
"for", | |||||
"from", | |||||
"get", | |||||
"got", | |||||
"had", | |||||
"has", | |||||
"have", | |||||
"he", | |||||
"her", | |||||
"hers", | |||||
"him", | |||||
"his", | |||||
"how", | |||||
"however", | |||||
"i", | |||||
"if", | |||||
"in", | |||||
"into", | |||||
"is", | |||||
"it", | |||||
"its", | |||||
"just", | |||||
"least", | |||||
"let", | |||||
"like", | |||||
"likely", | |||||
"may", | |||||
"me", | |||||
"might", | |||||
"most", | |||||
"must", | |||||
"my", | |||||
"neither", | |||||
"no", | |||||
"nor", | |||||
"not", | |||||
"of", | |||||
"off", | |||||
"often", | |||||
"on", | |||||
"only", | |||||
"or", | |||||
"other", | |||||
"our", | |||||
"own", | |||||
"rather", | |||||
"said", | |||||
"say", | |||||
"says", | |||||
"she", | |||||
"should", | |||||
"since", | |||||
"so", | |||||
"some", | |||||
"than", | |||||
"that", | |||||
"the", | |||||
"their", | |||||
"them", | |||||
"then", | |||||
"there", | |||||
"these", | |||||
"they", | |||||
"this", | |||||
"tis", | |||||
"to", | |||||
"too", | |||||
"twas", | |||||
"us", | |||||
"wants", | |||||
"was", | |||||
"we", | |||||
"were", | |||||
"what", | |||||
"when", | |||||
"where", | |||||
"which", | |||||
"while", | |||||
"who", | |||||
"whom", | |||||
"why", | |||||
"will", | |||||
"with", | |||||
"would", | |||||
"yet", | |||||
"you", | |||||
"your" | |||||
}; | |||||
}
/*// stop word 308 | |||||
// http://norm.al/2009/04/14/list-of-english-stop-words/ | |||||
private static final String sw_array[] = new String[]{ | |||||
"a", | |||||
"about", | |||||
"above", | |||||
"across", | |||||
"after", | |||||
"afterwards", | |||||
"again", | |||||
"against", | |||||
"all", | |||||
"almost", | |||||
"alone", | |||||
"along", | |||||
"already", | |||||
"also", | |||||
"although", | |||||
"always", | |||||
"am", | |||||
"among", | |||||
"amongst", | |||||
"amoungst", | |||||
"amount", | |||||
"an", | |||||
"and", | |||||
"another", | |||||
"any", | |||||
"anyhow", | |||||
"anyone", | |||||
"anything", | |||||
"anyway", | |||||
"anywhere", | |||||
"are", | |||||
"around", | |||||
"as", | |||||
"at", | |||||
"back", | |||||
"be", | |||||
"became", | |||||
"because", | |||||
"become", | |||||
"becomes", | |||||
"becoming", | |||||
"been", | |||||
"before", | |||||
"beforehand", | |||||
"behind", | |||||
"being", | |||||
"below", | |||||
"beside", | |||||
"besides", | |||||
"between", | |||||
"beyond", | |||||
"bill", | |||||
"both", | |||||
"bottom", | |||||
"but", | |||||
"by", | |||||
"call", | |||||
"can", | |||||
"cannot", | |||||
"cant", | |||||
"co", | |||||
"computer", | |||||
"con", | |||||
"could", | |||||
"couldnt", | |||||
"cry", | |||||
"de", | |||||
"describe", | |||||
"detail", | |||||
"do", | |||||
"did", | |||||
"done", | |||||
"down", | |||||
"due", | |||||
"during", | |||||
"each", | |||||
"eg", | |||||
"eight", | |||||
"either", | |||||
"eleven", | |||||
"else", | |||||
"elsewhere", | |||||
"empty", | |||||
"enough", | |||||
"etc", | |||||
"even", | |||||
"ever", | |||||
"every", | |||||
"everyone", | |||||
"everything", | |||||
"everywhere", | |||||
"except", | |||||
"few", | |||||
"fifteen", | |||||
"fify", | |||||
"fill", | |||||
"find", | |||||
"fire", | |||||
"first", | |||||
"five", | |||||
"for", | |||||
"former", | |||||
"formerly", | |||||
"forty", | |||||
"found", | |||||
"four", | |||||
"from", | |||||
"front", | |||||
"full", | |||||
"further", | |||||
"get", | |||||
"give", | |||||
"go", | |||||
"had", | |||||
"has", | |||||
"hasnt", | |||||
"have", | |||||
"he", | |||||
"hence", | |||||
"her", | |||||
"here", | |||||
"here", | |||||
"hereafter", | |||||
"hereby", | |||||
"herein", | |||||
"hereupon", | |||||
"hers", | |||||
"herself", | |||||
"him", | |||||
"himself", | |||||
"his", | |||||
"how", | |||||
"however", | |||||
"hundred", | |||||
"i", | |||||
"ie", | |||||
"if", | |||||
"in", | |||||
"inc", | |||||
"indeed", | |||||
"interest", | |||||
"into", | |||||
"is", | |||||
"it", | |||||
"its", | |||||
"itself", | |||||
"keep", | |||||
"last", | |||||
"latter", | |||||
"latterly", | |||||
"least", | |||||
"less", | |||||
"ltd", | |||||
"made", | |||||
"many", | |||||
"may", | |||||
"me", | |||||
"meanwhile", | |||||
"might", | |||||
"mill", | |||||
"mine", | |||||
"more", | |||||
"moreover", | |||||
"most", | |||||
"mostly", | |||||
"move", | |||||
"much", | |||||
"must", | |||||
"my", | |||||
"myself", | |||||
"name", | |||||
"namely", | |||||
"neither", | |||||
"never", | |||||
"nevertheless", | |||||
"next", | |||||
"nine", | |||||
"no", | |||||
"nobody", | |||||
"none", | |||||
"noone", | |||||
"nor", | |||||
"not", | |||||
"nothing", | |||||
"now", | |||||
"nowhere", | |||||
"of", | |||||
"off", | |||||
"often", | |||||
"on", | |||||
"once", | |||||
"one", | |||||
"only", | |||||
"onto", | |||||
"or", | |||||
"other", | |||||
"others", | |||||
"otherwise", | |||||
"our", | |||||
"ours", | |||||
"ourselves", | |||||
"out", | |||||
"over", | |||||
"own", | |||||
"part", | |||||
"per", | |||||
"perhaps", | |||||
"please", | |||||
"put", | |||||
"rather", | |||||
"re", | |||||
"same", | |||||
"see", | |||||
"seem", | |||||
"seemed", | |||||
"seeming", | |||||
"seems", | |||||
"serious", | |||||
"several", | |||||
"she", | |||||
"should", | |||||
"show", | |||||
"side", | |||||
"since", | |||||
"sincere", | |||||
"six", | |||||
"sixty", | |||||
"so", | |||||
"some", | |||||
"somehow", | |||||
"someone", | |||||
"something", | |||||
"sometime", | |||||
"sometimes", | |||||
"somewhere", | |||||
"still", | |||||
"such", | |||||
"system", | |||||
"take", | |||||
"ten", | |||||
"than", | |||||
"that", | |||||
"the", | |||||
"their", | |||||
"them", | |||||
"themselves", | |||||
"then", | |||||
"thence", | |||||
"there", | |||||
"thereafter", | |||||
"thereby", | |||||
"therefore", | |||||
"therein", | |||||
"thereupon", | |||||
"these", | |||||
"they", | |||||
"thick", | |||||
"thin", | |||||
"third", | |||||
"this", | |||||
"those", | |||||
"though", | |||||
"throughout", | |||||
"thru", | |||||
"thus", | |||||
"to", | |||||
"together", | |||||
"too", | |||||
"top", | |||||
"toward", | |||||
"towards", | |||||
"twelve", | |||||
"twenty", | |||||
"two", | |||||
"un", | |||||
"under", | |||||
"until", | |||||
"up", | |||||
"upon", | |||||
"us", | |||||
"very", | |||||
"via", | |||||
"was", | |||||
"we", | |||||
"we", | |||||
"well", | |||||
"were", | |||||
"what", | |||||
"whatever", | |||||
"when", | |||||
"whence", | |||||
"whenever", | |||||
"where", | |||||
"whereafter", | |||||
"whereas", | |||||
"whereby", | |||||
"wherein", | |||||
"whereupon", | |||||
"wherever", | |||||
"whether", | |||||
"which", | |||||
"while", | |||||
"whither", | |||||
"who", | |||||
"whoever", | |||||
"whole", | |||||
"whom", | |||||
"whose", | |||||
"why", | |||||
"will", | |||||
"with", | |||||
"within", | |||||
"without", | |||||
"would", | |||||
"yet", | |||||
"you", | |||||
"your", | |||||
"yours", | |||||
"yourself", | |||||
"yourselves" | |||||
}; | |||||
*/ | |||||
// The active sw_array above appears to be the 119-word stop word list from
// http://www.textfixer.com/resources/common-english-words.txt (with "be" commented out).
@@ -0,0 +1,441 @@ | |||||
package paradict; | |||||
import java.io.BufferedReader; | |||||
import java.io.File; | |||||
import java.io.FileInputStream; | |||||
import java.io.IOException; | |||||
import java.io.InputStreamReader; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import java.util.Iterator; | |||||
import nlp.tool.CoreNLP; | |||||
import qa.Globals; | |||||
public class ParaphraseDictionary { | |||||
public static String localDataPath; | |||||
public static String dbpedia_relation_paraphrases_baseform_withScore; | |||||
public static String dbpedia_relation_paraphrases_baseform_withScore_rerank; | |||||
public static String dbpedia_relation_paraphrases_handwrite; | |||||
public static String dbpedia_predicate_id; | |||||
public static String dbpedia_dbo_predicate; | |||||
public HashMap<String, Integer> predicate_2_id = null; | |||||
public HashMap<Integer, String> id_2_predicate = null; | |||||
public HashSet<Integer> dbo_predicate_id = null; | |||||
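// nlPattern_2_predicateList: NL pattern -> candidate predicates with support counts;
// invertedIndex: word -> NL patterns containing that word.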
public HashMap<String, ArrayList<PredicateIDAndSupport>> nlPattern_2_predicateList = null; | |||||
public HashMap<String, ArrayList<String>> invertedIndex = null; | |||||
public HashSet<String> relns_subject; | |||||
public HashSet<String> relns_object; | |||||
public HashSet<String> prepositions; | |||||
public HashSet<String> bannedTypes; | |||||
//public final int typePredicateID = 1541; //dbpedia2015 <type>=1541 | |||||
public final int typePredicateID = 5157; //DBpedia 2016 <type>
public int totalPredCount = 0; | |||||
public int paraphrasedPredCount = 0; | |||||
public int lineCount = 0; | |||||
/**
* constructor
*/
public ParaphraseDictionary () { | |||||
String fixedPath = Globals.localPath; | |||||
System.out.println(System.getProperty("user.dir")); | |||||
localDataPath = fixedPath + "data/DBpedia2016/parapharse/"; | |||||
dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt"; | |||||
dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt"; | |||||
dbpedia_predicate_id = localDataPath + "16predicate_id.txt"; | |||||
dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt"; | |||||
bannedTypes = new HashSet<String>(); | |||||
bannedTypes.add("Mayor"); | |||||
relns_subject = new HashSet<String>(); | |||||
relns_subject.add("subj"); | |||||
relns_subject.add("csubjpass"); | |||||
relns_subject.add("csubj"); | |||||
relns_subject.add("xsubj"); | |||||
relns_subject.add("nsubjpass"); | |||||
relns_subject.add("nsubj"); | |||||
relns_subject.add("poss"); // Obama's wife | |||||
relns_subject.add("dobj"); | |||||
relns_object = new HashSet<String>(); | |||||
relns_object.add("dobj"); | |||||
relns_object.add("iobj"); | |||||
relns_object.add("obj"); | |||||
relns_object.add("pobj"); | |||||
prepositions = new HashSet<String>(); | |||||
prepositions.add("in");//in at on with to from before after of for | |||||
prepositions.add("at"); | |||||
prepositions.add("on"); | |||||
prepositions.add("with"); | |||||
prepositions.add("to"); | |||||
prepositions.add("from"); | |||||
prepositions.add("before"); | |||||
prepositions.add("after"); | |||||
prepositions.add("of"); | |||||
prepositions.add("for"); | |||||
prepositions.add("as"); | |||||
try { | |||||
loadPredicateId(); | |||||
loadDboPredicate(); | |||||
loadParaDict(); | |||||
buildInvertedIndex(); | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
/** | |||||
* Load the mapping between predicates and their IDs. | |||||
* @throws IOException | |||||
*/ | |||||
public void loadPredicateId () throws IOException { | |||||
predicate_2_id = new HashMap<String, Integer>(); | |||||
id_2_predicate = new HashMap<Integer, String>(); | |||||
String input_filename = dbpedia_predicate_id; | |||||
File file = new File(input_filename); | |||||
InputStreamReader in = null; | |||||
BufferedReader br = null; | |||||
try{ | |||||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||||
br = new BufferedReader(in); | |||||
String line = null; | |||||
while ((line = br.readLine())!= null) { | |||||
String[] lines = line.split("\t"); | |||||
predicate_2_id.put(lines[0], Integer.parseInt(lines[1])); | |||||
id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]); | |||||
} | |||||
}catch(IOException e){ | |||||
System.out.println("NLPatterns.loadPredicateId() : IOException!"); | |||||
e.printStackTrace(); | |||||
}finally{ | |||||
if(br != null){ | |||||
try{ | |||||
br.close(); | |||||
}catch(IOException e){ | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} | |||||
System.out.println("NLPatterns.loadPredicateId() : ok!"); | |||||
} | |||||
public void loadDboPredicate() throws IOException | |||||
{ | |||||
dbo_predicate_id = new HashSet<Integer>(); | |||||
int cnt = 0; | |||||
String input_filename = dbpedia_dbo_predicate; | |||||
InputStreamReader in = null; | |||||
BufferedReader br = null; | |||||
try{ | |||||
File file = new File(input_filename); | |||||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||||
br = new BufferedReader(in); | |||||
String line = null; | |||||
while ((line = br.readLine())!= null) | |||||
{ | |||||
if (!predicate_2_id.containsKey(line)) | |||||
{ | |||||
cnt++; | |||||
//System.out.println("error: not found "+line+" id."); | |||||
continue; | |||||
} | |||||
dbo_predicate_id.add(predicate_2_id.get(line)); | |||||
} | |||||
}catch(IOException e){ | |||||
System.out.println("NLPatterns.loadDboPredicate() : IOException!"); | |||||
}finally{ | |||||
if(br!=null){ | |||||
try{ | |||||
br.close(); | |||||
}catch(IOException e){ | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} | |||||
System.out.println("Warning: DBO not found id count: "+cnt); | |||||
System.out.println("NLPatterns.loadDboPredicate() : ok!"); | |||||
} | |||||
/** | |||||
* Get predicate by its id | |||||
* @param predicateID | |||||
* @return | |||||
*/ | |||||
public String getPredicateById (int predicateID) { | |||||
return id_2_predicate.get(predicateID); | |||||
} | |||||
public void loadParaDict () throws Exception { | |||||
nlPattern_2_predicateList = new HashMap<String, ArrayList<PredicateIDAndSupport>>(); | |||||
HashSet<String> missInDBP2014 = new HashSet<String>(); | |||||
InputStreamReader in = null; | |||||
BufferedReader br = null; | |||||
try{ | |||||
String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank; | |||||
File file = new File(inputFileName); | |||||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||||
br = new BufferedReader(in); | |||||
String line = null; | |||||
int lineCount = 0; | |||||
//line = br.readLine();//read the first line which indicates the format | |||||
while ((line = br.readLine()) != null) | |||||
{ | |||||
if (line.startsWith("#")) continue; | |||||
lineCount ++; | |||||
String[] content = line.split("\t"); | |||||
if(!predicate_2_id.containsKey(content[0])) | |||||
{ | |||||
missInDBP2014.add(content[0]); | |||||
continue; | |||||
} | |||||
int predicateID = predicate_2_id.get(content[0]); | |||||
String nlPattern = content[1].toLowerCase(); | |||||
int support = Integer.parseInt(content[2]); | |||||
//double score = Double.parseDouble(content[3]); | |||||
String []slctString = content[3].split(" "); | |||||
double[] slct = new double[slctString.length]; | |||||
for (int i=0; i < slct.length; i++) { | |||||
slct[i] = Double.parseDouble(slctString[i]); | |||||
} | |||||
if (!nlPattern_2_predicateList.containsKey(nlPattern)) { | |||||
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>()); | |||||
} | |||||
nlPattern_2_predicateList.get(nlPattern).add(new PredicateIDAndSupport(predicateID, support, slct)); | |||||
} | |||||
System.out.println("Number of NL-Patterns-to-predicate mappings = " + lineCount); | |||||
System.out.println("NLPatterns.size = " + nlPattern_2_predicateList.size()); | |||||
System.out.println("Predicate.size = " + predicate_2_id.size()); | |||||
System.out.println("Warning: Predicates not in DBpedia 2014 count: "+missInDBP2014.size()); | |||||
// Notice: the predicate itself and handwritten patterns have no wordSelectivity.
addPredicateAsNLPattern(); // This is very important. | |||||
addHandwriteAsNLPattern(); | |||||
Iterator<String> it = nlPattern_2_predicateList.keySet().iterator(); | |||||
while (it.hasNext()) { | |||||
Collections.sort(nlPattern_2_predicateList.get(it.next())); | |||||
} | |||||
}catch(IOException e){ | |||||
System.out.println("NLPatterns.Paradict() : IOException!"); | |||||
}finally{ | |||||
if(br!=null){ | |||||
try{ | |||||
br.close(); | |||||
}catch(IOException e){ | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} | |||||
System.out.println("NLPatterns.Paradict() : ok!"); | |||||
} | |||||
/** | |||||
* A set of very important NL patterns are the predicates themselves! | |||||
*/ | |||||
public void addPredicateAsNLPattern () { | |||||
final int support = 200; | |||||
int predicate_id; | |||||
for (String p : predicate_2_id.keySet()) | |||||
{ | |||||
// TODO: Omitting some bad relations (should be discarded in future) | |||||
if(p.equals("state") || p.equals("states")) | |||||
continue; | |||||
predicate_id = predicate_2_id.get(p); | |||||
StringBuilder pattern = new StringBuilder(""); | |||||
// Some predicates carry a prefix (DBpedia 2015), e.g. Work/runtime 11, SpaceStation/volume 68; discard the prefix when generating the pattern
if(p.contains("/")) | |||||
{ | |||||
if(p.charAt(0)>='A' && p.charAt(0)<='Z') | |||||
p = p.substring(p.indexOf("/")+1); | |||||
//gameW/l 1974 | |||||
else | |||||
p = p.replace("/", ""); | |||||
} | |||||
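// Split the camelCase predicate into space-separated lowercase tokens,
// e.g. "birthPlace" -> "birth place".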
int last = 0, i = 0; | |||||
for(i = 0; i < p.length(); i ++) { | |||||
// any character that is not a lowercase letter starts a new token.
if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) { | |||||
pattern.append(p.substring(last, i).toLowerCase()); | |||||
pattern.append(" "); | |||||
last = i; | |||||
} | |||||
} | |||||
pattern.append(p.substring(last, i).toLowerCase()); | |||||
for (i = 3; i < pattern.length(); i ++) { | |||||
// the blank between two digits should be deleted. | |||||
if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9' | |||||
&& pattern.charAt(i-1)==' ' | |||||
&& pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') { | |||||
pattern.deleteCharAt(i-1); | |||||
} | |||||
// the blank between I and D should be deleted. | |||||
else if (pattern.charAt(i)=='d' | |||||
&& pattern.charAt(i-1)==' ' | |||||
&& pattern.charAt(i-2)=='i' | |||||
&& pattern.charAt(i-3)==' ') { | |||||
pattern.deleteCharAt(i-1); | |||||
} | |||||
// the blank between D and B should be deleted. | |||||
else if (pattern.charAt(i)=='b' | |||||
&& pattern.charAt(i-1)==' ' | |||||
&& pattern.charAt(i-2)=='d' | |||||
&& pattern.charAt(i-3)==' ') { | |||||
pattern.deleteCharAt(i-1); | |||||
} | |||||
} | |||||
// pattern -> base form | |||||
/*String[] ptns = pattern.toString().split(" "); | |||||
pattern = new StringBuilder(""); | |||||
for (String s : ptns) { | |||||
pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s)); | |||||
pattern.append(" "); | |||||
} | |||||
pattern.deleteCharAt(pattern.length()-1); | |||||
String patternString = pattern.toString();*/ | |||||
// Special cases cannot use the base form, e.g. foundingYear. //TODO: maybe Porter's algorithm
String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString()); | |||||
//System.out.println(p + "-->" + patternString); | |||||
if (!nlPattern_2_predicateList.containsKey(patternString)) { | |||||
nlPattern_2_predicateList.put(patternString, new ArrayList<PredicateIDAndSupport>()); | |||||
} | |||||
nlPattern_2_predicateList.get(patternString).add( | |||||
new PredicateIDAndSupport(predicate_id, | |||||
support, | |||||
PredicateIDAndSupport.genSlct(patternString.split(" ").length))); | |||||
} | |||||
System.out.println("NLPatterns.addPredicateAsNLPattern(): ok!"); | |||||
} | |||||
public void addHandwriteAsNLPattern() throws IOException { | |||||
String inputFileName = dbpedia_relation_paraphrases_handwrite; | |||||
InputStreamReader in = null; | |||||
BufferedReader br = null; | |||||
try{ | |||||
File file = new File(inputFileName); | |||||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||||
br = new BufferedReader(in); | |||||
String line = null; | |||||
//int lineCount = 0; | |||||
//line = br.readLine();//read the first line which indicates the format | |||||
while ((line = br.readLine()) != null) { | |||||
if (line.startsWith("#") || line.isEmpty()) continue; | |||||
//lineCount ++; | |||||
String[] content = line.split("\t"); | |||||
if(!predicate_2_id.containsKey(content[0])) | |||||
continue; | |||||
int predicateID = predicate_2_id.get(content[0]); | |||||
String nlPattern = content[1].toLowerCase(); | |||||
int support = Integer.parseInt(content[2]); | |||||
if (!nlPattern_2_predicateList.containsKey(nlPattern)) { | |||||
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>()); | |||||
} | |||||
nlPattern_2_predicateList.get(nlPattern).add( | |||||
new PredicateIDAndSupport(predicateID, | |||||
support, | |||||
PredicateIDAndSupport.genSlct(nlPattern.split(" ").length))); | |||||
} | |||||
}catch(IOException e){ | |||||
System.out.println("NLPatterns.addHandwriteAsNLPattern(): IOException!"); | |||||
}finally{ | |||||
if(br!=null){ | |||||
try{ | |||||
br.close(); | |||||
}catch(IOException e){ | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} | |||||
System.out.println("NLPatterns.addHandwriteAsNLPattern(): ok!"); | |||||
} | |||||
/** | |||||
* Show the NLPatterns | |||||
*/ | |||||
public void showNLPatterns () { | |||||
/*for (String s: syntacticMarker) { | |||||
System.out.println(s); | |||||
} | |||||
GlobalTools.systemPause();*/ | |||||
System.out.println("predicate-->id"); | |||||
for (String s : predicate_2_id.keySet()) { | |||||
System.out.println(s + "-->" + predicate_2_id.get(s)); | |||||
} | |||||
Globals.systemPause(); | |||||
int count = 1; | |||||
System.out.println("nlPattern-->predicate<support>"); | |||||
for (String p : nlPattern_2_predicateList.keySet()) { | |||||
System.out.print("" + (count++) + ".\t" + p + "\t[" + nlPattern_2_predicateList.get(p).size() + "]\t"); | |||||
for (PredicateIDAndSupport i : nlPattern_2_predicateList.get(p)) { | |||||
System.out.print(id_2_predicate.get(i.predicateID) + "<" + i.support + ">" + ", "); | |||||
} | |||||
System.out.println(); | |||||
} | |||||
} | |||||
/** | |||||
* Build the inverted index, where each word is mapped to the patterns in which it occurs
*/ | |||||
public void buildInvertedIndex () { | |||||
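// e.g. the pattern "birth place" is indexed under both "birth" and "place",
// so candidate patterns can be fetched for each word of a question.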
invertedIndex = new HashMap<String, ArrayList<String>>(); | |||||
// traversing all patterns | |||||
for (String p : nlPattern_2_predicateList.keySet()) { | |||||
String[] tokens = p.split(" "); | |||||
for (String token : tokens) { | |||||
if (token.length() < 1) continue; | |||||
if (!invertedIndex.containsKey(token)) { | |||||
invertedIndex.put(token, new ArrayList<String>()); | |||||
} | |||||
invertedIndex.get(token).add(p); | |||||
} | |||||
} | |||||
System.out.println("NLPatterns.buildInvertedIndex(): ok!"); | |||||
} | |||||
public static void main (String[] args) { | |||||
Globals.coreNLP = new CoreNLP(); | |||||
Globals.pd = new ParaphraseDictionary(); | |||||
//Globals.pd.showNLPatterns(); | |||||
} | |||||
} |
@@ -0,0 +1,24 @@ | |||||
package paradict; | |||||
public class PredicateIDAndSupport implements Comparable<PredicateIDAndSupport> { | |||||
public int predicateID; | |||||
public int support; | |||||
public double[] wordSelectivity = null; // wordSelectivity makes the ranking of PATTY patterns more accurate.
public PredicateIDAndSupport(int _pid, int _support, double[] _slct) { | |||||
predicateID = _pid; | |||||
support = _support; | |||||
wordSelectivity = _slct; | |||||
} | |||||
public int compareTo(PredicateIDAndSupport o) {
// descending order by support; Integer.compare avoids integer overflow
return Integer.compare(o.support, this.support);
}
// only used for the predicate itself and handwritten paraphrases
public static double[] genSlct(int size) { | |||||
double[] ret = new double[size]; | |||||
for (int i=0;i<size;i++) ret[i] = 1.0; | |||||
return ret; | |||||
} | |||||
} |
@@ -0,0 +1,105 @@ | |||||
package qa; | |||||
import java.util.ArrayList; | |||||
public class Answer implements Comparable<Answer>{ | |||||
public String questionFocusKey=null; | |||||
public String questionFocusValue=null; | |||||
public ArrayList<String> otherInformationKey = null; | |||||
public ArrayList<String> otherInformationValue = null; | |||||
public Answer(String qf, String[] ans) { | |||||
otherInformationKey = new ArrayList<String>(); | |||||
otherInformationValue = new ArrayList<String>(); | |||||
int p1, p2; | |||||
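// Each raw line is "key:value" where the value is either an IRI in angle
// brackets, a quoted literal, or plain text after the first ':'; the three
// formats are tried in that order.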
for (String line : ans) { | |||||
System.out.println("line=" + line); | |||||
if (line.startsWith(qf)) { | |||||
questionFocusKey = qf; | |||||
p1 = line.indexOf('<'); | |||||
p2 = line.lastIndexOf('>'); | |||||
String value = null; | |||||
if (p1 != -1 && p2 != -1) { | |||||
value = line.substring(p1+1, p2); | |||||
} | |||||
else { | |||||
p1 = line.indexOf('\"'); | |||||
p2 = line.lastIndexOf('\"'); | |||||
if(p1 != -1 && p2 != -1) | |||||
value = line.substring(p1+1, p2); | |||||
else | |||||
{ | |||||
p1 = line.indexOf(':'); | |||||
value = line.substring(p1+1); | |||||
} | |||||
} | |||||
questionFocusValue = value; | |||||
} | |||||
else { | |||||
p1 = line.indexOf(':'); | |||||
String key = line.substring(0, p1); | |||||
p1 = line.indexOf('<'); | |||||
p2 = line.lastIndexOf('>'); | |||||
String value = null; | |||||
if (p1 != -1 && p2 != -1) { | |||||
value = line.substring(p1+1, p2); | |||||
} | |||||
else { | |||||
p1 = line.indexOf('\"'); | |||||
p2 = line.lastIndexOf('\"'); | |||||
if(p1 != -1 && p2 != -1) | |||||
value = line.substring(p1+1, p2); | |||||
else | |||||
{ | |||||
p1 = line.indexOf(':'); | |||||
value = line.substring(p1+1); | |||||
} | |||||
} | |||||
otherInformationKey.add(key); | |||||
otherInformationValue.add(value); | |||||
} | |||||
} | |||||
// Fix a bug: GStore may return garbled text in questionFocusKey
if (questionFocusKey==null || questionFocusValue==null) | |||||
{ | |||||
questionFocusKey = qf; | |||||
String line = ans[0]; | |||||
p1 = line.indexOf('<'); | |||||
p2 = line.lastIndexOf('>'); | |||||
String value = null; | |||||
if (p1 != -1 && p2 != -1) { | |||||
value = line.substring(p1+1, p2); | |||||
} | |||||
else { | |||||
p1 = line.indexOf('\"'); | |||||
p2 = line.lastIndexOf('\"'); | |||||
if(p1 != -1 && p2 != -1) | |||||
value = line.substring(p1+1, p2); | |||||
else | |||||
{ | |||||
p1 = line.indexOf(':'); | |||||
value = line.substring(p1+1); | |||||
} | |||||
} | |||||
questionFocusValue = value; | |||||
otherInformationKey.clear(); | |||||
otherInformationValue.clear(); | |||||
} | |||||
/*System.out.println("otherInformationKey.size=" + otherInformationKey.size()); | |||||
for (String k : otherInformationKey) { | |||||
System.out.println("otherInfoKey = " + k); | |||||
}*/ | |||||
} | |||||
public int compareTo (Answer p) | |||||
{ | |||||
return questionFocusValue.compareTo(p.questionFocusValue); | |||||
} | |||||
} |
@@ -0,0 +1,376 @@ | |||||
package qa; | |||||
import java.io.*; | |||||
import java.net.Socket; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashSet; | |||||
import java.util.List; | |||||
import jgsc.GstoreConnector; | |||||
import log.QueryLogger; | |||||
import nlp.ds.Sentence; | |||||
import nlp.ds.Sentence.SentenceType; | |||||
import qa.parsing.QuestionParsing; | |||||
import qa.parsing.BuildQueryGraph; | |||||
import rdf.Sparql; | |||||
import utils.FileUtil; | |||||
import addition.AddtionalFix; | |||||
import qa.Globals; | |||||
public class GAnswer { | |||||
public static final int MAX_SPQ_NUM = 3; | |||||
public static void init() { | |||||
System.out.println("gAnswer2 init ..."); | |||||
Globals.init(); | |||||
System.out.println("gAnswer2 init ... ok!"); | |||||
} | |||||
public QueryLogger getSparqlList(String input) | |||||
{ | |||||
QueryLogger qlog = null; | |||||
try | |||||
{ | |||||
if (input.length() <= 5) | |||||
return null; | |||||
System.out.println("[Input:] "+input); | |||||
// step 0: Node (entity & type & literal) Recognition | |||||
long t0 = System.currentTimeMillis(), t, NRtime; | |||||
Query query = new Query(input); | |||||
qlog = new QueryLogger(query); | |||||
ArrayList<Sparql> rankedSparqls = new ArrayList<Sparql>(); | |||||
NRtime = (int)(System.currentTimeMillis()-t0); | |||||
System.out.println("step0 [Node Recognition] : "+ NRtime +"ms"); | |||||
// Try each NR (node recognition) plan and combine the ranked SPARQLs.
// Only the log of the best NR plan is kept, for convenience.
for(int i=query.sList.size()-1; i>=0; i--) | |||||
{ | |||||
Sentence possibleSentence = query.sList.get(i); | |||||
qlog.reloadSentence(possibleSentence); | |||||
// qlog.isMaltParserUsed = true; | |||||
// LOG | |||||
System.out.println("transQ: "+qlog.s.plainText); | |||||
qlog.NRlog = query.preLog; | |||||
qlog.SQGlog = "Id: "+query.queryId+"\nQuery: "+query.NLQuestion+"\n"; | |||||
qlog.SQGlog += qlog.NRlog; | |||||
qlog.timeTable.put("step0", (int)NRtime); | |||||
// step 1: question parsing (dependency tree, sentence type) | |||||
t = System.currentTimeMillis(); | |||||
QuestionParsing step1 = new QuestionParsing(); | |||||
step1.process(qlog); | |||||
qlog.timeTable.put("step1", (int)(System.currentTimeMillis()-t)); | |||||
// step 2: build query graph (structure construction, relation extraction, top-k join) | |||||
t = System.currentTimeMillis(); | |||||
BuildQueryGraph step2 = new BuildQueryGraph(); | |||||
step2.process(qlog); | |||||
// step2.processEXP(qlog); | |||||
qlog.timeTable.put("step2", (int)(System.currentTimeMillis()-t)); | |||||
// step 3: some fix (such as "one-node" or "ask-one-triple") and aggregation | |||||
t = System.currentTimeMillis(); | |||||
AddtionalFix step3 = new AddtionalFix(); | |||||
step3.process(qlog); | |||||
// Collect SPARQLs. | |||||
rankedSparqls.addAll(qlog.rankedSparqls); | |||||
qlog.timeTable.put("step3", (int)(System.currentTimeMillis()-t)); | |||||
} | |||||
// deduplicate in SPARQL | |||||
for(Sparql spq: rankedSparqls) | |||||
spq.deduplicate(); | |||||
// Sort (descending order). | |||||
Collections.sort(rankedSparqls); | |||||
qlog.rankedSparqls = rankedSparqls; | |||||
System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size()); | |||||
// Detect question focus. | |||||
for (int i=0; i<qlog.rankedSparqls.size(); i++) | |||||
{ | |||||
// First detect by SPARQLs. | |||||
Sparql spq = qlog.rankedSparqls.get(i); | |||||
String questionFocus = QuestionParsing.detectQuestionFocus(spq); | |||||
// If failed, use TARGET directly. | |||||
if(questionFocus == null) | |||||
questionFocus = "?"+qlog.target.originalForm; | |||||
spq.questionFocus = questionFocus; | |||||
} | |||||
return qlog; | |||||
} | |||||
catch (Exception e) { | |||||
e.printStackTrace(); | |||||
return qlog; | |||||
} | |||||
} | |||||
public String getStdSparqlWoPrefix(QueryLogger qlog, Sparql curSpq) | |||||
{ | |||||
if(qlog == null || curSpq == null) | |||||
return null; | |||||
String res = ""; | |||||
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) | |||||
res += "ask where"; | |||||
else | |||||
{ | |||||
if(!curSpq.countTarget) | |||||
res += ("select DISTINCT " + curSpq.questionFocus + " where"); | |||||
else | |||||
res += ("select COUNT(DISTINCT " + curSpq.questionFocus + ") where"); | |||||
} | |||||
res += "\n"; | |||||
res += curSpq.toStringForGStore(); | |||||
if(curSpq.moreThanStr != null) | |||||
{ | |||||
res += curSpq.moreThanStr+"\n"; | |||||
} | |||||
if(curSpq.mostStr != null) | |||||
{ | |||||
res += curSpq.mostStr+"\n"; | |||||
} | |||||
return res; | |||||
} | |||||
// Notice, this will change the original SPARQL. | |||||
public Sparql getUntypedSparql (Sparql spq) | |||||
{ | |||||
if(spq == null) | |||||
return null; | |||||
spq.removeAllTypeInfo(); | |||||
if (spq.tripleList.size() == 0) return null; | |||||
return spq; | |||||
} | |||||
/** | |||||
* Get answers from Virtuoso + DBpedia; this function requires an OLD Virtuoso version plus the Virtuoso handler.
* Virtuoso can handle "Aggregation".
**/ | |||||
// public Matches getAnswerFromVirtuoso (QueryLogger qlog, Sparql spq) | |||||
// { | |||||
// Matches ret = new Matches(); | |||||
// try | |||||
// { | |||||
// Socket socket = new Socket(Globals.QueryEngineIP, 1112); | |||||
// DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream())); | |||||
// | |||||
// //formatting SPARQL & evaluate | |||||
// String formatedSpq = spq.toStringForVirtuoso(); | |||||
// dos.writeUTF(formatedSpq); | |||||
// dos.flush(); | |||||
// System.out.println("STD SPARQL:\n"+formatedSpq+"\n"); | |||||
// | |||||
// ArrayList<String> rawLines = new ArrayList<String>(); | |||||
// DataInputStream dis = new DataInputStream(new BufferedInputStream(socket.getInputStream())); | |||||
// while (true) | |||||
// { | |||||
// String line = dis.readUTF(); | |||||
// if (line.equals("[[finish]]")) break; | |||||
// rawLines.add(line); | |||||
// } | |||||
// | |||||
// // ASK query was translated to SELECT query, whose answer need translation. | |||||
// // It is no need to translate, use "ASK WHERE" directly ! 2018-12-11 | |||||
// if(qlog.s.sentenceType == SentenceType.GeneralQuestion) | |||||
// { | |||||
// ret.answersNum = 1; | |||||
// ret.answers = new String[1][1]; | |||||
// if(rawLines.size() == 0) | |||||
// { | |||||
// ret.answers[0][0] = "general:false"; | |||||
// } | |||||
// else | |||||
// { | |||||
// ret.answers[0][0] = "general:true"; | |||||
// } | |||||
// System.out.println("general question answer:" + ret.answers[0][0]); | |||||
// dos.close(); | |||||
// dis.close(); | |||||
// socket.close(); | |||||
// return ret; | |||||
// } | |||||
// | |||||
// //select but no results | |||||
// if (rawLines.size() == 0) | |||||
// { | |||||
// ret.answersNum = 0; | |||||
// dos.close(); | |||||
// dis.close(); | |||||
// socket.close(); | |||||
// return ret; | |||||
// } | |||||
// | |||||
// int ansNum = rawLines.size(); | |||||
// int varNum = variables.size(); | |||||
// ArrayList<String> valist = new ArrayList<String>(variables); | |||||
// ret.answers = new String[ansNum][varNum]; | |||||
// | |||||
// System.out.println("ansNum=" + ansNum); | |||||
// System.out.println("varNum=" + varNum); | |||||
// for (int i=0;i<rawLines.size();i++) | |||||
// { | |||||
// String[] ansLineContents = rawLines.get(i).split("\t"); | |||||
// for (int j=0;j<varNum;j++) | |||||
// { | |||||
// ret.answers[i][j] = valist.get(j) + ":" + ansLineContents[j]; | |||||
// } | |||||
// } | |||||
// | |||||
// dos.close(); | |||||
// dis.close(); | |||||
// socket.close(); | |||||
// } | |||||
// catch (Exception e) { | |||||
// e.printStackTrace(); | |||||
// } | |||||
// | |||||
// return ret; | |||||
// } | |||||
public Matches getAnswerFromGStore2 (Sparql spq) | |||||
{ | |||||
// modified by Lin Yinnian using ghttp - 2018-9-28 | |||||
GstoreConnector gc = new GstoreConnector("172.31.222.90", 9001); | |||||
String answer = gc.query("root", "123456", "dbpedia16", spq.toStringForGStore2()); | |||||
System.out.println(answer); | |||||
String[] rawLines = answer.split("\n"); | |||||
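// GStore returns "[empty result]" when nothing matches; otherwise the first
// line lists the variable names and each later line holds one tab-separated
// binding per variable.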
Matches ret = new Matches(); | |||||
if (rawLines.length == 0 || rawLines[0].equals("[empty result]")) | |||||
{ | |||||
ret.answersNum = 0; | |||||
return ret; | |||||
} | |||||
int ansNum = rawLines.length-1; | |||||
String[] varLineContents = rawLines[0].split("\t"); | |||||
int varNum = varLineContents.length; | |||||
ret.answers = new String[ansNum][varNum]; | |||||
System.out.println("ansNum=" + ansNum); | |||||
System.out.println("varNum=" + varNum); | |||||
System.out.println("rawLines.length=" + rawLines.length); | |||||
for (int i=1;i<rawLines.length;i++) | |||||
{ | |||||
// if an answer itself contains '\n', parsing may go wrong, so we stop early.
if(i-1 >= ansNum) | |||||
break; | |||||
String[] ansLineContents = rawLines[i].split("\t"); | |||||
for (int j=0;j<varNum;j++) | |||||
{ | |||||
ret.answers[i-1][j] = varLineContents[j] + ":" + ansLineContents[j]; | |||||
} | |||||
} | |||||
return ret; | |||||
} | |||||
public static void main (String[] args) | |||||
{ | |||||
Globals.init(); | |||||
GAnswer ga = new GAnswer(); | |||||
int i =1; | |||||
//file in/output | |||||
List<String> inputList = FileUtil.readFile("E:/Linyinnian/qald6_special.txt"); | |||||
for(String input: inputList) | |||||
{ | |||||
ArrayList<String> outputs = new ArrayList<String>(); | |||||
ArrayList<String> spqs = new ArrayList<String>(); | |||||
spqs.add("id:"+String.valueOf(i)); | |||||
i++; | |||||
long parsing_st_time = System.currentTimeMillis(); | |||||
QueryLogger qlog = ga.getSparqlList(input); | |||||
if(qlog == null || qlog.rankedSparqls == null) | |||||
continue; | |||||
long parsing_ed_time = System.currentTimeMillis(); | |||||
System.out.println("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); | |||||
System.out.println("TripleCheck time: "+ qlog.timeTable.get("TripleCheck") + "ms"); | |||||
System.out.println("SparqlCheck time: "+ qlog.timeTable.get("SparqlCheck") + "ms"); | |||||
System.out.println("Ranked Sparqls: " + qlog.rankedSparqls.size()); | |||||
outputs.add(qlog.SQGlog); | |||||
outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms"); | |||||
outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms"); | |||||
outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); | |||||
long excuting_st_time = System.currentTimeMillis(); | |||||
Matches m = null; | |||||
System.out.println("[RESULT]"); | |||||
ArrayList<String> lastSpqList = new ArrayList<String>(); | |||||
int idx; | |||||
// Consider top-5 SPARQLs | |||||
for(idx=1; idx<=Math.min(qlog.rankedSparqls.size(), 5); idx++) | |||||
{ | |||||
Sparql curSpq = qlog.rankedSparqls.get(idx-1); | |||||
String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, curSpq); | |||||
lastSpqList.add(stdSPQwoPrefix); | |||||
System.out.println("[" + idx + "]" + "score=" + curSpq.score); | |||||
System.out.println(stdSPQwoPrefix); | |||||
// Print top-3 SPARQLs to file. | |||||
if(idx <= MAX_SPQ_NUM) | |||||
// spqs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix); | |||||
outputs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix); | |||||
// Execute by GStore (or Virtuoso) while no answers have been found yet
if(m == null || m.answers == null) | |||||
{ | |||||
if (curSpq.tripleList.size()>0 && curSpq.questionFocus!=null) | |||||
{ | |||||
// if(ga.isBGP(qlog, curSpq)) | |||||
m = ga.getAnswerFromGStore2(curSpq); | |||||
// else | |||||
// m = ga.getAnswerFromVirtuoso(qlog, curSpq); | |||||
} | |||||
if (m != null && m.answers != null) | |||||
{ | |||||
// Results found with the current SPARQL: record them and print the answers.
qlog.sparql = curSpq; | |||||
qlog.match = m; | |||||
qlog.reviseAnswers(); | |||||
System.out.println("Query Executing time: "+ (int)(System.currentTimeMillis() - excuting_st_time)+ "ms"); | |||||
} | |||||
} | |||||
} | |||||
// Some TYPEs can be omitted (such as <type> <yago:Wife>).
if(!qlog.rankedSparqls.isEmpty()) | |||||
{ | |||||
Sparql untypedSparql = ga.getUntypedSparql(qlog.rankedSparqls.get(0)); | |||||
if(untypedSparql != null) | |||||
{ | |||||
String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, untypedSparql); | |||||
if(!lastSpqList.contains(stdSPQwoPrefix)) | |||||
// spqs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); | |||||
outputs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); | |||||
} | |||||
} | |||||
outputs.add(qlog.match.toString()); | |||||
FileUtil.writeFile(outputs, "E:/Linyinnian/qald6_special_out.txt", true); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,118 @@ | |||||
package qa; | |||||
import java.io.BufferedReader; | |||||
import java.io.IOException; | |||||
import java.io.InputStreamReader; | |||||
import lcn.EntityFragmentFields; | |||||
import fgmt.RelationFragment; | |||||
import fgmt.TypeFragment; | |||||
import paradict.ParaphraseDictionary; | |||||
import qa.mapping.DBpediaLookup; | |||||
import nlp.tool.NERecognizer; | |||||
import nlp.tool.CoreNLP; | |||||
import nlp.tool.MaltParser; | |||||
import nlp.tool.StanfordParser; | |||||
import nlp.tool.StopWordsList; | |||||
public class Globals { | |||||
// nlp tools | |||||
public static CoreNLP coreNLP; | |||||
public static StanfordParser stanfordParser; | |||||
public static StopWordsList stopWordsList; | |||||
public static MaltParser maltParser; | |||||
public static NERecognizer nerRecognizer; | |||||
// relation paraphrase dictionary | |||||
public static ParaphraseDictionary pd; | |||||
// entity linking system | |||||
public static DBpediaLookup dblk; | |||||
public static int MaxAnswerNum = 100; | |||||
/* | |||||
* evaluationMethod: | |||||
* 1. baseline (SQG): circles and wrong edges are not allowed; the structure may change when the TARGET changes.
* 2. super SQG: circles and wrong edges are allowed; the structure is decided by the dependency tree and can be revised in the query evaluation (top-k match) stage.
*/
public static int evaluationMethod = 2; | |||||
public static boolean isRunAsWebServer = false; // Run Local: false; Run Server: true | |||||
public static String runningBenchmark = "QALD"; // WQ:WebQuestions; WQSP:WebQuestionsSP; CQ:ComplexQuestions | |||||
// selects the method and Freebase version to use (in Virtuoso.java)
public static boolean usingOperationCondition = false; // only for EXP: try state transition operations only when the conditions are satisfied.
public static String localPath = "/media/wip/husen/NBgAnswer/"; | |||||
public static String QueryEngineIP = "127.0.0.1"; // Notice, PORT number is in the evaluation function. | |||||
public static void init () | |||||
{ | |||||
System.out.println("====== gAnswer2.0 over DBpedia ======"); | |||||
if(isRunAsWebServer == false) | |||||
{ | |||||
localPath = "D:/husen/gAnswer/"; | |||||
QueryEngineIP = "172.31.222.72"; | |||||
} | |||||
long t1, t2, t3, t4, t5, t6, t7, t8, t9; | |||||
t1 = System.currentTimeMillis(); | |||||
coreNLP = new CoreNLP(); | |||||
t2 = System.currentTimeMillis(); | |||||
stanfordParser = new StanfordParser(); | |||||
t3 = System.currentTimeMillis(); | |||||
maltParser = new MaltParser(); | |||||
t4 = System.currentTimeMillis(); | |||||
nerRecognizer = new NERecognizer(); | |||||
t5 = System.currentTimeMillis(); | |||||
stopWordsList = new StopWordsList(); | |||||
t6 = System.currentTimeMillis(); | |||||
pd = new ParaphraseDictionary(); | |||||
t7 = System.currentTimeMillis(); | |||||
try | |||||
{ | |||||
EntityFragmentFields.load(); | |||||
RelationFragment.load(); | |||||
TypeFragment.load(); | |||||
} | |||||
catch (Exception e1) { | |||||
System.out.println("EntityIDs and RelationFragment and TypeFragment loading error!"); | |||||
e1.printStackTrace(); | |||||
} | |||||
t8 = System.currentTimeMillis(); | |||||
dblk = new DBpediaLookup(); | |||||
t9 = System.currentTimeMillis(); | |||||
System.out.println("======Initialization======"); | |||||
System.out.println("CoreNLP(Lemma): " + (t2-t1) + "ms."); | |||||
System.out.println("StanfordParser: " + (t3-t2) + "ms."); | |||||
System.out.println("MaltParser: " + (t4-t3) + "ms."); | |||||
System.out.println("NERecognizer: " + (t5-t4) + "ms."); | |||||
System.out.println("StopWordsList: " + (t6-t5) + "ms."); | |||||
System.out.println("ParaphraseDict & posTagPattern: " + (t7-t6) + "ms."); | |||||
System.out.println("GraphFragments: " + (t8-t7) + "ms."); | |||||
System.out.println("DBpediaLookup: " + (t9-t8) + "ms."); | |||||
System.out.println("* Total *: " + (t9-t1) + "ms."); | |||||
System.out.println("=========================="); | |||||
} | |||||
/** | |||||
* Behaves like system("pause") in C.
*/ | |||||
public static void systemPause () { | |||||
System.out.println("System pause ..."); | |||||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||||
try { | |||||
br.readLine(); | |||||
} catch (IOException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,9 @@ | |||||
package qa; | |||||
public class Matches { | |||||
public String[][] answers = null; | |||||
public int answersNum = 0; | |||||
public long time = 0; | |||||
public static final int pageNum = 3000; | |||||
} |
@@ -0,0 +1,128 @@ | |||||
package qa; | |||||
import java.util.ArrayList; | |||||
import nlp.ds.Sentence; | |||||
import qa.extract.EntityRecognition; | |||||
import rdf.MergedWord; | |||||
/** | |||||
* 1. preprocessing of question | |||||
* 2. Node Recognition | |||||
* @author husen | |||||
*/ | |||||
public class Query | |||||
{ | |||||
public String NLQuestion = null; | |||||
public String TransferedQuestion = null; | |||||
public ArrayList<String> MergedQuestionList = null; | |||||
public ArrayList<Sentence> sList = null; | |||||
public String queryId = null; | |||||
public String preLog = ""; | |||||
public ArrayList<MergedWord> mWordList = null; | |||||
public Query(){} | |||||
public Query(String _question) | |||||
{ | |||||
NLQuestion = _question; | |||||
NLQuestion = removeQueryId(NLQuestion); | |||||
TransferedQuestion = getTransferedQuestion(NLQuestion); | |||||
// step1. NODE Recognition | |||||
MergedQuestionList = getMergedQuestionList(TransferedQuestion); | |||||
// build Sentence | |||||
sList = new ArrayList<Sentence>(); | |||||
for(String mergedQuestion: MergedQuestionList) | |||||
{ | |||||
Sentence sentence = new Sentence(this, mergedQuestion); | |||||
sList.add(sentence); | |||||
} | |||||
} | |||||
public boolean isDigit(char ch) | |||||
{ | |||||
if(ch>='0' && ch<='9') | |||||
return true; | |||||
return false; | |||||
} | |||||
public boolean isUpperWord(char ch) | |||||
{ | |||||
if(ch>='A' && ch<='Z') | |||||
return true; | |||||
return false; | |||||
} | |||||
/** | |||||
* Replace some words with equivalent words:
* 1. words that the Stanford parser often parses incorrectly;
* 2. synonym unification, e.g. movie -> film.
* @param question
* @return the transferred question
*/ | |||||
public String getTransferedQuestion(String question) | |||||
{ | |||||
//rule1: discard "." because "." and "_" would be split apart by the parser; also discard a word-final "'", which may pollute NER
question = question.replace("' ", " "); | |||||
String [] words = question.split(" "); | |||||
String ret = ""; | |||||
for(String word: words) | |||||
{ | |||||
String retWord = word; | |||||
//TODO: now just check NUM in head/tail | |||||
if(word.length()>=2 && !isDigit(word.charAt(0)) && !isDigit(word.charAt(word.length()-1))) | |||||
{ | |||||
retWord = retWord.replace(".", ""); | |||||
} | |||||
ret += retWord + " "; | |||||
} | |||||
if(ret.length()>1) | |||||
ret = ret.substring(0,ret.length()-1); | |||||
ret = ret.replace("-", " "); | |||||
ret = ret.replace("in america", "in United States"); | |||||
//rule2: as well as -> and | |||||
ret = ret.replace("as well as", "and"); | |||||
//rule3: movie -> film | |||||
ret = ret.replace(" movie", " film"); | |||||
ret = ret.replace(" movies", " films"); | |||||
return ret; | |||||
} | |||||
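// Illustrative example (added for clarity): with the rules above, the question | |||||
// "Which movies did he direct as well as produce?" is transferred to | |||||
// "Which films did he direct and produce?" (rule2 turns "as well as" into "and"; rule3 maps movie(s) -> film(s)). | |||||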
/** | |||||
 * Recognize entities, types, and literals in the KB, and replace " " in phrases with "_" | |||||
* @param question | |||||
* @return merged question list | |||||
*/ | |||||
public ArrayList<String> getMergedQuestionList(String question) | |||||
{ | |||||
ArrayList<String> mergedQuestionList = null; | |||||
//entity & type recognition | |||||
EntityRecognition er = new EntityRecognition(); | |||||
mergedQuestionList = er.process(question); | |||||
preLog = er.preLog; | |||||
mWordList = er.mWordList; | |||||
return mergedQuestionList; | |||||
} | |||||
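// Illustrative example (added for clarity): removeQueryId("12\tWho founded Google?") | |||||
// sets queryId = "12" and returns "Who founded Google?"; questions without a leading | |||||
// numeric id followed by a tab pass through unchanged. | |||||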
public String removeQueryId(String question) | |||||
{ | |||||
String ret = question; | |||||
int st = question.indexOf("\t"); | |||||
if(st!=-1 && question.length()>1 && question.charAt(0)>='0' && question.charAt(0)<='9') | |||||
{ | |||||
queryId = question.substring(0,st); | |||||
ret = question.substring(st+1); | |||||
System.out.println("Extract QueryId :"+queryId); | |||||
} | |||||
return ret; | |||||
} | |||||
} |
@@ -0,0 +1,153 @@ | |||||
package qa.extract; | |||||
import java.util.ArrayList; | |||||
import java.util.HashSet; | |||||
import qa.Globals; | |||||
import log.QueryLogger; | |||||
import nlp.ds.DependencyTree; | |||||
import nlp.ds.DependencyTreeNode; | |||||
import nlp.ds.Word; | |||||
import rdf.SimpleRelation; | |||||
public class CorefResolution { | |||||
/** | |||||
 * 1. A very simple coreference resolution. | |||||
 * 2. Coreference resolution should be done after relation extraction and before item mapping. | |||||
*/ | |||||
public void process(ArrayList<SimpleRelation> simpleRelations, QueryLogger qlog) { | |||||
if (qlog.s.words.length <= 4) return; // if the sentence is too short, skip the coref step. | |||||
System.out.println("=====Co-reference resolution======="); | |||||
ArrayList<SimpleRelation> deleteList = new ArrayList<SimpleRelation>(); | |||||
for(SimpleRelation sr : simpleRelations) { | |||||
Word w1=null, w2=null; | |||||
if (sr.extractingMethod == 'S') { | |||||
w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog); | |||||
w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog); | |||||
} | |||||
else if (sr.extractingMethod == 'M') { | |||||
w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog); | |||||
w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog); | |||||
} | |||||
else { | |||||
continue; | |||||
} | |||||
if (w1 != null) { | |||||
sr.arg1Word_beforeCRR = sr.arg1Word; | |||||
sr.arg1Word = w1; | |||||
} | |||||
if (w2 != null) { | |||||
sr.arg2Word_beforeCRR = sr.arg2Word; | |||||
sr.arg2Word = w2; | |||||
} | |||||
if (sr.arg1Word == sr.arg2Word) | |||||
deleteList.add(sr); | |||||
} | |||||
simpleRelations.removeAll(deleteList); | |||||
printCRR(qlog); | |||||
System.out.println("==================================="); | |||||
} | |||||
// return the reference word of w | |||||
public Word getRefWord (Word w, DependencyTree dt, QueryLogger qlog) { | |||||
w = w.getNnHead(); | |||||
if (w.crr != null) { | |||||
return w.crr; | |||||
} | |||||
/* | |||||
* method: (suitable for stanford parser (old version)) | |||||
* (1) WDT --det--> [] eg: Which city is located in China? | |||||
* (2) WDT -------> V/J --rcmod--> [] eg: Who is married to someone that was born in Rome? | |||||
* "when is the sth" is conflict with this rule, so discarded. (3) W -------> be <------- [] eg: Who is the author of WikiLeaks? | |||||
* (4) WDT -------> V --ccomp--> [] eg: The actor that married the child of a politician. | |||||
* (5) DT(that, which) --dep--> V eg:The actors that married an athlete. // DS parser error. | |||||
* (6) W(position=1) ------> NN eg:What are the language used in China? // DS parser error, should eliminate "WRB":When was Carlo Giuliani shot? | |||||
* (7) where <--advmod-- V <--advcl-- V --prep/pobj--> [] eg: Who graduate from the school where Keqiang Li graduates? | |||||
*/ | |||||
DependencyTreeNode dtn = dt.getNodeByIndex(w.position); | |||||
// no need for root | |||||
if (dtn.father == null) return null; | |||||
try { | |||||
if(dtn.word.posTag.equals("WDT") && dtn.dep_father2child.equals("det")) { // (1) | |||||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); | |||||
w.crr = dtn.father.word.getNnHead(); | |||||
} | |||||
else if(dtn.word.posTag.startsWith("W") && !dtn.word.posTag.equals("WRB") && dtn.word.position == 1 && dtn.father.word.posTag.equals("NN")) { // (6) | |||||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); | |||||
w.crr = dtn.father.word.getNnHead(); | |||||
} | |||||
else if(dtn.word.posTag.equals("DT") | |||||
&& dtn.dep_father2child.equals("dep") | |||||
&& (dtn.word.baseForm.equals("that")||dtn.word.baseForm.equals("which"))) { // (5) | |||||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); | |||||
w.crr = dtn.father.word.getNnHead(); | |||||
} | |||||
// else if(dtn.word.posTag.startsWith("W") | |||||
// && dtn.father.word.baseForm.equals("be")) { // (3) //&& dtn.dep_father2child.equals("attr") | |||||
// DependencyTreeNode target = dtn.father.containDependencyWithChildren("nsubj"); | |||||
// if (target != null) { | |||||
// if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead()); | |||||
// w.crr = target.word.getNnHead(); | |||||
// } | |||||
// } | |||||
else if(dtn.word.posTag.equals("WDT") | |||||
&& (dtn.father.word.posTag.startsWith("V") || dtn.father.word.posTag.startsWith("J")) | |||||
&& dtn.father.dep_father2child.equals("rcmod")) { // (2) | |||||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead()); | |||||
w.crr = dtn.father.father.word.getNnHead(); | |||||
} | |||||
else if(dtn.word.posTag.equals("WDT") | |||||
&& dtn.father.word.posTag.startsWith("V") | |||||
&& dtn.father.dep_father2child.equals("ccomp")) { // (4) | |||||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead()); | |||||
w.crr = dtn.father.father.word.getNnHead(); | |||||
} | |||||
else if (dtn.word.baseForm.equals("where") | |||||
&& dtn.dep_father2child.equals("advmod") | |||||
&& dtn.father.dep_father2child.equals("advcl")) { // (7) | |||||
DependencyTreeNode target = dtn.father.father.containDependencyWithChildren("prep"); | |||||
if (target != null) { | |||||
target = target.containDependencyWithChildren("pobj"); | |||||
} | |||||
else { | |||||
for (DependencyTreeNode n : dtn.father.father.childrenList) { | |||||
if (Globals.pd.relns_object.contains(n.dep_father2child)) { | |||||
target = n; | |||||
} | |||||
} | |||||
} | |||||
if (target != null) { | |||||
if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead()); | |||||
w.crr = target.word.getNnHead(); | |||||
} | |||||
} | |||||
} catch (Exception e) {} // tolerate occasional parse-tree anomalies; no resolution in that case | |||||
return w.crr; | |||||
} | |||||
public void printCRR (QueryLogger qlog) { | |||||
HashSet<Word> printed = new HashSet<Word>(); | |||||
for (Word w : qlog.s.words) { | |||||
w = w.getNnHead(); | |||||
if (printed.contains(w)) | |||||
continue; | |||||
if (w.crr != null) | |||||
System.out.println("\""+w.getFullEntityName() + "\" is resoluted to \"" + w.crr.getFullEntityName() + "\""); | |||||
printed.add(w); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,918 @@ | |||||
package qa.extract; | |||||
import java.io.BufferedReader; | |||||
//import java.io.File; | |||||
//import java.io.FileInputStream; | |||||
//import java.io.FileNotFoundException; | |||||
//import java.io.FileOutputStream; | |||||
import java.io.IOException; | |||||
import java.io.InputStreamReader; | |||||
//import java.io.OutputStreamWriter; | |||||
//import java.io.UnsupportedEncodingException; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.Comparator; | |||||
import java.util.HashMap; | |||||
import java.util.List; | |||||
import lcn.EntityFragmentFields; | |||||
import fgmt.EntityFragment; | |||||
import nlp.ds.Word; | |||||
import qa.Globals; | |||||
import rdf.EntityMapping; | |||||
import rdf.NodeSelectedWithScore; | |||||
import rdf.TypeMapping; | |||||
import rdf.MergedWord; | |||||
import utils.FileUtil; | |||||
import addition.*; | |||||
/** | |||||
* Core class of Node Recognition | |||||
* @author husen | |||||
*/ | |||||
public class EntityRecognition { | |||||
public String preLog = ""; | |||||
public String stopEntFilePath = Globals.localPath + "data/DBpedia2016/parapharse/stopEntDict.txt"; | |||||
double EntAcceptedScore = 26; | |||||
double TypeAcceptedScore = 0.5; | |||||
double AcceptedDiffScore = 1; | |||||
public HashMap<String, String> m2e = null; | |||||
public ArrayList<MergedWord> mWordList = null; | |||||
public ArrayList<String> stopEntList = null; | |||||
public ArrayList<String> badTagListForEntAndType = null; | |||||
ArrayList<ArrayList<Integer>> selectedList = null; | |||||
TypeRecognition tr = null; | |||||
AddtionalFix af = null; | |||||
public EntityRecognition() | |||||
{ | |||||
// LOG | |||||
preLog = ""; | |||||
loadStopEntityDict(); | |||||
// Bad posTags for entity & type | |||||
badTagListForEntAndType = new ArrayList<String>(); | |||||
badTagListForEntAndType.add("RBS"); | |||||
badTagListForEntAndType.add("JJS"); | |||||
badTagListForEntAndType.add("W"); | |||||
badTagListForEntAndType.add("."); | |||||
badTagListForEntAndType.add("VBD"); | |||||
badTagListForEntAndType.add("VBN"); | |||||
badTagListForEntAndType.add("VBZ"); | |||||
badTagListForEntAndType.add("VBP"); | |||||
badTagListForEntAndType.add("POS"); | |||||
// !Hand-written entity linking (keys in lower case) | |||||
m2e = new HashMap<String, String>(); | |||||
m2e.put("bipolar_syndrome", "Bipolar_disorder"); | |||||
m2e.put("battle_in_1836_in_san_antonio", "Battle_of_San_Jacinto"); | |||||
m2e.put("federal_minister_of_finance_in_germany", "Federal_Ministry_of_Finance_(Germany)"); | |||||
// Additional fix for CATEGORY (in DBpedia) | |||||
af = new AddtionalFix(); | |||||
tr = new TypeRecognition(); | |||||
System.out.println("EntityRecognizer Initial : ok!"); | |||||
} | |||||
public void loadStopEntityDict() | |||||
{ | |||||
stopEntList = new ArrayList<String>(); | |||||
try | |||||
{ | |||||
List<String> inputs = FileUtil.readFile(stopEntFilePath); | |||||
for(String line: inputs) | |||||
{ | |||||
if(line.startsWith("#")) | |||||
continue; | |||||
stopEntList.add(line); | |||||
} | |||||
} | |||||
catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
public ArrayList<String> process(String question) | |||||
{ | |||||
ArrayList<String> fixedQuestionList = new ArrayList<String>(); | |||||
ArrayList<Integer> literalList = new ArrayList<Integer>(); | |||||
HashMap<Integer, Double> entityScores = new HashMap<Integer, Double>(); | |||||
HashMap<Integer, Integer> entityMappings = new HashMap<Integer, Integer>(); | |||||
HashMap<Integer, Double> typeScores = new HashMap<Integer, Double>(); | |||||
HashMap<Integer, String> typeMappings = new HashMap<Integer, String>(); | |||||
HashMap<Integer, Double> mappingScores = new HashMap<Integer, Double>(); | |||||
ArrayList<Integer> mustSelectedList = new ArrayList<Integer>(); | |||||
System.out.println("--------- entity/type recognition start ---------"); | |||||
Word[] words = Globals.coreNLP.getTaggedWords(question); | |||||
mWordList = new ArrayList<MergedWord>(); | |||||
long t1 = System.currentTimeMillis(); | |||||
int checkEntCnt = 0, checkTypeCnt = 0, hitEntCnt = 0, hitTypeCnt = 0, allCnt = 0; | |||||
boolean needRemoveCommas = false; | |||||
// Check entity & type | |||||
// Notice, ascending order by length | |||||
StringBuilder tmpOW = new StringBuilder(); | |||||
StringBuilder tmpBW = new StringBuilder(); | |||||
for(int len=1; len<=words.length; len++) | |||||
{ | |||||
for(int st=0,ed=st+len; ed<=words.length; st++,ed++) | |||||
{ | |||||
String originalWord = "", baseWord = "", allUpperWord = ""; | |||||
//String[] posTagArr = new String[len]; | |||||
for(int j=st; j<ed; j++) | |||||
{ | |||||
//posTagArr[j-st] = words[j].posTag; | |||||
//originalWord += words[j].originalForm; | |||||
//baseWord += words[j].baseForm; | |||||
tmpOW.append(words[j].originalForm); | |||||
tmpBW.append(words[j].baseForm); | |||||
String tmp = words[j].originalForm; | |||||
if(tmp.length()>0 && tmp.charAt(0) >='a' && tmp.charAt(0)<='z') | |||||
{ | |||||
String pre = tmp.substring(0,1).toUpperCase(); | |||||
tmp = pre + tmp.substring(1); | |||||
} | |||||
allUpperWord += tmp; | |||||
if(j < ed-1) | |||||
{ | |||||
//originalWord += "_"; | |||||
//baseWord += "_"; | |||||
tmpOW.append("_"); | |||||
tmpBW.append("_"); | |||||
} | |||||
} | |||||
originalWord = tmpOW.toString(); | |||||
baseWord=tmpBW.toString(); | |||||
tmpOW.setLength(0); | |||||
tmpBW.setLength(0); | |||||
allCnt++; | |||||
/* | |||||
* Filters to save time and drop some bad cases. | |||||
*/ | |||||
boolean entOmit = false, typeOmit = false; | |||||
int prep_cnt=0; | |||||
// Upper words can pass filter. eg: "Melbourne , Florida" | |||||
int UpperWordCnt = 0; | |||||
for(int i=st;i<ed;i++) | |||||
if((words[i].originalForm.charAt(0)>='A' && words[i].originalForm.charAt(0)<='Z') | |||||
|| ((words[i].posTag.equals(",") || words[i].originalForm.equals("'")) && i>st && i<ed-1)) | |||||
UpperWordCnt++; | |||||
// Filters | |||||
if(UpperWordCnt<len || st==0) | |||||
{ | |||||
if(st==0) | |||||
{ | |||||
if(!words[st].posTag.startsWith("DT") && !words[st].posTag.startsWith("N")) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
} | |||||
else if(st>0) | |||||
{ | |||||
Word formerWord = words[st-1]; | |||||
//as princess | |||||
if(formerWord.baseForm.equals("as")) | |||||
entOmit = true; | |||||
//how many dogs? | |||||
if(formerWord.baseForm.equals("many")) | |||||
entOmit = true; | |||||
//obama's daughter ; your height | len=1 to avoid: Asimov's Foundation series | |||||
if(len == 1 && (formerWord.posTag.startsWith("POS") || formerWord.posTag.startsWith("PRP"))) | |||||
entOmit = true; | |||||
//the father of you | |||||
if(ed<words.length) | |||||
{ | |||||
Word nextWord = words[ed]; | |||||
if(formerWord.posTag.equals("DT") && nextWord.posTag.equals("IN")) | |||||
entOmit = true; | |||||
} | |||||
//the area code of ; the official language of | |||||
boolean flag1=false, flag2=false; | |||||
for(int i=0;i<=st;i++) | |||||
if(words[i].posTag.equals("DT")) | |||||
flag1 = true; | |||||
for(int i=ed-1;i<words.length;i++) | |||||
if(words[i].posTag.equals("IN")) | |||||
flag2 = true; | |||||
if(flag1 && flag2) | |||||
entOmit = true; | |||||
} | |||||
if(ed < words.length) | |||||
{ | |||||
Word nextWord = words[ed]; | |||||
// (lowerCase)+(UpperCase) | |||||
if(nextWord.originalForm.charAt(0)>='A' && nextWord.originalForm.charAt(0)<='Z') | |||||
entOmit = true; | |||||
} | |||||
for(int i=st;i<ed;i++) | |||||
{ | |||||
if(words[i].posTag.startsWith("I")) | |||||
prep_cnt++; | |||||
for(String badTag: badTagListForEntAndType) | |||||
{ | |||||
if(words[i].posTag.startsWith(badTag)) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
break; | |||||
} | |||||
} | |||||
if(words[i].posTag.startsWith("P") && (i!=ed-1 || len==1)){ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
// First word | |||||
if(i==st) | |||||
{ | |||||
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("EX") || words[i].posTag.startsWith("TO")) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
if(words[i].posTag.startsWith("D") && len==2){ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
if(words[i].baseForm.startsWith("list") || words[i].baseForm.startsWith("many")) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
if(words[i].baseForm.equals("and")) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
} | |||||
// Last word. | |||||
if(i==ed-1) | |||||
{ | |||||
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("D") || words[i].posTag.startsWith("TO")) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
if(words[i].baseForm.equals("and")) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
} | |||||
// Single word. | |||||
if(len==1) | |||||
{ | |||||
//TODO: Omit general noun. eg: father, book ... | |||||
if(!words[i].posTag.startsWith("N")) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
} | |||||
} | |||||
// Too many prepositions. | |||||
if(prep_cnt >= 3) | |||||
{ | |||||
entOmit = true; | |||||
typeOmit = true; | |||||
} | |||||
} | |||||
/* | |||||
* Filter done. | |||||
*/ | |||||
// Search category | highest priority | |||||
String category = null; | |||||
if(af.pattern2category.containsKey(baseWord)) | |||||
{ | |||||
typeOmit = true; | |||||
entOmit = true; | |||||
category = af.pattern2category.get(baseWord); | |||||
} | |||||
// Search type | |||||
int hitMethod = 0; // 1=dbo(baseWord), 2=dbo(originalWord), 3=yago|extend() | |||||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||||
if(!typeOmit) | |||||
{ | |||||
System.out.println("Type Check: "+originalWord); | |||||
//checkTypeCnt++; | |||||
//search standard type | |||||
tmList = tr.getTypeIDsAndNamesByStr(baseWord); | |||||
if(tmList == null || tmList.size() == 0) | |||||
{ | |||||
tmList = tr.getTypeIDsAndNamesByStr(originalWord); | |||||
if(tmList != null && tmList.size()>0) | |||||
hitMethod = 2; | |||||
} | |||||
else | |||||
hitMethod = 1; | |||||
//Search extend type (YAGO type) | |||||
if(tmList == null || tmList.size() == 0) | |||||
{ | |||||
tmList = tr.getExtendTypeByStr(allUpperWord); | |||||
if(tmList != null && tmList.size() > 0) | |||||
{ | |||||
preLog += "++++ Extend Type detect: "+baseWord+": "+" prefferd relaiton:"+tmList.get(0).prefferdRelation+"\n"; | |||||
hitMethod = 3; | |||||
} | |||||
} | |||||
} | |||||
// Search entity | |||||
ArrayList<EntityMapping> emList = new ArrayList<EntityMapping>(); | |||||
if(!entOmit && !stopEntList.contains(baseWord)) | |||||
{ | |||||
System.out.println("Ent Check: "+originalWord); | |||||
checkEntCnt++; | |||||
// Notice, the second parameter is whether use DBpedia Lookup. | |||||
emList = getEntityIDsAndNamesByStr(originalWord, (UpperWordCnt>=len-1 || len==1),len); | |||||
if(emList == null || emList.size() == 0) | |||||
{ | |||||
emList = getEntityIDsAndNamesByStr(baseWord, (UpperWordCnt>=len-1 || len==1), len); | |||||
} | |||||
if(emList!=null && emList.size()>10) | |||||
{ | |||||
ArrayList<EntityMapping> tmpList = new ArrayList<EntityMapping>(); | |||||
for(int i=0;i<10;i++) | |||||
{ | |||||
tmpList.add(emList.get(i)); | |||||
} | |||||
emList = tmpList; | |||||
} | |||||
} | |||||
MergedWord mWord = new MergedWord(st,ed,originalWord); | |||||
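// Note (added for clarity): a span [st, ed) is packed into one int as | |||||
//   key = st * (words.length + 1) + ed | |||||
// and recovered via st = key / (words.length + 1), ed = key % (words.length + 1). | |||||
// E.g., with words.length == 6, the span [2, 4) gives key = 2 * 7 + 4 = 18. | |||||
// The base (words.length + 1) ensures ed (which can equal words.length) never carries into st. | |||||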
// Add category | |||||
if(category != null) | |||||
{ | |||||
mWord.mayCategory = true; | |||||
mWord.category = category; | |||||
int key = st*(words.length+1) + ed; | |||||
mustSelectedList.add(key); | |||||
} | |||||
// Add literal | |||||
if(len==1 && checkLiteralWord(words[st])) | |||||
{ | |||||
mWord.mayLiteral = true; | |||||
int key = st*(words.length+1) + ed; | |||||
literalList.add(key); | |||||
} | |||||
// Add type mappings | |||||
if(tmList!=null && tmList.size()>0) | |||||
{ | |||||
// Drop by score threshold | |||||
if(tmList.get(0).score < TypeAcceptedScore) | |||||
typeOmit = true; | |||||
// Only allow EXACT MATCH when method=1|2 | |||||
// TODO: consider approximate match and taxonomy. eg, actor->person | |||||
String likelyType = tmList.get(0).typeName.toLowerCase(); | |||||
String candidateBase = baseWord.replace("_", ""), candidateOriginal = originalWord.replace("_", "").toLowerCase(); | |||||
if(!candidateBase.equals(likelyType) && hitMethod == 1) | |||||
typeOmit = true; | |||||
if(!candidateOriginal.equals(likelyType) && hitMethod == 2) | |||||
typeOmit = true; | |||||
if(!typeOmit) | |||||
{ | |||||
mWord.mayType = true; | |||||
mWord.tmList = tmList; | |||||
int key = st*(words.length+1) + ed; | |||||
typeMappings.put(key, tmList.get(0).typeName); | |||||
typeScores.put(key, tmList.get(0).score); | |||||
} | |||||
} | |||||
// Add entity mappings | |||||
if(emList!=null && emList.size()>0) | |||||
{ | |||||
// Drop by score threshold | |||||
if(emList.get(0).score < EntAcceptedScore) | |||||
entOmit = true; | |||||
// Drop: the [German Shepherd] dog | |||||
else if(len > 2) | |||||
{ | |||||
for(int key: entityMappings.keySet()) | |||||
{ | |||||
//int te=key%(words.length+1); | |||||
int ts=key/(words.length+1); | |||||
if(ts == st+1 && ts <= ed) | |||||
{ | |||||
//DT in lowercase (allow uppercase, such as: [The Pillars of the Earth]) | |||||
if(words[st].posTag.startsWith("DT") && !(words[st].originalForm.charAt(0)>='A'&&words[st].originalForm.charAt(0)<='Z')) | |||||
{ | |||||
entOmit = true; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
// Record info in merged word | |||||
if(!entOmit) | |||||
{ | |||||
mWord.mayEnt = true; | |||||
mWord.emList = emList; | |||||
// used to remove duplicates and to select | |||||
int key = st*(words.length+1) + ed; | |||||
entityMappings.put(key, emList.get(0).entityID); | |||||
// fix entity score | conflict resolution | |||||
double score = emList.get(0).score; | |||||
String likelyEnt = emList.get(0).entityName.toLowerCase().replace(" ", "_"); | |||||
String lowerOriginalWord = originalWord.toLowerCase(); | |||||
// !Award: whole match | |||||
if(likelyEnt.equals(lowerOriginalWord)) | |||||
score *= len; | |||||
// !Award: COVER (eg, Robert Kennedy: [Robert] [Kennedy] [Robert Kennedy]) | |||||
//For cases like Social_Democratic_Party, any combination of the three words is an entity, producing too many plans; "merge or not merge" matters more than "which one to pick on conflict" (and in practice most errors are merge/no-merge errors), so smaller entities covered by this one are simply discarded here | |||||
//For cases like Abraham_Lincoln, the "no merge" plan recognizes two separate nodes whose total score can exceed that of the correct answer, so such words are marked as must-select | |||||
if(len>1) | |||||
{ | |||||
boolean[] flag = new boolean[words.length+1]; | |||||
ArrayList<Integer> needlessEntList = new ArrayList<Integer>(); | |||||
double tmpScore=0; | |||||
for(int preKey: entityMappings.keySet()) | |||||
{ | |||||
if(preKey == key) | |||||
continue; | |||||
int te=preKey%(words.length+1),ts=preKey/(words.length+1); | |||||
for(int i=ts;i<te;i++) | |||||
flag[i] = true; | |||||
if(st<=ts && ed>= te) | |||||
{ | |||||
needlessEntList.add(preKey); | |||||
tmpScore += entityScores.get(preKey); | |||||
} | |||||
} | |||||
int hitCnt = 0; | |||||
for(int i=st;i<ed;i++) | |||||
if(flag[i]) | |||||
hitCnt++; | |||||
// WHOLE match || HIGH match & HIGH upper || WHOLE upper | |||||
if(hitCnt == len || ((double)hitCnt/(double)len > 0.6 && (double)UpperWordCnt/(double)len > 0.6) || UpperWordCnt == len || len>=4) | |||||
{ | |||||
//If the phrase contains a comma, the words on both sides must appear in the mapped entity | |||||
//e.g., Melbourne_,_Florida: "Melbourne, Florida" must be selected, while California_,_USA: "Malibu, California" is not necessarily correct | |||||
boolean commaTotalRight = true; | |||||
if(originalWord.contains(",")) | |||||
{ | |||||
String candidateCompactString = originalWord.replace(",","").replace("_", "").toLowerCase(); | |||||
String likelyCompactEnt = likelyEnt.replace(",","").replace("_", ""); | |||||
if(!candidateCompactString.equals(likelyCompactEnt)) | |||||
commaTotalRight = false; | |||||
else | |||||
{ | |||||
mWord.name = mWord.name.replace("_,_","_"); | |||||
needRemoveCommas = true; | |||||
} | |||||
} | |||||
if(commaTotalRight) | |||||
{ | |||||
mustSelectedList.add(key); | |||||
if(tmpScore>score) | |||||
score = tmpScore+1; | |||||
for(int preKey: needlessEntList) | |||||
{ | |||||
entityMappings.remove(preKey); | |||||
mustSelectedList.remove(Integer.valueOf(preKey)); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
//NOTICE: the score stored in mWord is unchanged; we only change the score in entityScores. | |||||
entityScores.put(key,score); | |||||
} | |||||
} | |||||
if(mWord.mayCategory || mWord.mayEnt || mWord.mayType || mWord.mayLiteral) | |||||
mWordList.add(mWord); | |||||
} | |||||
} | |||||
/* Print all candidates (use fixed score).*/ | |||||
System.out.println("------- Result ------"); | |||||
for(MergedWord mWord: mWordList) | |||||
{ | |||||
int key = mWord.st * (words.length+1) + mWord.ed; | |||||
if(mWord.mayCategory) | |||||
{ | |||||
System.out.println("Detect category mapping: "+mWord.name+": "+ mWord.category +" score: 100.0"); | |||||
preLog += "++++ Category detect: "+mWord.name+": "+mWord.category+" score: 100.0\n"; | |||||
} | |||||
if(mWord.mayEnt) | |||||
{ | |||||
System.out.println("Detect entity mapping: "+mWord.name+": ["); | |||||
for(EntityMapping em: mWord.emList) | |||||
System.out.print(em.entityName + ", "); | |||||
System.out.println("]"); | |||||
preLog += "++++ Entity detect: "+mWord.name+": "+mWord.emList.get(0).entityName+" score:"+entityScores.get(key)+"\n"; | |||||
hitEntCnt++; | |||||
} | |||||
if(mWord.mayType) | |||||
{ | |||||
System.out.println("Detect type mapping: "+mWord.name+": ["); | |||||
for(TypeMapping tm: mWord.tmList) | |||||
System.out.print(tm.typeName + ", "); | |||||
System.out.println("]"); | |||||
preLog += "++++ Type detect: "+mWord.name+": "+mWord.tmList.get(0).typeName +" score:"+typeScores.get(key)+"\n"; | |||||
hitTypeCnt++; | |||||
} | |||||
if(mWord.mayLiteral) | |||||
{ | |||||
System.out.println("Detect literal: "+mWord.name); | |||||
preLog += "++++ Literal detect: "+mWord.name+"\n"; | |||||
} | |||||
} | |||||
/* | |||||
	 * Sort by score and remove duplicates. | |||||
	 * eg, <"video_game" "ent:Video game" "50.0"> <"a_video_game" "ent:Video game" "45.0">. | |||||
	 * Notice, all information is preserved in mWordList. | |||||
*/ | |||||
// one entity may map to several merged words in the query; keep the one with the higher score. | |||||
ByValueComparator bvc = new ByValueComparator(entityScores,words.length+1); | |||||
List<Integer> keys = new ArrayList<Integer>(entityMappings.keySet()); | |||||
Collections.sort(keys, bvc); | |||||
for(Integer key : keys) | |||||
{ | |||||
if(!mappingScores.containsKey(entityMappings.get(key))) | |||||
mappingScores.put(entityMappings.get(key), entityScores.get(key)); | |||||
else | |||||
entityMappings.remove(key); | |||||
} | |||||
selectedList = new ArrayList<ArrayList<Integer>>(); | |||||
ArrayList<Integer> selected = new ArrayList<Integer>(); | |||||
// Some phrases must be selected. | |||||
selected.addAll(mustSelectedList); | |||||
for(Integer key: typeMappings.keySet()) | |||||
{ | |||||
// !type(len>1) (omit len=1 because, e.g., [Brooklyn Bridge] is an entity) | |||||
int ed = key%(words.length+1), st = key/(words.length+1); | |||||
if(st+1 < ed) | |||||
{ | |||||
boolean beCovered = false; | |||||
//Entity covers type, eg:[prime_minister of Spain] | |||||
for(int preKey: entityMappings.keySet()) | |||||
{ | |||||
int te=preKey%(words.length+1),ts=preKey/(words.length+1); | |||||
//Entity should be longer than the type | |||||
if(ts <= st && te >= ed && ed-st < te-ts) | |||||
{ | |||||
beCovered = true; | |||||
} | |||||
} | |||||
if(!beCovered) | |||||
selected.add(key); | |||||
} | |||||
} | |||||
// Conflict resolution | |||||
ArrayList<Integer> noConflictSelected = new ArrayList<Integer>(); | |||||
//select the longer one on conflict | |||||
boolean[] flag = new boolean[words.length]; | |||||
ByLenComparator blc = new ByLenComparator(words.length+1); | |||||
Collections.sort(selected,blc); | |||||
for(Integer key : selected) | |||||
{ | |||||
int ed = key%(words.length+1), st = (key-ed)/(words.length+1); | |||||
boolean omit = false; | |||||
for(int i=st;i<ed;i++) | |||||
{ | |||||
if(flag[i]) | |||||
{ | |||||
omit = true; | |||||
break; | |||||
} | |||||
} | |||||
if(omit) | |||||
continue; | |||||
for(int i=st;i<ed;i++) | |||||
flag[i]=true; | |||||
noConflictSelected.add(key); | |||||
} | |||||
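// Worked example (added for clarity): given candidate spans [0,3), [2,4), [3,5) processed | |||||
// longest-first, [0,3) is kept, [2,4) is dropped (it overlaps a kept span), and [3,5) is kept. | |||||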
// Scoring and ranking --> top-k decision | |||||
dfs(keys,0,noConflictSelected,words.length+1); | |||||
ArrayList<NodeSelectedWithScore> nodeSelectedWithScoreList = new ArrayList<NodeSelectedWithScore>(); | |||||
for(ArrayList<Integer> select: selectedList) | |||||
{ | |||||
double score = 0; | |||||
for(Integer key: select) | |||||
{ | |||||
if(entityScores.containsKey(key)) | |||||
score += entityScores.get(key); | |||||
if(typeScores.containsKey(key)) | |||||
score += typeScores.get(key); | |||||
} | |||||
NodeSelectedWithScore tmp = new NodeSelectedWithScore(select, score); | |||||
nodeSelectedWithScoreList.add(tmp); | |||||
} | |||||
Collections.sort(nodeSelectedWithScoreList); | |||||
// Replace | |||||
int cnt = 0; | |||||
for(int k=0; k<nodeSelectedWithScoreList.size(); k++) | |||||
{ | |||||
if(k >= nodeSelectedWithScoreList.size()) | |||||
break; | |||||
selected = nodeSelectedWithScoreList.get(k).selected; | |||||
Collections.sort(selected); | |||||
int j = 0; | |||||
String res = question; | |||||
if(selected.size()>0) | |||||
{ | |||||
res = words[0].originalForm; | |||||
int tmp = selected.get(j++), st = tmp/(words.length+1), ed = tmp%(words.length+1); | |||||
for(int i=1;i<words.length;i++) | |||||
{ | |||||
if(i>st && i<ed) | |||||
{ | |||||
res = res+"_"+words[i].originalForm; | |||||
} | |||||
else | |||||
{ | |||||
res = res+" "+words[i].originalForm; | |||||
} | |||||
if(i >= ed && j<selected.size()) | |||||
{ | |||||
tmp = selected.get(j++); | |||||
st = tmp/(words.length+1); | |||||
ed = tmp%(words.length+1); | |||||
} | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
res = words[0].originalForm; | |||||
for(int i=1;i<words.length;i++) | |||||
{ | |||||
res = res+" "+words[i].originalForm; | |||||
} | |||||
} | |||||
boolean ok = true; | |||||
for(String str: fixedQuestionList) | |||||
if(str.equals(res)) | |||||
ok = false; | |||||
if(!ok) | |||||
continue; | |||||
if(needRemoveCommas) | |||||
res = res.replace("_,_","_"); | |||||
System.out.println("Merged: "+res); | |||||
preLog += "plan "+cnt+": "+res+"\n"; | |||||
fixedQuestionList.add(res); | |||||
cnt++; | |||||
if(cnt >= 3) // top-3 | |||||
break; | |||||
} | |||||
long t2 = System.currentTimeMillis(); | |||||
// preLog += "Total hit/check/all ent num: "+hitEntCnt+" / "+checkEntCnt+" / "+allCnt+"\n"; | |||||
// preLog += "Total hit/check/all type num: "+hitTypeCnt+" / "+checkTypeCnt+" / "+allCnt+"\n"; | |||||
preLog += "Node Recognition time: "+ (t2-t1) + "ms\n"; | |||||
System.out.println("Total check time: "+ (t2-t1) + "ms"); | |||||
System.out.println("--------- pre entity/type recognition end ---------"); | |||||
return fixedQuestionList; | |||||
} | |||||
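// dfs() below enumerates every conflict-free subset of the candidate keys (a 0/1 depth-first | |||||
// search). Illustrative trace (assuming an empty initial selection): for two overlapping keys | |||||
// k1 and k2, selectedList receives {}, {k2}, {k1}, but never {k1, k2}, since overlapping | |||||
// spans fail the conflict test. | |||||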
public void dfs(List<Integer> keys,int dep,ArrayList<Integer> selected,int size) | |||||
{ | |||||
if(dep == keys.size()) | |||||
{ | |||||
ArrayList<Integer> tmpList = (ArrayList<Integer>) selected.clone(); | |||||
selectedList.add(tmpList); | |||||
} | |||||
else | |||||
{ | |||||
//branch 1 (off): skip the dep-th candidate | |||||
dfs(keys,dep+1,selected,size); | |||||
//branch 2 (on): take the dep-th candidate only if it conflicts with nothing already selected | |||||
boolean conflict = false; | |||||
for(int preKey: selected) | |||||
{ | |||||
int curKey = keys.get(dep); | |||||
int preEd = preKey%size, preSt = (preKey-preEd)/size; | |||||
int curEd = curKey%size, curSt = (curKey-curEd)/size; | |||||
if(!(preSt<preEd && preEd<=curSt && curSt<curEd) && !(curSt<curEd && curEd<=preSt && preSt<preEd)) | |||||
conflict = true; | |||||
} | |||||
if(!conflict) | |||||
{ | |||||
selected.add(keys.get(dep)); | |||||
dfs(keys,dep+1,selected,size); | |||||
selected.remove(keys.get(dep)); | |||||
} | |||||
} | |||||
} | |||||
public ArrayList<EntityMapping> getEntityIDsAndNamesByStr(String entity, boolean useDblk, int len) | |||||
{ | |||||
String n = entity; | |||||
ArrayList<EntityMapping> ret= new ArrayList<EntityMapping>(); | |||||
//1. Hand-written mappings | |||||
if(m2e.containsKey(entity)) | |||||
{ | |||||
String eName = m2e.get(entity); | |||||
EntityMapping em = new EntityMapping(EntityFragmentFields.entityName2Id.get(eName), eName, 1000); | |||||
ret.add(em); | |||||
return ret; //hand-written mappings are assumed always correct | |||||
} | |||||
//2. Lucene index | |||||
ret.addAll(EntityFragment.getEntityMappingList(n)); | |||||
//3. DBpedia Lookup (some cases) | |||||
if (useDblk) | |||||
{ | |||||
ret.addAll(Globals.dblk.getEntityMappings(n, null)); | |||||
} | |||||
Collections.sort(ret); | |||||
if (ret.size() > 0) return ret; | |||||
else return null; | |||||
} | |||||
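// Usage sketch (added for clarity; argument values are illustrative): | |||||
//   getEntityIDsAndNamesByStr("Barack_Obama", true, 2) | |||||
// checks the hand-written map first, then the Lucene index, and finally DBpedia Lookup, | |||||
// returning the mappings sorted (best first, per EntityMapping's natural order), or null | |||||
// when nothing matches. | |||||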
public int preferDBpediaLookupOrLucene(String entityName) | |||||
{ | |||||
int cntUpperCase = 0; | |||||
int cntSpace = 0; | |||||
int cntPoint = 0; | |||||
int length = entityName.length(); | |||||
for (int i=0; i<length; i++) | |||||
{ | |||||
char c = entityName.charAt(i); | |||||
if (c==' ') | |||||
cntSpace++; | |||||
else if (c=='.') | |||||
cntPoint++; | |||||
else if (c>='A' && c<='Z') | |||||
cntUpperCase++; | |||||
} | |||||
if ((cntUpperCase>0 || cntPoint>0) && cntSpace<3) | |||||
return 1; | |||||
if (cntUpperCase == length) | |||||
return 1; | |||||
return 0; | |||||
} | |||||
static class ByValueComparator implements Comparator<Integer> { | |||||
HashMap<Integer, Double> base_map; | |||||
int base_size; | |||||
double eps = 1e-8; | |||||
int dblcmp(double a,double b) | |||||
{ | |||||
if(a+eps < b) | |||||
return -1; | |||||
return b+eps<a ? 1:0; | |||||
} | |||||
public ByValueComparator(HashMap<Integer, Double> base_map, Integer size) { | |||||
this.base_map = base_map; | |||||
this.base_size = size; | |||||
} | |||||
public int compare(Integer arg0, Integer arg1) { | |||||
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) { | |||||
return 0; | |||||
} | |||||
if (dblcmp(base_map.get(arg0),base_map.get(arg1))<0) { | |||||
return 1; | |||||
} | |||||
else if (dblcmp(base_map.get(arg0),base_map.get(arg1))==0) | |||||
{ | |||||
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; | |||||
if (len0 < len1) { | |||||
return 1; | |||||
} else if (len0 == len1) { | |||||
return 0; | |||||
} else { | |||||
return -1; | |||||
} | |||||
} | |||||
else { | |||||
return -1; | |||||
} | |||||
} | |||||
} | |||||
static class ByLenComparator implements Comparator<Integer> { | |||||
int base_size; | |||||
public ByLenComparator(int size) { | |||||
this.base_size = size; | |||||
} | |||||
public int compare(Integer arg0, Integer arg1) { | |||||
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; | |||||
if (len0 < len1) { | |||||
return 1; | |||||
} else if (len0 == len1) { | |||||
return 0; | |||||
} else { | |||||
return -1; | |||||
} | |||||
} | |||||
} | |||||
public boolean isDigit(char ch) | |||||
{ | |||||
if(ch>='0' && ch<='9') | |||||
return true; | |||||
return false; | |||||
} | |||||
//TODO: other literal words. | |||||
public boolean checkLiteralWord(Word word) | |||||
{ | |||||
boolean ok = false; | |||||
if(word.posTag.equals("CD")) | |||||
ok = true; | |||||
return ok; | |||||
} | |||||
public static void main (String[] args) | |||||
{ | |||||
Globals.init(); | |||||
EntityRecognition er = new EntityRecognition(); | |||||
try | |||||
{ | |||||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||||
while (true) | |||||
{ | |||||
System.out.println("Please input the question: "); | |||||
String question = br.readLine(); | |||||
er.process(question); | |||||
} | |||||
// File inputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_in.txt"); | |||||
// File outputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_out.txt"); | |||||
// BufferedReader fr = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile),"utf-8")); | |||||
// OutputStreamWriter fw = new OutputStreamWriter(new FileOutputStream(outputFile,true),"utf-8"); | |||||
// | |||||
// String input; | |||||
// while((input=fr.readLine())!=null) | |||||
// { | |||||
// String[] strArray = input.split("\t"); | |||||
// String id = ""; | |||||
// String question = strArray[0]; | |||||
// if(strArray.length>1) | |||||
// { | |||||
// question = strArray[1]; | |||||
// id = strArray[0]; | |||||
// } | |||||
// //Notice "?" may leads lucene/dbpedia lookup error | |||||
// if(question.length()>1 && question.charAt(question.length()-1)=='.' || question.charAt(question.length()-1)=='?') | |||||
// question = question.substring(0,question.length()-1); | |||||
// if(question.isEmpty()) | |||||
// continue; | |||||
// er.process(question); | |||||
// fw.write("Id: "+id+"\nQuery: "+question+"\n"); | |||||
// fw.write(er.preLog+"\n"); | |||||
// } | |||||
// | |||||
// fr.close(); | |||||
// fw.close(); | |||||
} catch (IOException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,467 @@ | |||||
package qa.extract; | |||||
import java.io.BufferedReader; | |||||
//import java.io.IOException; | |||||
import java.io.InputStreamReader; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.Comparator; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import java.util.List; | |||||
import paradict.ParaphraseDictionary; | |||||
import qa.Globals; | |||||
import rdf.Sparql; | |||||
import rdf.Triple; | |||||
import rdf.ImplicitRelation; | |||||
import lcn.EntityFragmentFields; | |||||
import log.QueryLogger; | |||||
import fgmt.EntityFragment; | |||||
import fgmt.TypeFragment; | |||||
import nlp.ds.Word; | |||||
import nlp.tool.CoreNLP; | |||||
public class ExtractImplicitRelation { | |||||
static final int SamplingNumber = 100; // the maximum sampling number in calculation | |||||
static final int k = 3; // select top-k when many suitable relations; select top-k entities for a word | |||||
public HashMap<String, Integer> implicitEntRel = new HashMap<String, Integer>(); | |||||
/* | |||||
* Implicit Relations: | |||||
* eg, Which is the film directed by Obama and starred by a Chinese ?x | |||||
* 1. [What] is in a [chocolate_chip_cookie] ?var + ent | |||||
* 2. What [country] is [Sitecore] from ?type + ent = [?var p ent + ?var<-type] | |||||
* 3. Czech movies | Chinese actor ent + ?type | |||||
* 4. President Obama type + ent | |||||
* 5. Andy Liu's Hero(film) ent + ent | |||||
* */ | |||||
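// Illustrative sketch for case 3 (names assumed, not from the original code): "Czech movies" | |||||
// should yield triples like  ?movies <country> Czech_Republic  and  ?movies rdf:type Film, | |||||
// produced by supplementTriplesByModifyWord() below. | |||||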
public ExtractImplicitRelation() | |||||
{ | |||||
//original word in lower case | |||||
implicitEntRel.put("american", Globals.pd.predicate_2_id.get("country")); | |||||
implicitEntRel.put("united_states", Globals.pd.predicate_2_id.get("country")); | |||||
} | |||||
// Notice: this is usually UNNECESSARY for two constants, so the function is left unimplemented. | |||||
// eg, "president Obama", "Andy Liu's Hero(film)". | |||||
public ArrayList<Integer> getPrefferdPidListBetweenTwoConstant(Word w1, Word w2) | |||||
{ | |||||
ArrayList<Integer> res = new ArrayList<Integer>(); | |||||
int w1Role = 0, w2Role = 0; // 0:var 1:ent 2:type | |||||
if(w1.mayEnt && w1.emList.size()>0) | |||||
w1Role = 1; | |||||
if(w1.mayType && w1.tmList.size()>0) | |||||
w1Role = 2; | |||||
if(w2.mayEnt && w2.emList.size()>0) | |||||
w2Role = 1; | |||||
if(w2.mayType && w2.tmList.size()>0) | |||||
w2Role = 2; | |||||
//Reject variables | two types | |||||
if(w1Role == 0 || w2Role == 0 || (w1Role == 2 && w2Role == 2)) | |||||
return null; | |||||
//ent1 & ent2 | |||||
//if(w1Role == 1 && w2Role == 1) | |||||
//{ | |||||
//EntityFragment ef = null; | |||||
// TODO: implement. | |||||
//} | |||||
return res; | |||||
} | |||||
public ArrayList<Triple> supplementTriplesByModifyWord(QueryLogger qlog) | |||||
{ | |||||
ArrayList<Triple> res = new ArrayList<Triple>(); | |||||
ArrayList<Word> typeVariableList = new ArrayList<Word>(); | |||||
// Modifier | |||||
for(Word word: qlog.s.words) | |||||
{ | |||||
if(word.modifiedWord != null && word.modifiedWord != word) | |||||
{ | |||||
ArrayList<ImplicitRelation> irList = null; | |||||
// ent -> typeVariable | eg, Chinese actor, Czech movies | TODO: consider more types of modifier | |||||
if(word.mayEnt && word.modifiedWord.mayType) | |||||
{ | |||||
typeVariableList.add(word.modifiedWord); | |||||
int tId = word.modifiedWord.tmList.get(0).typeID; // select the top-1 type | |||||
String tName = word.modifiedWord.originalForm; | |||||
for(int i=0; i<k&&i<word.emList.size(); i++) // select the top-k entities | |||||
{ | |||||
int eId = word.emList.get(i).entityID; | |||||
String eName = word.emList.get(i).entityName; | |||||
irList = getPrefferdPidListBetween_Entity_TypeVariable(eId, tId); | |||||
// !Hand-written implicit relations | |||||
if(irList != null && implicitEntRel.containsKey(word.originalForm.toLowerCase())) | |||||
{ | |||||
int pId = implicitEntRel.get(word.originalForm.toLowerCase()); | |||||
ImplicitRelation ir = new ImplicitRelation(tId, eId, pId, 1000); | |||||
irList.add(0, ir); | |||||
} | |||||
if(irList!=null && irList.size()>0) | |||||
{ | |||||
ImplicitRelation ir = irList.get(0); | |||||
String subjName = null, objName = null; | |||||
Word subjWord = null, objWord = null; | |||||
if(ir.subjId == eId) | |||||
{ | |||||
subjName = eName; | |||||
objName = "?"+tName; | |||||
subjWord = word; | |||||
objWord = word.modifiedWord; | |||||
} | |||||
else | |||||
{ | |||||
subjName = "?"+tName; | |||||
objName = eName; | |||||
subjWord = word.modifiedWord; | |||||
objWord = word; | |||||
} | |||||
Triple triple = new Triple(ir.subjId, subjName, ir.pId, ir.objId, objName, null, ir.score, subjWord, objWord); | |||||
res.add(triple); | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
if(qlog.rankedSparqls == null || qlog.rankedSparqls.size() == 0) | |||||
{ | |||||
if(res != null && res.size() > 0) | |||||
{ | |||||
Sparql spq = new Sparql(); | |||||
for(Triple t: res) | |||||
spq.addTriple(t); | |||||
// Add type info | |||||
for(Word typeVar: typeVariableList) | |||||
{ | |||||
Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+typeVar.originalForm, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeVar.tmList.get(0).typeName, null, 100); | |||||
spq.addTriple(triple); | |||||
} | |||||
qlog.rankedSparqls.add(spq); | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
// Supplement implicit relations (modified) for each SPARQL. | |||||
for(Sparql spq: qlog.rankedSparqls) | |||||
{ | |||||
for(Triple t: res) | |||||
spq.addTriple(t); | |||||
} | |||||
} | |||||
return res; | |||||
} | |||||
/* | |||||
* eg:Czech|ent movies|?type Chinese|ent actor|?type | |||||
	 * type variable + entity -> entities belonging to the type + entity | |||||
* */ | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_TypeVariable(Integer entId, Integer typeId) | |||||
{ | |||||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||||
TypeFragment tf = TypeFragment.typeFragments.get(typeId); | |||||
EntityFragment ef2 = EntityFragment.getEntityFragmentByEntityId(entId); | |||||
if(tf == null || ef2 == null) | |||||
{ | |||||
System.out.println("Error in getPrefferdPidListBetween_TypeVariable_Entity :Type(" + | |||||
TypeFragment.typeId2ShortName.get(typeId) + ") or Entity(" + EntityFragmentFields.entityId2Name.get(entId) + ") no fragments."); | |||||
return null; | |||||
} | |||||
// sample entities belonging to the type and count their relations | TODO: random sampling | |||||
int samplingCnt = 0; | |||||
HashMap<ImplicitRelation, Integer> irCount = new HashMap<ImplicitRelation, Integer>(); | |||||
for(int candidateEid: tf.entSet) | |||||
{ | |||||
EntityFragment ef1 = EntityFragment.getEntityFragmentByEntityId(candidateEid); | |||||
if(ef1 == null) | |||||
continue; | |||||
ArrayList<ImplicitRelation> tmp = getPrefferdPidListBetween_TwoEntities(ef1, ef2); | |||||
if(tmp == null || tmp.size() == 0) | |||||
continue; | |||||
if(samplingCnt++ > SamplingNumber) | |||||
break; | |||||
for(ImplicitRelation ir: tmp) | |||||
{ | |||||
if(ir.subjId == candidateEid) | |||||
ir.setSubjectId(Triple.VAR_ROLE_ID); | |||||
else if(ir.objId == candidateEid) | |||||
ir.setObjectId(Triple.VAR_ROLE_ID); | |||||
if(irCount.containsKey(ir)) | |||||
irCount.put(ir, irCount.get(ir)+1); | |||||
else | |||||
irCount.put(ir, 1); | |||||
} | |||||
} | |||||
//sort, get top-k | |||||
ByValueComparator bvc = new ByValueComparator(irCount); | |||||
List<ImplicitRelation> keys = new ArrayList<ImplicitRelation>(irCount.keySet()); | |||||
Collections.sort(keys, bvc); | |||||
for(ImplicitRelation ir: keys) | |||||
{ | |||||
res.add(ir); | |||||
if(res.size() >= k) | |||||
break; | |||||
} | |||||
return res; | |||||
} | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_TypeVariable(String entName, String typeName) | |||||
{ | |||||
if(!TypeFragment.typeShortName2IdList.containsKey(typeName) || !EntityFragmentFields.entityName2Id.containsKey(entName)) | |||||
return null; | |||||
return getPrefferdPidListBetween_Entity_TypeVariable(EntityFragmentFields.entityName2Id.get(entName), TypeFragment.typeShortName2IdList.get(typeName).get(0)); | |||||
} | |||||
static class ByValueComparator implements Comparator<ImplicitRelation> { | |||||
HashMap<ImplicitRelation, Integer> base_map; | |||||
public ByValueComparator(HashMap<ImplicitRelation, Integer> base_map) { | |||||
this.base_map = base_map; | |||||
} | |||||
public int compare(ImplicitRelation arg0, ImplicitRelation arg1) { | |||||
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) | |||||
return 0; | |||||
// unbox once: comparing boxed Integers with == compares references, not values | |||||
int v0 = base_map.get(arg0), v1 = base_map.get(arg1); | |||||
if (v0 < v1) | |||||
return 1; | |||||
else if (v0 == v1) | |||||
return 0; | |||||
else | |||||
return -1; | |||||
} | |||||
} | |||||
/* | |||||
	 * Notice, this function is in fact never used. | |||||
	 * eg:[What] is in a [chocolate_chip_cookie] | |||||
	 * Just guess from the single entity: select its most frequent edge. | |||||
* */ | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(Integer entId, String var) | |||||
{ | |||||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||||
EntityFragment ef = null; | |||||
ef = EntityFragment.getEntityFragmentByEntityId(entId); | |||||
if(ef == null) | |||||
{ | |||||
System.out.println("Error in getPrefferdPidListBetween_Entity_Variable: Entity No Fragments!"); | |||||
return null; | |||||
} | |||||
// find most frequent inEdge | |||||
int pid = findMostFrequentEdge(ef.inEntMap, ef.inEdges); | |||||
if(pid != -1) | |||||
res.add(new ImplicitRelation(Triple.VAR_ROLE_ID, entId, pid, 100)); | |||||
// find most frequent outEdge | |||||
pid = findMostFrequentEdge(ef.outEntMap, ef.outEdges); | |||||
if(pid != -1) | |||||
res.add(new ImplicitRelation(entId, Triple.VAR_ROLE_ID, pid, 100)); | |||||
return res; | |||||
} | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(String entName, String var) | |||||
{ | |||||
return getPrefferdPidListBetween_Entity_Variable(EntityFragmentFields.entityName2Id.get(entName), var); | |||||
} | |||||
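// Example (added for clarity; ids illustrative): if entMap is {e1 -> [p3, p5], e2 -> [p3]}, | |||||
// edge p3 occurs twice, so findMostFrequentEdge() below returns p3's predicate id. | |||||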
public int findMostFrequentEdge(HashMap<Integer, ArrayList<Integer>> entMap, HashSet<Integer> edges) | |||||
{ | |||||
int mfPredicateId = -1, maxCount = 0; | |||||
HashMap<Integer, Integer> edgeCount = new HashMap<Integer, Integer>(); | |||||
for(int key: entMap.keySet()) | |||||
{ | |||||
for(int edge: entMap.get(key)) | |||||
{ | |||||
if(!edgeCount.containsKey(edge)) | |||||
edgeCount.put(edge, 1); | |||||
else | |||||
edgeCount.put(edge, edgeCount.get(edge)+1); | |||||
if(maxCount < edgeCount.get(edge)) | |||||
{ | |||||
maxCount = edgeCount.get(edge); | |||||
mfPredicateId = edge; | |||||
} | |||||
} | |||||
} | |||||
return mfPredicateId; | |||||
} | |||||
// Unnecessary. | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TypeConstant_Entity(Integer typeId, Integer entId) | |||||
{ | |||||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||||
TypeFragment tf = TypeFragment.typeFragments.get(typeId); | |||||
if(tf == null) | |||||
{ | |||||
System.out.println("Error in getPrefferdPidListBetween_TypeConstant_Entity: Type No Fragments!"); | |||||
return null; | |||||
} | |||||
// subj : ent1 | |||||
if(tf.entSet.contains(entId)) | |||||
{ | |||||
ImplicitRelation ir = new ImplicitRelation(entId, typeId, Globals.pd.typePredicateID, 100); | |||||
res.add(ir); | |||||
} | |||||
return res; | |||||
} | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(String eName1, String eName2) | |||||
{ | |||||
return getPrefferdPidListBetween_TwoEntities(EntityFragmentFields.entityName2Id.get(eName1), EntityFragmentFields.entityName2Id.get(eName2)); | |||||
} | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(Integer eId1, Integer eId2) | |||||
{ | |||||
EntityFragment ef1 = null, ef2 = null; | |||||
ef1 = EntityFragment.getEntityFragmentByEntityId(eId1); | |||||
ef2 = EntityFragment.getEntityFragmentByEntityId(eId2); | |||||
if(ef1 == null || ef2 == null) | |||||
{ | |||||
System.out.println("Error in GetPrefferdPidListBetweenTwoEntities: Entity No Fragments!"); | |||||
return null; | |||||
} | |||||
return getPrefferdPidListBetween_TwoEntities(ef1,ef2); | |||||
} | |||||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(EntityFragment ef1, EntityFragment ef2) | |||||
{ | |||||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||||
if(ef1 == null || ef2 == null) | |||||
return null; | |||||
int eId1 = ef1.eId; | |||||
int eId2 = ef2.eId; | |||||
// subj : ent1 | |||||
if(ef1.outEntMap.containsKey(eId2)) | |||||
{ | |||||
ArrayList<Integer> pidList = ef1.outEntMap.get(eId2); | |||||
for(int pid: pidList) | |||||
{ | |||||
// TODO: other score strategy | |||||
ImplicitRelation ir = new ImplicitRelation(eId1, eId2, pid, 100); | |||||
res.add(ir); | |||||
} | |||||
} | |||||
// subj : ent2 | |||||
else if(ef2.outEntMap.containsKey(eId1)) | |||||
{ | |||||
ArrayList<Integer> pidList = ef2.outEntMap.get(eId1); | |||||
for(int pid: pidList) | |||||
{ | |||||
ImplicitRelation ir = new ImplicitRelation(eId2, eId1, pid, 100); | |||||
res.add(ir); | |||||
} | |||||
} | |||||
return res; | |||||
} | |||||
public static void main(String[] args) throws Exception { | |||||
Globals.coreNLP = new CoreNLP(); | |||||
Globals.pd = new ParaphraseDictionary(); | |||||
try | |||||
{ | |||||
EntityFragmentFields.load(); | |||||
TypeFragment.load(); | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
ExtractImplicitRelation eir = new ExtractImplicitRelation(); | |||||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||||
String name1,name2; | |||||
while(true) | |||||
{ | |||||
System.out.println("Input two node to extract their implicit relations:"); | |||||
name1 = br.readLine(); | |||||
name2 = br.readLine(); | |||||
ArrayList<ImplicitRelation> irList = null; | |||||
irList = eir.getPrefferdPidListBetween_TwoEntities(name1, name2); | |||||
if(irList == null || irList.size()==0) | |||||
System.out.println("Can't find!"); | |||||
else | |||||
{ | |||||
for(ImplicitRelation ir: irList) | |||||
{ | |||||
int pId = ir.pId; | |||||
String p = Globals.pd.getPredicateById(pId); | |||||
System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); | |||||
System.out.println(ir.subj+"\t"+p+"\t"+ir.obj); | |||||
} | |||||
} | |||||
// irList = eir.getPrefferdPidListBetween_TypeConstant_Entity(name1, name2); | |||||
// if(irList == null || irList.size()==0) | |||||
// System.out.println("Can't find!"); | |||||
// else | |||||
// { | |||||
// for(ImplicitRelation ir: irList) | |||||
// { | |||||
// int pId = ir.pId; | |||||
// String p = Globals.pd.getPredicateById(pId); | |||||
// System.out.println(ir.subj+"\t"+p+"\t"+ir.obj); | |||||
// } | |||||
// } | |||||
// irList = eir.getPrefferdPidListBetween_Entity_Variable(name1, name2); | |||||
// if(irList == null || irList.size()==0) | |||||
// System.out.println("Can't find!"); | |||||
// else | |||||
// { | |||||
// for(ImplicitRelation ir: irList) | |||||
// { | |||||
// int pId = ir.pId; | |||||
// String p = Globals.pd.getPredicateById(pId); | |||||
// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); | |||||
// } | |||||
// } | |||||
// irList = eir.getPrefferdPidListBetween_Entity_TypeVariable(name1, name2); | |||||
// if(irList == null || irList.size()==0) | |||||
// System.out.println("Can't find!"); | |||||
// else | |||||
// { | |||||
// for(ImplicitRelation ir: irList) | |||||
// { | |||||
// int pId = ir.pId; | |||||
// String p = Globals.pd.getPredicateById(pId); | |||||
// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); | |||||
// } | |||||
// } | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,472 @@ | |||||
package qa.extract; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import java.util.LinkedList; | |||||
import java.util.Queue; | |||||
import log.QueryLogger; | |||||
import nlp.ds.DependencyTree; | |||||
import nlp.ds.DependencyTreeNode; | |||||
//import nlp.ds.Word; | |||||
import paradict.ParaphraseDictionary; | |||||
import qa.Globals; | |||||
import rdf.SimpleRelation; | |||||
import rdf.PredicateMapping; | |||||
import rdf.SemanticRelation; | |||||
import rdf.SemanticUnit; | |||||
public class ExtractRelation { | |||||
public static final int notMatchedCountThreshold = 1; // the bigger, the looser (more relations can be extracted) | |||||
public static final int notCoverageCountThreshold = 2; | |||||
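// Example (added for clarity): with notMatchedCountThreshold = 1, a pattern may miss the | |||||
// question's bag of words by at most one (non-stopword) word and still be kept; two or more | |||||
// misses reject it. | |||||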
/* | |||||
* Find relations by dependency tree & paraphrases. | |||||
* */ | |||||
public ArrayList<SimpleRelation> findRelationsBetweenTwoUnit(SemanticUnit su1, SemanticUnit su2, QueryLogger qlog) | |||||
{ | |||||
DependencyTree T = qlog.s.dependencyTreeStanford; | |||||
if(qlog.isMaltParserUsed) | |||||
T = qlog.s.dependencyTreeMalt; | |||||
DependencyTreeNode n1 = T.getNodeByIndex(su1.centerWord.position), n2 = T.getNodeByIndex(su2.centerWord.position); | |||||
ArrayList<DependencyTreeNode> shortestPath = T.getShortestNodePathBetween(n1,n2); | |||||
ArrayList<SimpleRelation> ret = new ArrayList<SimpleRelation>(); | |||||
HashSet<String> BoW_T = new HashSet<String>(); | |||||
HashSet<String> SubBoW_T = new HashSet<String>(); | |||||
// (Fix shortest path) Some cases need words outside the shortest path | eg: What [be] [ent] (famous) for? | |||||
// The path is what-be-[ent]; the word [be] is useless, but we need (famous) | |||||
if(shortestPath.size() == 3 && shortestPath.get(1).word.baseForm.equals("be") && T.nodesList.size() > shortestPath.get(2).word.position) | |||||
{ | |||||
shortestPath.remove(1); | |||||
shortestPath.add(1, T.getNodeByIndex(shortestPath.get(1).word.position + 1)); | |||||
} | |||||
// Shortest path -> SubBag of Words | |||||
for(DependencyTreeNode curNode: shortestPath) | |||||
{ | |||||
String text = curNode.word.baseForm; | |||||
if(!curNode.word.isIgnored && !Globals.stopWordsList.isStopWord(text)) | |||||
{ | |||||
//!split merged words | eg, soccer club -> soccer_club (after node recognition) -> soccer club (used when matching paraphrases) | |||||
if(curNode.word.mayEnt || curNode.word.mayType) | |||||
{ | |||||
String [] strArray = curNode.word.baseForm.split("_"); | |||||
for(String str: strArray) | |||||
SubBoW_T.add(str); | |||||
} | |||||
else | |||||
{ | |||||
SubBoW_T.add(text); | |||||
} | |||||
} | |||||
} | |||||
// DS tree -> Bag of Words | |||||
for (DependencyTreeNode curNode : T.getNodesList()) | |||||
{ | |||||
if (!curNode.word.isIgnored) | |||||
{ | |||||
String text = curNode.word.baseForm; | |||||
if(curNode.word.mayEnt || curNode.word.mayType) | |||||
{ | |||||
String [] strArray = curNode.word.baseForm.split("_"); | |||||
for(String str: strArray) | |||||
BoW_T.add(str); | |||||
} | |||||
else | |||||
{ | |||||
BoW_T.add(text); | |||||
} | |||||
} | |||||
} | |||||
// Find candidate patterns by SubBoW_T & invertedIndex | |||||
HashSet<String> candidatePatterns = new HashSet<String>(); | |||||
for (String curWord : SubBoW_T) | |||||
{ | |||||
ArrayList<String> postingList = Globals.pd.invertedIndex.get(curWord); | |||||
if (postingList != null) | |||||
{ | |||||
candidatePatterns.addAll(postingList); | |||||
} | |||||
} | |||||
// Check patterns by BoW_P & subtree matching | |||||
int notMatchedCount = 0; | |||||
HashSet<String> validCandidatePatterns = new HashSet<String>(); | |||||
for (String p : candidatePatterns) | |||||
{ | |||||
String[] BoW_P = p.split(" "); | |||||
notMatchedCount = 0; // number of pattern words not matched in the question | |||||
for (String s : BoW_P) | |||||
{ | |||||
if (s.length() < 2) | |||||
continue; | |||||
if (s.startsWith("[")) | |||||
continue; | |||||
if (Globals.stopWordsList.isStopWord(s)) | |||||
continue; | |||||
if (!BoW_T.contains(s)) | |||||
{ | |||||
notMatchedCount ++; | |||||
if (notMatchedCount > notMatchedCountThreshold) | |||||
break; | |||||
} | |||||
} | |||||
if (notMatchedCount <= notMatchedCountThreshold) | |||||
{ | |||||
validCandidatePatterns.add(p); | |||||
//TODO: to support matching like [soccer_club] | |||||
subTreeMatching(p, BoW_P, shortestPath, T, qlog, ret, 'S'); | |||||
} | |||||
} | |||||
// Another chance for [soccer_club] (relations embedded in node words)
if(validCandidatePatterns.size() > 0) | |||||
{ | |||||
if(n1.word.originalForm.contains("_") || n2.word.originalForm.contains("_")) | |||||
{ | |||||
for (String p : validCandidatePatterns) | |||||
{ | |||||
String[] BoW_P = p.split(" "); | |||||
notMatchedCount = 0; | |||||
int mappedCharacterCount = 0; | |||||
int matchedWordInArg = 0; | |||||
boolean[] matchedFlag = new boolean[BoW_P.length]; | |||||
for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;} | |||||
int idx = 0; | |||||
for (String s : BoW_P) | |||||
{ | |||||
if(n1.word.baseForm.contains(s) || n2.word.baseForm.contains(s)) // Hit nodes | |||||
matchedWordInArg++; | |||||
if(BoW_T.contains(s)) | |||||
{ | |||||
mappedCharacterCount += s.length(); | |||||
matchedFlag[idx] = true; | |||||
} | |||||
idx++; | |||||
if (s.length() < 2) | |||||
continue; | |||||
if (s.startsWith("[")) | |||||
continue; | |||||
if (Globals.stopWordsList.isStopWord(s)) | |||||
continue; | |||||
if (!BoW_T.contains(s)) | |||||
notMatchedCount ++; | |||||
} | |||||
// Succeed if there are at least 2 hits
if(matchedWordInArg >= 2) | |||||
{ | |||||
double matched_score = ((double)(BoW_P.length-notMatchedCount))/((double)(BoW_P.length)); | |||||
if (matched_score > 0.95) | |||||
matched_score *= 10; // award for WHOLE match | |||||
// TODO: this gives LONGER patterns LARGER scores, which is sometimes unsuitable | e.g., "be bear die in"
matched_score = matched_score * Math.sqrt(mappedCharacterCount); | |||||
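// Worked example (illustrative numbers): BoW_P = {"be","bear","in"}, notMatchedCount = 0
// -> ratio 3/3 = 1.0 > 0.95 -> *10 for the whole match; mappedCharacterCount = 2+4+2 = 8
// -> final score = 10 * sqrt(8), roughly 28.3.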
SimpleRelation sr = new SimpleRelation(); | |||||
sr.arg1Word = n1.word; | |||||
sr.arg2Word = n2.word; | |||||
sr.relationParaphrase = p; | |||||
sr.matchingScore = matched_score; | |||||
sr.extractingMethod = 'X'; | |||||
if (n1.dep_father2child.endsWith("subj")) | |||||
sr.preferredSubj = sr.arg1Word; | |||||
sr.arg1Word.setIsCovered(); | |||||
sr.arg2Word.setIsCovered(); | |||||
sr.setPasList(p, matched_score, matchedFlag); | |||||
sr.setPreferedSubjObjOrder(T); | |||||
ret.add(sr); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
return ret; | |||||
} | |||||
// Core function of paraphrase matching | |||||
private void subTreeMatching (String pattern, String[] BoW_P, | |||||
ArrayList<DependencyTreeNode> shortestPath, | |||||
DependencyTree T, QueryLogger qlog, | |||||
ArrayList<SimpleRelation> ret, char extractingMethod) | |||||
{ | |||||
DependencyTreeNode n1 = shortestPath.get(0); | |||||
DependencyTreeNode n2 = shortestPath.get(shortestPath.size()-1); | |||||
ParaphraseDictionary pd = Globals.pd; | |||||
Queue<DependencyTreeNode> queue = new LinkedList<DependencyTreeNode>(); | |||||
queue.add(T.getRoot()); | |||||
for(DependencyTreeNode curOuterNode: shortestPath) | |||||
{ | |||||
outer: | |||||
for(String s: BoW_P) | |||||
{ | |||||
if(s.equals(curOuterNode.word.baseForm)) | |||||
{ | |||||
// try to match all nodes | |||||
ArrayList<DependencyTreeNode> subTreeNodes = new ArrayList<DependencyTreeNode>(); | |||||
Queue<DependencyTreeNode> queue2 = new LinkedList<DependencyTreeNode>(); | |||||
queue2.add(curOuterNode); | |||||
int unMappedLeft = BoW_P.length; | |||||
int mappedCharacterCount = 0; | |||||
int hitPathCnt = 0; // words in the pattern that hit the shortest path
int hitPathBetweenTwoArgCnt = 0; // words in the pattern that hit the shortest path, excluding the two target nodes
double mappedCharacterCountPunishment = 0; // penalty when the pattern contains [[...]] (function-word placeholders)
DependencyTreeNode curNode; | |||||
boolean[] matchedFlag = new boolean[BoW_P.length]; | |||||
for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;} | |||||
while (unMappedLeft > 0 && (curNode=queue2.poll())!=null) | |||||
{ | |||||
if (curNode.word.isIgnored) continue; | |||||
int idx = 0; | |||||
for (String ss : BoW_P) | |||||
{ | |||||
// each word in the pattern can be matched only once
if (!matchedFlag[idx]) | |||||
{ | |||||
// check word | |||||
if (ss.equals(curNode.word.baseForm)) | |||||
{ | |||||
unMappedLeft --; | |||||
subTreeNodes.add(curNode); | |||||
queue2.addAll(curNode.childrenList); | |||||
matchedFlag[idx] = true; | |||||
mappedCharacterCount += ss.length(); | |||||
if(shortestPath.contains(curNode)) | |||||
{ | |||||
hitPathCnt++; | |||||
if(curNode!=n1 && curNode!=n2) | |||||
hitPathBetweenTwoArgCnt++; | |||||
} | |||||
break; | |||||
} | |||||
// check POS tag | |||||
else if (ss.startsWith("[") && posSame(curNode.word.posTag, ss)) | |||||
{ | |||||
unMappedLeft --; | |||||
subTreeNodes.add(curNode); | |||||
queue2.addAll(curNode.childrenList); | |||||
matchedFlag[idx] = true; | |||||
mappedCharacterCount += curNode.word.baseForm.length(); | |||||
mappedCharacterCountPunishment += 0.01; | |||||
break; | |||||
} | |||||
} | |||||
idx ++; | |||||
} | |||||
} | |||||
int unMatchedNoneStopWordCount = 0; | |||||
int matchedNoneStopWordCount = 0; | |||||
for (int idx = 0; idx < BoW_P.length; idx ++) { | |||||
if (BoW_P[idx].startsWith("[")) continue; | |||||
if (!matchedFlag[idx]) { | |||||
if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // unmatched | |||||
unMatchedNoneStopWordCount ++; | |||||
} | |||||
else { | |||||
if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // matched | |||||
matchedNoneStopWordCount ++; | |||||
} | |||||
} | |||||
if (unMatchedNoneStopWordCount > notMatchedCountThreshold) {
if(qlog.MODE_debug) System.out.println("----But the pattern \"" + pattern + "\" is not a subtree.");
break outer;
}
// MUST match at least one content word (non-stop words > 0)
if (matchedNoneStopWordCount == 0){ | |||||
if(qlog.MODE_debug) System.out.println("----But the matching for pattern \"" + pattern + "\" does not have content words."); | |||||
break outer; | |||||
} | |||||
// If the match is partial and the matched part is itself another complete pattern, give up the current pattern
if (unMappedLeft > 0) { | |||||
StringBuilder subpattern = new StringBuilder(); | |||||
for (int idx = 0; idx < BoW_P.length; idx ++) { | |||||
if (matchedFlag[idx]) { | |||||
subpattern.append(BoW_P[idx]); | |||||
subpattern.append(' '); | |||||
} | |||||
} | |||||
subpattern.deleteCharAt(subpattern.length()-1); | |||||
if (pd.nlPattern_2_predicateList.containsKey(subpattern.toString())) { // use toString(): a StringBuilder key never equals a String key
if(qlog.MODE_debug) System.out.println("----But the partially matched pattern \"" + pattern + "\" is another pattern."); | |||||
break outer; | |||||
} | |||||
} | |||||
// !Preposition handling | assume there is only one preposition
// TODO: consider more prepositions | the first preposition found may be the wrong one
DependencyTreeNode prep = null; | |||||
for (DependencyTreeNode dtn : subTreeNodes) { | |||||
outer2: | |||||
for (DependencyTreeNode dtn_child : dtn.childrenList) { | |||||
if(pd.prepositions.contains(dtn_child.word.baseForm)) { | |||||
prep = dtn_child; | |||||
break outer2; | |||||
} | |||||
} | |||||
} | |||||
boolean isContained = false; | |||||
for(DependencyTreeNode dtn_contain : subTreeNodes) { | |||||
if(dtn_contain == prep) isContained = true; | |||||
} | |||||
if(!isContained && prep != null) { | |||||
subTreeNodes.add(prep); | |||||
} | |||||
// Relation extracted, set COVER flags | |||||
for (DependencyTreeNode dtn : subTreeNodes) | |||||
{ | |||||
dtn.word.isCovered = true; | |||||
} | |||||
int cnt = 0; | |||||
double matched_score = ((double)(BoW_P.length-unMappedLeft))/((double)(BoW_P.length)); | |||||
if (matched_score > 0.95) | |||||
matched_score *= 10; // Award for WHOLE match | |||||
// The larger the overlap between the pattern and the shortest path, the higher the score, especially when the overlap avoids the two target nodes
if(hitPathCnt != 0) | |||||
{ | |||||
double hitScore = 1 + (double)hitPathCnt/(double)BoW_P.length; | |||||
if(hitPathBetweenTwoArgCnt == hitPathCnt) | |||||
hitScore += 1; | |||||
else if(shortestPath.size() >= 4) // if the path is long enough but the pattern still overlaps the target nodes, apply a penalty
{ | |||||
//hitScore = 0.5; | |||||
if(hitPathBetweenTwoArgCnt == 0) // if the pattern overlaps ONLY the target nodes, penalize heavily
hitScore = 0.25; | |||||
} | |||||
matched_score *= hitScore; | |||||
} | |||||
matched_score = matched_score * Math.sqrt(mappedCharacterCount) - mappedCharacterCountPunishment; // the longer the match, the better (unsuitable in some cases)
if (qlog.MODE_debug) System.out.println("☆" + pattern + ", score=" + matched_score); | |||||
DependencyTreeNode subject = n1; | |||||
DependencyTreeNode object = n2; | |||||
if (subject != object) | |||||
{ | |||||
SimpleRelation sr = new SimpleRelation(); | |||||
sr.arg1Word = subject.word; | |||||
sr.arg2Word = object.word; | |||||
sr.relationParaphrase = pattern; | |||||
sr.matchingScore = matched_score; | |||||
sr.extractingMethod = extractingMethod; | |||||
if (subject.dep_father2child.endsWith("subj")) | |||||
sr.preferredSubj = sr.arg1Word; | |||||
sr.arg1Word.setIsCovered(); | |||||
sr.arg2Word.setIsCovered(); | |||||
sr.setPasList(pattern, matched_score, matchedFlag); | |||||
sr.setPreferedSubjObjOrder(T); | |||||
ret.add(sr); | |||||
cnt ++; | |||||
//String binaryRelation = "<" + subjectString + "> <" + pattern + "> <" + objectString + ">"; | |||||
} | |||||
if (cnt == 0) break outer; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
// [[det]], [[num]], [[adj]], [[pro]], [[prp]], [[con]], [[mod]] | |||||
public boolean posSame(String tag, String posWithBracket) { | |||||
if ( (posWithBracket.charAt(2) == 'd' && tag.equals("DT")) | |||||
|| (posWithBracket.charAt(2) == 'n' && tag.equals("CD")) | |||||
|| (posWithBracket.charAt(2) == 'a' && (tag.startsWith("JJ") || tag.startsWith("RB"))) | |||||
|| (posWithBracket.charAt(2) == 'c' && tag.startsWith("CC"))//TODO: how about "IN: subordinating conjunction"? | |||||
|| (posWithBracket.charAt(2) == 'm' && tag.equals("MD"))) { | |||||
return true; | |||||
} | |||||
else if (posWithBracket.charAt(2) == 'p') { | |||||
if ( (posWithBracket.charAt(4) == 'o' && tag.startsWith("PR")) | |||||
|| (posWithBracket.charAt(4) == 'p' && (tag.equals("IN") || tag.equals("TO")))) { | |||||
return true; | |||||
} | |||||
} | |||||
return false; | |||||
} | |||||
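// Examples derived from the rules above: posSame("DT", "[[det]]") == true,
// posSame("CD", "[[num]]") == true, posSame("JJ", "[[adj]]") == true,
// posSame("PRP", "[[pro]]") == true (pronoun), posSame("IN", "[[prp]]") == true (preposition).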
public HashMap<Integer, SemanticRelation> groupSimpleRelationsByArgsAndMapPredicate (ArrayList<SimpleRelation> simpleRelations) { | |||||
System.out.println("==========Group Simple Relations========="); | |||||
HashMap<Integer, SemanticRelation> ret = new HashMap<Integer, SemanticRelation>(); | |||||
HashMap<Integer, HashMap<Integer, StringAndDouble>> key2pasMap = new HashMap<Integer, HashMap<Integer, StringAndDouble>>(); | |||||
for(SimpleRelation simr : simpleRelations) | |||||
{ | |||||
int key = simr.getHashCode(); | |||||
if (!ret.keySet().contains(key)) | |||||
{ | |||||
ret.put(key, new SemanticRelation(simr)); | |||||
key2pasMap.put(key, new HashMap<Integer, StringAndDouble>()); | |||||
} | |||||
SemanticRelation semr = ret.get(key); | |||||
HashMap<Integer, StringAndDouble> pasMap = key2pasMap.get(key); | |||||
// Used only for display.
if (simr.matchingScore > semr.LongestMatchingScore) | |||||
{ | |||||
semr.LongestMatchingScore = simr.matchingScore; | |||||
semr.relationParaphrase = simr.relationParaphrase; | |||||
} | |||||
// For each predicate id, no matter which pattern it came from, we record only the highest score and the corresponding pattern.
for (int pid : simr.pasList.keySet()) { | |||||
double score = simr.pasList.get(pid); | |||||
if (!pasMap.containsKey(pid)) { | |||||
pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score)); | |||||
} | |||||
else if (score > pasMap.get(pid).score) { | |||||
pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score)); | |||||
} | |||||
} | |||||
} | |||||
for (Integer key : key2pasMap.keySet()) { | |||||
SemanticRelation semr = ret.get(key); | |||||
HashMap<Integer, StringAndDouble> pasMap = key2pasMap.get(key); | |||||
semr.predicateMappings = new ArrayList<PredicateMapping>(); | |||||
//System.out.print("<"+semr.arg1Word.getFullEntityName() + "," + semr.arg2Word.getFullEntityName() + ">:"); | |||||
for (Integer pid : pasMap.keySet()) | |||||
{ | |||||
semr.predicateMappings.add(new PredicateMapping(pid, pasMap.get(pid).score, pasMap.get(pid).str)); | |||||
//System.out.print("[" + Globals.pd.getPredicateById(pid) + "," + pasMap.get(pid).str + "," + pasMap.get(pid).score + "]"); | |||||
} | |||||
Collections.sort(semr.predicateMappings); | |||||
} | |||||
System.out.println("========================================="); | |||||
return ret; | |||||
} | |||||
} | |||||
class StringAndDouble { | |||||
public String str; | |||||
public double score; | |||||
public StringAndDouble (String str, double score) { | |||||
this.str = str; | |||||
this.score = score; | |||||
} | |||||
} |
@@ -0,0 +1,358 @@ | |||||
package qa.extract; | |||||
import java.io.BufferedReader; | |||||
import java.io.InputStreamReader; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashMap; | |||||
import nlp.ds.Word; | |||||
import nlp.tool.StopWordsList; | |||||
//import fgmt.RelationFragment; | |||||
import fgmt.TypeFragment; | |||||
import lcn.SearchInTypeShortName; | |||||
import log.QueryLogger; | |||||
import qa.Globals; | |||||
import rdf.PredicateMapping; | |||||
import rdf.SemanticRelation; | |||||
import rdf.Triple; | |||||
import rdf.TypeMapping; | |||||
/* | |||||
* 2016-6-17 | |||||
* 1. Recognize types (including YAGO types)
* 2. Add some type mappings manually, e.g., "US State" - "yago:StatesOfTheUnitedStates"
* 3. Add some extended variables (generalizing [variable with inherited type] to [variable with inherited triples]), e.g., ?canadian <birthPlace> <Canada>
* */ | |||||
public class TypeRecognition { | |||||
// dbpedia 2014 | |||||
//public static final int[] type_Person = {180,279}; | |||||
//public static final int[] type_Place = {49,228}; | |||||
//public static final int[] type_Organisation = {419,53}; | |||||
//dbpedia 2016 | |||||
public static final int[] type_Person = {5828,15985}; | |||||
public static final int[] type_Place = {11197,2188}; | |||||
public static final int[] type_Organisation = {1335,4716}; | |||||
public static HashMap<String, String> extendTypeMap = null; | |||||
public static HashMap<String, Triple> extendVariableMap = null; | |||||
SearchInTypeShortName st = new SearchInTypeShortName(); | |||||
static | |||||
{ | |||||
extendTypeMap = new HashMap<String, String>(); | |||||
extendVariableMap = new HashMap<String, Triple>(); | |||||
Triple triple = null; | |||||
//!Hand-written for convenience | TODO: approximate/semantic type matching
extendTypeMap.put("NonprofitOrganizations", "dbo:Non-ProfitOrganisation"); | |||||
extendTypeMap.put("GivenNames", "dbo:GivenName"); | |||||
extendTypeMap.put("JamesBondMovies","yago:JamesBondFilms"); | |||||
extendTypeMap.put("TVShows", "dbo:TelevisionShow"); | |||||
extendTypeMap.put("USState", "yago:StatesOfTheUnitedStates"); | |||||
extendTypeMap.put("USStates", "yago:StatesOfTheUnitedStates"); | |||||
extendTypeMap.put("Europe", "yago:EuropeanCountries"); | |||||
extendTypeMap.put("Africa", "yago:AfricanCountries"); | |||||
//!The following IDs are based on DBpedia 2014. | |||||
//!Extended variables (embedded triples) | e.g., [?E|surfers]: ?uri dbo:occupation res:Surfing | canadians -> <?canadian> <birthPlace> <Canada>
//1) <?canadians> <birthPlace> <Canada> | [country people] <birthPlace|1639> [country] | |||||
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 2112902, "Canada", null, 100); | |||||
extendVariableMap.put("canadian", triple); | |||||
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 883747, "Germany", null, 100); | |||||
extendVariableMap.put("german", triple); | |||||
//2) ?bandleader <occupation|6690> <Bandleader> | |||||
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436853, "Bandleader", null, 100); | |||||
extendVariableMap.put("bandleader", triple); | |||||
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436854, "Surfing", null, 100); // note: stray '>' removed from the entity name
extendVariableMap.put("surfer", triple); | |||||
} | |||||
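// Effect of the map above: in recognizeExtendVariable, the VAR_NAME placeholder in the stored
// triple is replaced by "?" + the word's original form, e.g., the word "canadian" yields the
// embedded triple <?canadian> <birthPlace|1639> <Canada>.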
public static void recognizeExtendVariable(Word w) | |||||
{ | |||||
String key = w.baseForm; | |||||
if(extendVariableMap.containsKey(key)) | |||||
{ | |||||
w.mayExtendVariable = true; | |||||
Triple triple = extendVariableMap.get(key).copy(); | |||||
if(triple.subjId == Triple.VAR_ROLE_ID && triple.subject.equals(Triple.VAR_NAME)) | |||||
triple.subject = "?" + w.originalForm; | |||||
if(triple.objId == Triple.VAR_ROLE_ID && triple.object.equals(Triple.VAR_NAME)) | |||||
triple.object = "?" + w.originalForm; | |||||
w.embbededTriple = triple; | |||||
} | |||||
} | |||||
public ArrayList<TypeMapping> getExtendTypeByStr(String allUpperFormWord) | |||||
{ | |||||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||||
//Do not consider SINGLE-word type (most are useless) | eg, Battle, War, Daughter | |||||
if(allUpperFormWord.length() > 1 && allUpperFormWord.substring(1).equals(allUpperFormWord.substring(1).toLowerCase())) | |||||
return null; | |||||
//search in YAGO type | |||||
if(TypeFragment.yagoTypeList.contains(allUpperFormWord)) | |||||
{ | |||||
//YAGO prefix | |||||
String typeName = "yago:"+allUpperFormWord; | |||||
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); | |||||
tmList.add(tm); | |||||
} | |||||
else if(extendTypeMap.containsKey(allUpperFormWord)) | |||||
{ | |||||
String typeName = extendTypeMap.get(allUpperFormWord); | |||||
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); | |||||
tmList.add(tm); | |||||
} | |||||
if(tmList.size()>0) | |||||
return tmList; | |||||
else | |||||
return null; | |||||
} | |||||
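// Example behavior: "Battle" is rejected (single word, no inner uppercase letter);
// "JamesBondMovies" (assuming it is not already in the YAGO type list) hits extendTypeMap
// and maps to "yago:JamesBondFilms".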
public ArrayList<TypeMapping> getTypeIDsAndNamesByStr (String baseform) | |||||
{ | |||||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||||
try | |||||
{ | |||||
tmList = st.searchTypeScore(baseform, 0.4, 0.8, 10); | |||||
Collections.sort(tmList); | |||||
if (tmList.size()>0) | |||||
return tmList; | |||||
else | |||||
return null; | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
return null; | |||||
} | |||||
} | |||||
public ArrayList<Integer> recognize (String baseform) { | |||||
char c = baseform.charAt(baseform.length()-1); | |||||
if (c >= '0' && c <= '9') {
baseform = baseform.substring(0, baseform.length()-2); // strip a trailing digit together with its preceding character (e.g., a separator)
}
try { | |||||
ArrayList<String> ret = st.searchType(baseform, 0.4, 0.8, 10); | |||||
ArrayList<Integer> ret_in = new ArrayList<Integer>(); | |||||
for (String s : ret) { | |||||
System.out.println("["+s+"]"); | |||||
ret_in.addAll(TypeFragment.typeShortName2IdList.get(s)); | |||||
} | |||||
if (ret_in.size()>0) return ret_in; | |||||
else return null; | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
return null; | |||||
} | |||||
} | |||||
public static void AddTypesOfWhwords (HashMap<Integer, SemanticRelation> semanticRelations) { | |||||
ArrayList<TypeMapping> ret = null; | |||||
for (Integer it : semanticRelations.keySet()) | |||||
{ | |||||
SemanticRelation sr = semanticRelations.get(it); | |||||
if(!sr.arg1Word.mayType) | |||||
{ | |||||
ret = recognizeSpecial(sr.arg1Word.baseForm); | |||||
if (ret != null) | |||||
{ | |||||
sr.arg1Word.tmList = ret; | |||||
} | |||||
} | |||||
if(!sr.arg2Word.mayType) | |||||
{ | |||||
ret = recognizeSpecial(sr.arg2Word.baseForm); | |||||
if (ret != null) | |||||
{ | |||||
sr.arg2Word.tmList = ret; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
public static ArrayList<TypeMapping> recognizeSpecial (String wordSpecial) | |||||
{ | |||||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||||
if (wordSpecial.toLowerCase().equals("who")) | |||||
{ | |||||
for (Integer i : type_Person) | |||||
{ | |||||
tmList.add(new TypeMapping(i,"Person",1)); | |||||
} | |||||
//"who" can also means organization | |||||
for (Integer i : type_Organisation) | |||||
{ | |||||
tmList.add(new TypeMapping(i,"Organization",1)); | |||||
} | |||||
return tmList; | |||||
} | |||||
else if (wordSpecial.toLowerCase().equals("where")) | |||||
{ | |||||
for (Integer i : type_Place) | |||||
{ | |||||
tmList.add(new TypeMapping(i,"Place",1)); | |||||
} | |||||
for (Integer i : type_Organisation) | |||||
{ | |||||
tmList.add(new TypeMapping(i,"Organization",1)); | |||||
} | |||||
return tmList; | |||||
} | |||||
//TODO: When ... | |||||
return null; | |||||
} | |||||
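// Per the rules above: recognizeSpecial("Who") returns Person plus Organization mappings,
// recognizeSpecial("Where") returns Place plus Organization mappings, and any other word yields null.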
/* | |||||
* 1. Priority: mayEnt (uppercase) > mayType > mayEnt
* 2. mayEnt=1: constant
* 3. mayType=1:
* (1) Variable: a triple will be added during evaluation. | e.g., Which [books] by Kerouac were published by Viking Press?
* (2) Constant: it modifies other words. | e.g., Are tree frogs a type of [amphibian]?
* 4. Extended variable (a variable with embedded triples)
* */ | |||||
public static void constantVariableRecognition(HashMap<Integer, SemanticRelation> semanticRelations, QueryLogger qlog) | |||||
{ | |||||
Word[] words = qlog.s.words; | |||||
//NOTICE: modifiers (implicit relations) have not been considered.
for (Integer it : semanticRelations.keySet()) | |||||
{ | |||||
SemanticRelation sr = semanticRelations.get(it); | |||||
int arg1WordPos = sr.arg1Word.position - 1; | |||||
int arg2WordPos = sr.arg2Word.position - 1; | |||||
// extend variable recognition | |||||
recognizeExtendVariable(sr.arg1Word); | |||||
recognizeExtendVariable(sr.arg2Word); | |||||
// constant or variable | |||||
if(sr.arg1Word.mayExtendVariable) | |||||
{ | |||||
//eg, ?canadian <birthPlace> <Canada> (both extendVariable & type) | |||||
if(sr.arg1Word.mayType) | |||||
sr.arg1Word.mayType = false; | |||||
if(sr.arg1Word.mayEnt) | |||||
{ | |||||
//rule: [extendVariable & ent] + noun -> ent | e.g., Canadian movies -> ent:Canada
if(arg1WordPos+1 < words.length && words[arg1WordPos+1].posTag.startsWith("N")) | |||||
{ | |||||
sr.arg1Word.mayExtendVariable = false; | |||||
sr.isArg1Constant = true; | |||||
} | |||||
else | |||||
{ | |||||
sr.arg1Word.mayEnt = false; | |||||
} | |||||
} | |||||
} | |||||
// type | |||||
else if(sr.arg1Word.mayType) | |||||
{ | |||||
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries | |||||
if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of")) | |||||
&& !words[arg1WordPos-2].posTag.startsWith("V")) | |||||
{ | |||||
sr.isArg1Constant = true; | |||||
double largerScore = 1000; | |||||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||||
largerScore = sr.predicateMappings.get(0).score * 2; | |||||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||||
sr.predicateMappings.add(0,nPredicate); | |||||
//constant type should be object | |||||
sr.preferredSubj = sr.arg2Word; | |||||
} | |||||
} | |||||
//ent: constant | |||||
else if(sr.arg1Word.mayEnt) | |||||
{ | |||||
sr.isArg1Constant = true; | |||||
} | |||||
// constant or variable | |||||
if(sr.arg2Word.mayExtendVariable) | |||||
{ | |||||
if(sr.arg2Word.mayType) | |||||
sr.arg2Word.mayType = false; | |||||
if(sr.arg2Word.mayEnt) | |||||
{ | |||||
if(arg2WordPos+1 < words.length && words[arg2WordPos+1].posTag.startsWith("N")) | |||||
{ | |||||
sr.arg2Word.mayExtendVariable = false; | |||||
sr.isArg2Constant = true; | |||||
} | |||||
else | |||||
{ | |||||
sr.arg2Word.mayEnt = false; | |||||
} | |||||
} | |||||
} | |||||
// type | |||||
else if(sr.arg2Word.mayType) | |||||
{ | |||||
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries | |||||
if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of")) | |||||
&& !words[arg2WordPos-2].posTag.startsWith("V") ) | |||||
{ | |||||
sr.isArg2Constant = true; | |||||
double largerScore = 1000; | |||||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||||
largerScore = sr.predicateMappings.get(0).score * 2; | |||||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||||
sr.predicateMappings.add(0,nPredicate); | |||||
sr.preferredSubj = sr.arg1Word; | |||||
} | |||||
//rule: Be ... a type? | |||||
if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a")) | |||||
{ | |||||
sr.isArg2Constant = true; | |||||
double largerScore = 1000; | |||||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||||
largerScore = sr.predicateMappings.get(0).score * 2; | |||||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||||
sr.predicateMappings.add(0,nPredicate); | |||||
sr.preferredSubj = sr.arg1Word; | |||||
} | |||||
} | |||||
else if(sr.arg2Word.mayEnt) | |||||
{ | |||||
sr.isArg2Constant = true; | |||||
} | |||||
if(sr.arg1Word != sr.preferredSubj) | |||||
sr.swapArg1Arg2(); | |||||
} | |||||
} | |||||
public static void main (String[] args) | |||||
{ | |||||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||||
String type = "space mission"; | |||||
try | |||||
{ | |||||
TypeFragment.load(); | |||||
Globals.stopWordsList = new StopWordsList(); | |||||
TypeRecognition tr = new TypeRecognition(); | |||||
while(true) | |||||
{ | |||||
System.out.print("Input query type: "); | |||||
type = br.readLine(); | |||||
tr.recognize(type); | |||||
} | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,690 @@ | |||||
package qa.mapping; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import java.util.Iterator; | |||||
import qa.Globals; | |||||
import rdf.Sparql; | |||||
import rdf.Triple; | |||||
import fgmt.EntityFragment; | |||||
import fgmt.RelationFragment; | |||||
import fgmt.TypeFragment; | |||||
import fgmt.VariableFragment; | |||||
/** | |||||
* Notice: one CompatibilityChecker can be used only once to check a SPARQL query.
* @author husen | |||||
*/ | |||||
public class CompatibilityChecker { | |||||
static int EnumerateThreshold = 1000; | |||||
public EntityFragmentDict efd = null; | |||||
public HashMap<String, VariableFragment> variable_fragment = null; | |||||
public CompatibilityChecker(EntityFragmentDict efd) { | |||||
this.efd = efd; | |||||
variable_fragment = new HashMap<String, VariableFragment>(); | |||||
} | |||||
// Run this check after the "single triple check" passes (re-coded)
// Re-coded: variables are bound to suitable entities based on the in-memory INDEX. Note the case where a variable equals a literal.
public boolean isSparqlCompatible3 (Sparql spq) | |||||
{ | |||||
boolean[] isFixed = new boolean[spq.tripleList.size()]; // records whether each triple's compatibility is already fixed (no re-check needed)
for (int i = 0; i < spq.tripleList.size(); i ++) { | |||||
isFixed[i] = false; | |||||
} | |||||
//System.out.println("tripleList size="+spq.tripleList.size()); | |||||
Iterator<Triple> it; | |||||
boolean shouldContinue = true; | |||||
// shouldContinue when a triple with variables updates a variable fragment; the updated fragment is then used to re-check the previous triples
while (shouldContinue) | |||||
{ | |||||
shouldContinue = false; | |||||
it = spq.tripleList.iterator(); | |||||
int t_cnt = 0; | |||||
while (it.hasNext()) { | |||||
Triple t = it.next(); | |||||
switch (getTripleType(t)) { | |||||
case 1: // (1) E1, P, E2 | |||||
if (!isFixed[t_cnt]) | |||||
{ | |||||
int ret = hs_check1_E1PE2(t); | |||||
if (ret == 0) | |||||
isFixed[t_cnt] = true; | |||||
else if (ret == 5) | |||||
return false; | |||||
} | |||||
break; | |||||
case 2: // (2) E, P, V | |||||
if(!isFixed[t_cnt]) | |||||
{ | |||||
int ret = hs_check2_EPV(t); | |||||
if (ret == 5) | |||||
return false; | |||||
else | |||||
{ | |||||
isFixed[t_cnt] = true; // Now V has candidate entities or is a literal; note E/P->V may not be unique, e.g., xx's starring
if (ret == 1) | |||||
shouldContinue = true; | |||||
} | |||||
} | |||||
break; | |||||
case 3: // (3) E, <type1>, T | |||||
if (!isFixed[t_cnt]) | |||||
{ | |||||
int ret = check3_Etype1T(t); | |||||
if (ret == -2) return false; | |||||
if (ret == 0) isFixed[t_cnt] = true; | |||||
} | |||||
break; | |||||
case 4: // (4) V, P, E | |||||
if(!isFixed[t_cnt]) | |||||
{ | |||||
int ret = hs_check4_VPE(t); | |||||
if (ret == 5) | |||||
return false; | |||||
else | |||||
{ | |||||
isFixed[t_cnt] = true; // Now V has candidate entities or is a literal; note E/P->V may not be unique, e.g., xx's starring
if (ret == 1) | |||||
shouldContinue = true; | |||||
} | |||||
} | |||||
break; | |||||
case 5: // (5) V1, P, V2 (The most important and time consuming) | |||||
if(!isFixed[t_cnt]) | |||||
{ | |||||
int ret = hs_check5_V1PV2(t); | |||||
if (ret == 5) | |||||
return false; | |||||
else | |||||
{ | |||||
isFixed[t_cnt] = true; // Just set once and no re-check | |||||
if (ret == 1) | |||||
shouldContinue = true; | |||||
} | |||||
} | |||||
break; | |||||
case 6: // (6) V, <type1>, T | |||||
if (!isFixed[t_cnt]) | |||||
{ | |||||
int ret = hs_check6_Vtype1T(t); | |||||
if (ret == -2) return false; | |||||
else | |||||
{ | |||||
isFixed[t_cnt] = true; | |||||
if (ret == 1) | |||||
shouldContinue = true; | |||||
} | |||||
} | |||||
break; | |||||
case 7: | |||||
// do nothing | |||||
break; | |||||
case 8: | |||||
default: | |||||
return false; | |||||
} | |||||
t_cnt ++; | |||||
} | |||||
} | |||||
return true; | |||||
} | |||||
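// Return-code convention of the check helpers, inferred from the call sites above:
//  0      -> the triple passes and its result is fixed;
//  1      -> the triple passes but variable bindings were narrowed (earlier triples must be re-checked);
//  5 / -2 -> the triple is incompatible and the whole SPARQL query is rejected.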
/** | |||||
* Get Triple's category | |||||
* (1) E1, P, E2 | |||||
* (2) E, P, V | |||||
* (3) E, <type>, T | |||||
* (4) V, P, E | |||||
* (5) V1, P, V2 | |||||
* (6) V, <type>, T | |||||
* (7) E, <type>, V | |||||
* (8) error | |||||
* | |||||
* E: Entity | |||||
* P: Predicate (exclude <type>) | |||||
* V: Variable | |||||
* T: Type | |||||
* | |||||
* @param t | |||||
* @return | |||||
*/ | |||||
public int getTripleType (Triple t) { | |||||
if (t.predicateID == Globals.pd.typePredicateID) { | |||||
boolean s = t.subject.startsWith("?"); | |||||
boolean o = t.object.startsWith("?"); | |||||
if (s && !o) return 6; | |||||
else if (o && !s) return 7; | |||||
else if (!s && !o) return 3; | |||||
else return 8; | |||||
} | |||||
else if (t.subject.startsWith("?")) { | |||||
if (t.object.startsWith("?")) return 5; | |||||
else return 4; | |||||
} | |||||
else { | |||||
if (t.object.startsWith("?")) return 2; | |||||
else return 1; | |||||
} | |||||
} | |||||
public int hs_check1_E1PE2(Triple t) | |||||
{ | |||||
int pid = t.predicateID; | |||||
EntityFragment E1 = efd.getEntityFragmentByEid(t.subjId); | |||||
EntityFragment E2 = efd.getEntityFragmentByEid(t.objId); | |||||
// E2 is a one-hop neighbor of E1, connected via predicate p
if(E1.outEntMap.containsKey(E2.eId)) | |||||
{ | |||||
ArrayList<Integer> pList = E1.outEntMap.get(E2.eId); | |||||
if(pList.contains(pid)) | |||||
return 0; | |||||
} | |||||
return 5; | |||||
} | |||||
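// This is a pure fragment-level test: instead of querying the RDF store, it checks whether
// pid occurs on the E1 -> E2 edge recorded in E1's out-neighbor map (outEntMap: eId -> predicate ids).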
public int hs_check2_EPV(Triple t) | |||||
{ | |||||
int pid = t.predicateID; | |||||
EntityFragment E = efd.getEntityFragmentByEid(t.subjId); | |||||
VariableFragment V = variable_fragment.get(t.object); | |||||
// P ∈ E.outEdges | |||||
if (!E.outEdges.contains(pid)) { | |||||
return 5; | |||||
} | |||||
// Initialize V; note it may be a literal
if(V == null) | |||||
{ | |||||
variable_fragment.put(t.object, new VariableFragment()); | |||||
V = variable_fragment.get(t.object); | |||||
for(int vid: E.outEntMap.keySet()) | |||||
{ | |||||
if(E.outEntMap.get(vid).contains(pid)) | |||||
{ | |||||
V.candEntities.add(vid); | |||||
} | |||||
} | |||||
// E's outEdges contain p, but no neighbor entity is reachable via p, so V may be a literal
if(V.candEntities.size() == 0) | |||||
{ | |||||
V.mayLiteral = true; | |||||
return 0; | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
// accept if V is a literal, because fragments do not store literal information
if(V.mayLiteral) | |||||
return 0; | |||||
// Update V's bindings using E's current neighbors
HashSet<Integer> newCandEntities = new HashSet<Integer>(); | |||||
if(V.candEntities.size() > 0 && V.candEntities.size() < E.outEntMap.size()) | |||||
{ | |||||
for(int vid: V.candEntities) | |||||
{ | |||||
if(E.outEntMap.containsKey(vid) && E.outEntMap.get(vid).contains(pid)) | |||||
{ | |||||
newCandEntities.add(vid); | |||||
} | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
for(int vid: E.outEntMap.keySet()) | |||||
{ | |||||
if(E.outEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid))) | |||||
{ | |||||
newCandEntities.add(vid); | |||||
} | |||||
} | |||||
} | |||||
V.candEntities = newCandEntities; | |||||
} | |||||
if(V.candEntities.size() > 0) | |||||
return 0; | |||||
else | |||||
return 5; | |||||
} | |||||
public int check3_Etype1T(Triple t) { | |||||
String[] T = t.object.split("\\|"); // Note: "|" must be escaped in the regex
EntityFragment E = efd.getEntityFragmentByEid(t.subjId); | |||||
String newTypeString = ""; | |||||
boolean contained = false; | |||||
// check whether each type in T is proper for E
if (T.length == 0) return -2; | |||||
for (String s : T) { | |||||
contained = false; | |||||
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) { | |||||
if (E.types.contains(i)) { | |||||
if (!contained) { | |||||
contained = true; | |||||
newTypeString += s; | |||||
newTypeString += "|"; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
if (newTypeString.length() > 1) { | |||||
t.object = newTypeString.substring(0, newTypeString.length()-1); | |||||
return 0; | |||||
} | |||||
else return -2; | |||||
} | |||||
public int hs_check4_VPE(Triple t) | |||||
{ | |||||
int pid = t.predicateID; | |||||
EntityFragment E = efd.getEntityFragmentByEid(t.objId); | |||||
VariableFragment V = variable_fragment.get(t.subject); | |||||
TypeFragment subjTf = SemanticItemMapping.getTypeFragmentByWord(t.getSubjectWord()); | |||||
// P ∈ E.inEdges | |||||
if (!E.inEdges.contains(pid)) { | |||||
return 5; | |||||
} | |||||
// Initialize V; V cannot be a literal because it is the subject here
if(V == null) | |||||
{ | |||||
variable_fragment.put(t.subject, new VariableFragment()); | |||||
V = variable_fragment.get(t.subject); | |||||
for(int vid: E.inEntMap.keySet()) | |||||
{ | |||||
if(E.inEntMap.get(vid).contains(pid) && (subjTf == null || subjTf.entSet.contains(vid))) | |||||
{ | |||||
V.candEntities.add(vid); | |||||
} | |||||
} | |||||
// E's inEdges contain p, but no neighbor entity is reachable via p; V is the subject and cannot be a literal, so the match fails
if(V.candEntities.size() == 0) | |||||
{ | |||||
return 5; | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
// if V is a literal, fail: a subject cannot be a literal
if(V.mayLiteral) | |||||
return 5; | |||||
// update V's bindings using E's current neighbors
HashSet<Integer> newCandEntities = new HashSet<Integer>(); | |||||
if(V.candEntities.size() > 0 && V.candEntities.size() < E.inEntMap.size()) | |||||
{ | |||||
for(int vid: V.candEntities) | |||||
{ | |||||
if(E.inEntMap.containsKey(vid) && E.inEntMap.get(vid).contains(pid)) | |||||
{ | |||||
newCandEntities.add(vid); | |||||
} | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
for(int vid: E.inEntMap.keySet()) | |||||
{ | |||||
if(E.inEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid))) | |||||
{ | |||||
newCandEntities.add(vid); | |||||
} | |||||
} | |||||
} | |||||
V.candEntities = newCandEntities; | |||||
} | |||||
if(V.candEntities.size() > 0) | |||||
return 0; | |||||
else | |||||
return 5; | |||||
} | |||||
public int check5_V1PV2(Triple t) { | |||||
ArrayList<Integer> pidList = new ArrayList<Integer>(); | |||||
pidList.add(t.predicateID); | |||||
VariableFragment V1 = variable_fragment.get(t.subject); | |||||
VariableFragment V2 = variable_fragment.get(t.object); | |||||
// V1 & V2's types, equal with types of one fragment of P | |||||
Iterator<Integer> it_int = pidList.iterator(); | |||||
ArrayList<HashSet<Integer>> newCandTypes1 = new ArrayList<HashSet<Integer>>(); | |||||
ArrayList<HashSet<Integer>> newCandTypes2 = new ArrayList<HashSet<Integer>>(); | |||||
while (it_int.hasNext()) { | |||||
Integer i = it_int.next(); | |||||
ArrayList<RelationFragment> flist = RelationFragment.relFragments.get(i); | |||||
Iterator<RelationFragment> it_rln = flist.iterator(); | |||||
while (it_rln.hasNext()) { | |||||
RelationFragment rf = it_rln.next(); | |||||
if (V1 == null && V2 == null) { | |||||
newCandTypes1.add(rf.inTypes); | |||||
newCandTypes2.add(rf.outTypes); | |||||
} | |||||
else if (V1 == null && V2 != null) { | |||||
if (V2.containsAll(rf.outTypes)) { | |||||
newCandTypes1.add(rf.inTypes); | |||||
newCandTypes2.add(rf.outTypes); | |||||
} | |||||
} | |||||
else if (V2 == null && V1 != null) { | |||||
if (V1.containsAll(rf.inTypes)) { | |||||
newCandTypes1.add(rf.inTypes); | |||||
newCandTypes2.add(rf.outTypes); | |||||
} | |||||
} | |||||
else { | |||||
if (V1.containsAll(rf.inTypes) && V2.containsAll(rf.outTypes)) | |||||
{ | |||||
newCandTypes1.add(rf.inTypes); | |||||
newCandTypes2.add(rf.outTypes); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
if (newCandTypes1.size() > 0 && newCandTypes2.size() > 0) { | |||||
if (V1 == null && V2 == null) { | |||||
variable_fragment.put(t.subject, new VariableFragment()); | |||||
variable_fragment.get(t.subject).candTypes = newCandTypes1; | |||||
variable_fragment.put(t.object, new VariableFragment()); | |||||
variable_fragment.get(t.object).candTypes = newCandTypes2; | |||||
return 1; | |||||
} | |||||
else if (V1 == null && V2 != null) { | |||||
variable_fragment.put(t.subject, new VariableFragment()); | |||||
variable_fragment.get(t.subject).candTypes = newCandTypes1; | |||||
if (V2.candTypes.size() > newCandTypes2.size()) { | |||||
V2.candTypes = newCandTypes2; | |||||
return 1; | |||||
} | |||||
else return 0; | |||||
} | |||||
else if (V2 == null && V1 != null) { | |||||
variable_fragment.put(t.object, new VariableFragment()); | |||||
variable_fragment.get(t.object).candTypes = newCandTypes2; | |||||
if (V1.candTypes.size() > newCandTypes1.size()) { | |||||
V1.candTypes = newCandTypes1; | |||||
return 1; | |||||
} | |||||
else return 0; | |||||
} | |||||
else { | |||||
if (V1.candTypes.size() > newCandTypes1.size() || V2.candTypes.size() > newCandTypes2.size()) { | |||||
V1.candTypes = newCandTypes1; | |||||
V2.candTypes = newCandTypes2; | |||||
return 1; | |||||
} | |||||
else return 0; | |||||
} | |||||
} | |||||
else return 5; | |||||
} | |||||
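// Note: check5_V1PV2 above matches V1/V2 against the type signatures of P's relation fragments;
// isSparqlCompatible3 actually dispatches to hs_check5_V1PV2 below, which works on concrete
// candidate entities instead, so the method above appears to be kept as the older variant.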
public int hs_check5_V1PV2(Triple t) | |||||
{ | |||||
int pid = t.predicateID; | |||||
VariableFragment V1 = variable_fragment.get(t.subject); | |||||
VariableFragment V2 = variable_fragment.get(t.object); | |||||
if(V1 == null && V2 == null) // The WORST case, current relation fragment has no records of two target entities, cannot check without types, so we should put this triple in the end | |||||
{ | |||||
return 0; // in fact this should return 1; we expect the not-yet-checked triples to provide candidates for V1/V2 so this triple can be checked in a later pass
} | |||||
else if(V2 == null) | |||||
{ | |||||
if(V1.mayLiteral) | |||||
return 5; | |||||
variable_fragment.put(t.object, new VariableFragment()); | |||||
V2 = variable_fragment.get(t.object); | |||||
HashSet<Integer> newV1cands = new HashSet<Integer>(); | |||||
int cnt = 0; | |||||
for(int v1id: V1.candEntities) | |||||
{ | |||||
cnt++; | |||||
if(cnt > EnumerateThreshold) | |||||
break; | |||||
EntityFragment E = efd.getEntityFragmentByEid(v1id); | |||||
if(E != null && E.outEdges.contains(pid)) | |||||
{ | |||||
newV1cands.add(v1id); | |||||
for(int v2id: E.outEntMap.keySet()) | |||||
{ | |||||
if(E.outEntMap.get(v2id).contains(pid)) | |||||
V2.candEntities.add(v2id); | |||||
} | |||||
} | |||||
} | |||||
V1.candEntities = newV1cands; | |||||
} | |||||
else if(V1 == null) | |||||
{ | |||||
if(V2.mayLiteral) | |||||
return 0; | |||||
variable_fragment.put(t.subject, new VariableFragment()); | |||||
V1 = variable_fragment.get(t.subject); | |||||
HashSet<Integer> newV2cands = new HashSet<Integer>(); | |||||
int cnt = 0; | |||||
for(int v2id: V2.candEntities) | |||||
{ | |||||
cnt++; | |||||
if(cnt > EnumerateThreshold) | |||||
break; | |||||
EntityFragment E = efd.getEntityFragmentByEid(v2id); | |||||
if(E != null && E.inEdges.contains(pid)) | |||||
{ | |||||
newV2cands.add(v2id); | |||||
for(int v1id: E.inEntMap.keySet()) | |||||
{ | |||||
if(E.inEntMap.get(v1id).contains(pid)) | |||||
V1.candEntities.add(v1id); | |||||
} | |||||
} | |||||
} | |||||
V2.candEntities = newV2cands; | |||||
} | |||||
else | |||||
{ | |||||
if(V1.mayLiteral) | |||||
return 5; | |||||
if(V2.mayLiteral) | |||||
return 0; | |||||
HashSet<Integer> newV1cands = new HashSet<Integer>(); | |||||
HashSet<Integer> newV2cands = new HashSet<Integer>(); | |||||
for(int v1id: V1.candEntities) | |||||
{ | |||||
EntityFragment E1 = efd.getEntityFragmentByEid(v1id); | |||||
if(E1 != null && E1.outEdges.contains(pid)) | |||||
newV1cands.add(v1id); | |||||
} | |||||
V1.candEntities = newV1cands; | |||||
for(int v2id: V2.candEntities) | |||||
{ | |||||
EntityFragment E2 = efd.getEntityFragmentByEid(v2id); | |||||
if(E2 != null && E2.inEdges.contains(pid)) | |||||
newV2cands.add(v2id); | |||||
} | |||||
V2.candEntities = newV2cands; | |||||
newV1cands = new HashSet<Integer>(); | |||||
newV2cands = new HashSet<Integer>(); | |||||
for(int v1id: V1.candEntities) | |||||
{ | |||||
EntityFragment E1 = efd.getEntityFragmentByEid(v1id); | |||||
for(int v2id: V2.candEntities) | |||||
{ | |||||
if(E1.outEntMap.containsKey(v2id) && E1.outEntMap.get(v2id).contains(pid)) | |||||
{ | |||||
newV1cands.add(v1id); | |||||
newV2cands.add(v2id); | |||||
} | |||||
} | |||||
} | |||||
V1.candEntities = newV1cands; | |||||
V2.candEntities = newV2cands; | |||||
} | |||||
if(V1.candEntities.size() == 0 || (V2.candEntities.size() == 0 && !RelationFragment.isLiteral(pid))) | |||||
return 5; | |||||
else | |||||
return 0; | |||||
} | |||||
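// Cost note: the V1-P-V2 case is the most expensive check, so enumeration over candidate
// entities is capped at EnumerateThreshold (1000); beyond the cap the candidate sets are
// truncated and the check becomes approximate.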
public int check6_Vtype1T(Triple t) { | |||||
String[] T = t.object.split("\\|"); // note: "|" must be escaped as "\\|" in the regex
VariableFragment V = variable_fragment.get(t.subject); | |||||
String newTypeString = ""; | |||||
boolean contained = false; | |||||
// check whether each type in T is proper for V | |||||
if (T.length == 0) return -2; | |||||
ArrayList<HashSet<Integer>> newCandTypes = new ArrayList<HashSet<Integer>>(); | |||||
for (String s : T) | |||||
{ | |||||
contained = false; | |||||
//YAGO types are uncoded (not in the index), so just accept them without checking
if(!TypeFragment.typeShortName2IdList.containsKey(s)) | |||||
return 0; | |||||
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) | |||||
{ | |||||
if (V == null) { | |||||
// constrain V by the user-given types; add the magic number to flag possibly incomplete type info
HashSet<Integer> set = new HashSet<Integer>(); | |||||
set.add(i); | |||||
set.add(VariableFragment.magic_number); | |||||
newCandTypes.add(set); | |||||
if (!contained) { | |||||
contained = true; | |||||
newTypeString += s; | |||||
newTypeString += "|"; | |||||
} | |||||
} | |||||
else if (V.contains(i)) { | |||||
if (!contained) { | |||||
contained = true; | |||||
newTypeString += s; | |||||
newTypeString += "|"; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
// check whether each fragment in V is proper for T | |||||
// if not, delete the fragment (that means we can narrow the scope) | |||||
ArrayList<HashSet<Integer>> deleteCandTypes = new ArrayList<HashSet<Integer>>(); | |||||
if (V != null) | |||||
{ | |||||
Iterator<HashSet<Integer>> it = V.candTypes.iterator(); | |||||
while(it.hasNext()) { | |||||
HashSet<Integer> set = it.next(); | |||||
boolean isCandTypeOkay = false; | |||||
//V acquired [constraint types] through other triples; at least one type must survive, otherwise delete the [constraint types]
for (String s : T) | |||||
{ | |||||
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) { | |||||
if (set.contains(i)) { | |||||
isCandTypeOkay = true; | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
if (!isCandTypeOkay) { | |||||
deleteCandTypes.add(set); | |||||
} | |||||
} | |||||
V.candTypes.removeAll(deleteCandTypes); | |||||
} | |||||
if (V == null) { | |||||
variable_fragment.put(t.subject, new VariableFragment()); | |||||
variable_fragment.get(t.subject).candTypes = newCandTypes; | |||||
} | |||||
if (newTypeString.length() > 1) { | |||||
t.object = newTypeString.substring(0, newTypeString.length()-1); | |||||
if (deleteCandTypes.size() > 0) { | |||||
return 1; | |||||
} | |||||
else { | |||||
return 0; | |||||
} | |||||
} | |||||
else return -2; | |||||
} | |||||
public int hs_check6_Vtype1T(Triple t) | |||||
{ | |||||
String[] tList = t.object.split("\\|"); // Note: "|" must be escaped in the regex
VariableFragment V = variable_fragment.get(t.subject); | |||||
if (tList.length == 0) return -2; | |||||
// Simplification: only consider the first type
if(!TypeFragment.typeShortName2IdList.containsKey(tList[0])) | |||||
return 0; | |||||
int tid = TypeFragment.typeShortName2IdList.get(tList[0]).get(0); | |||||
TypeFragment T = TypeFragment.typeFragments.get(tid); | |||||
if(V == null) | |||||
{ | |||||
variable_fragment.put(t.subject, new VariableFragment()); | |||||
V = variable_fragment.get(t.subject); | |||||
V.candEntities = T.entSet; | |||||
} | |||||
else | |||||
{ | |||||
if(V.mayLiteral) //literal cannot be subject | |||||
return -2; | |||||
HashSet<Integer> newVcands = new HashSet<Integer>(); | |||||
for(int vid: V.candEntities) | |||||
{ | |||||
EntityFragment E = efd.getEntityFragmentByEid(vid); | |||||
if(E.types.contains(tid)) | |||||
newVcands.add(vid); | |||||
} | |||||
V.candEntities = newVcands; | |||||
} | |||||
if(V.candEntities.size() == 0) | |||||
return -2; | |||||
else | |||||
return 0; | |||||
} | |||||
public void swapTriple (Triple t) { | |||||
String temp = t.subject; | |||||
t.subject = t.object; | |||||
t.object = temp; | |||||
} | |||||
}
@@ -0,0 +1,164 @@ | |||||
package qa.mapping; | |||||
import java.io.BufferedReader; | |||||
import java.io.IOException; | |||||
import java.io.InputStreamReader; | |||||
import java.util.ArrayList; | |||||
import java.util.HashMap; | |||||
import lcn.EntityFragmentFields; | |||||
import log.QueryLogger; | |||||
import org.apache.commons.httpclient.HttpClient; | |||||
import org.apache.commons.httpclient.HttpException; | |||||
import org.apache.commons.httpclient.methods.GetMethod; | |||||
import fgmt.EntityFragment; | |||||
import rdf.EntityMapping; | |||||
public class DBpediaLookup { | |||||
//Two public endpoints of the DBpedia Lookup online service (kept for reference):
//public static final String baseURL = "http://en.wikipedia.org/w/api.php?action=opensearch&format=xml&limit=10&search="; | |||||
//public static final String baseURL = "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString="; | |||||
public static final String baseURL = "http://172.31.222.72:1234/api/search/KeywordSearch?MaxHits=5&QueryString="; | |||||
public HttpClient ctripHttpClient = null; | |||||
//public static final String begin = "<Text xml:space=\"preserve\">"; | |||||
//public static final String begin = "<Result>\n <Label>"; | |||||
public static final String begin = "<Result>\n <Label>"; | |||||
public static final int begin_length = begin.length(); | |||||
//public static final String end = "</Text>"; | |||||
public static final String end = "</Label>"; | |||||
public static final int end_length = end.length(); | |||||
public static HashMap<String, String> entMentionDict = null; // TODO: build the mention-to-entity dictionary from redirect data & Wikipedia click data; for now it is filled manually
public DBpediaLookup() | |||||
{ | |||||
ctripHttpClient = new HttpClient(); | |||||
ctripHttpClient.setTimeout(3000); | |||||
entMentionDict = new HashMap<String, String>(); | |||||
entMentionDict.put("Prince_Charles", "Charles,_Prince_of_Wales"); | |||||
} | |||||
public ArrayList<EntityMapping> getEntityMappings(String searchString, QueryLogger qlog) | |||||
{ | |||||
ArrayList<String> slist = new ArrayList<String>(); | |||||
if(entMentionDict.containsKey(searchString)) | |||||
slist.add(entMentionDict.get(searchString)); | |||||
else | |||||
slist = lookForEntityNames(searchString, qlog); | |||||
if (slist.size() == 0 && searchString.contains(". "))
slist.addAll(lookForEntityNames(searchString.replace(". ", "."), qlog)); // use replace(): replaceAll() would treat "." as a regex wildcard
ArrayList<EntityMapping> emlist = new ArrayList<EntityMapping>(); | |||||
// The string still uses "_" as its delimiter (original form)
String[] sa = searchString.split("_"); | |||||
int UpperCnt = 0; | |||||
for(String str: sa) | |||||
{ | |||||
if( (str.charAt(0)>='A'&&str.charAt(0)<='Z') || (str.charAt(0)>='0'&&str.charAt(0)<='9') ) | |||||
UpperCnt ++; | |||||
} | |||||
System.out.print("DBpediaLookup find: " + slist + ", "); | |||||
int count = 40; | |||||
for (String s : slist) | |||||
{ | |||||
//treat as an abbreviation only when every token is capitalized; drop candidates whose edit distance is too large
if(UpperCnt < sa.length && EntityFragment.calEditDistance(s, searchString.replace("_", ""))>searchString.length()/2) | |||||
continue; | |||||
int eid = -1; | |||||
s = s.replace(" ", "_"); | |||||
if(EntityFragmentFields.entityName2Id.containsKey(s)) | |||||
{ | |||||
eid = EntityFragmentFields.entityName2Id.get(s); | |||||
emlist.add(new EntityMapping(eid, s, count)); | |||||
count -= 2;
} | |||||
else | |||||
{ | |||||
System.out.print("Drop "+s+" because it not in Entity Dictionary. "); | |||||
} | |||||
} | |||||
System.out.println("DBpediaLookup select: " + emlist); | |||||
return emlist; | |||||
} | |||||
public ArrayList<String> lookForEntityNames (String searchString, QueryLogger qlog) { | |||||
// URL encoding: " " -> %20
GetMethod getMethod = new GetMethod((baseURL+searchString).replaceAll(" ", "%20")); | |||||
ArrayList<String> ret = new ArrayList<String>(); | |||||
int statusCode; | |||||
try { | |||||
statusCode = ctripHttpClient.executeMethod(getMethod); | |||||
} catch (HttpException e) { | |||||
e.printStackTrace(); | |||||
return ret; | |||||
} catch (IOException e) { | |||||
e.printStackTrace(); | |||||
return ret; | |||||
} | |||||
if (statusCode != 200) return ret; // return the empty list rather than null: callers iterate over the result
String response = getMethod.getResponseBodyAsString(); | |||||
if (qlog != null && qlog.MODE_debug) { | |||||
System.out.println("searchString=" + searchString); | |||||
System.out.println("statusCode=" + statusCode); | |||||
System.out.println("response=" + getMethod.getResponseBodyAsString()); | |||||
} | |||||
getMethod.releaseConnection(); | |||||
//System.out.println(response); | |||||
if (response == null || response.isEmpty()) | |||||
return ret; | |||||
int idx1 = response.indexOf(begin); | |||||
while (idx1 != -1) { | |||||
int idx2 = response.indexOf(end, idx1+begin_length); | |||||
String ss = response.substring(idx1+begin_length, idx2); | |||||
ret.add(ss); | |||||
//System.out.println(ss); | |||||
idx1 = response.indexOf(begin, idx2 + end_length); | |||||
} | |||||
return ret; | |||||
} | |||||
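// Illustrative (abridged) response, assuming the KeywordSearch XML format:
//   <Result>
//    <Label>Berlin</Label> ... </Result>
// The scan above collects every substring between `begin` and `end`, i.e. the candidate labels.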
public static void main(String argv[]){ | |||||
DBpediaLookup dbplook = new DBpediaLookup(); | |||||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||||
try { | |||||
while (true) { | |||||
System.out.println("Test DBpediaLookup."); | |||||
System.out.print("Please input the search string: "); | |||||
String searchString = br.readLine(); | |||||
try { | |||||
long t1 = System.currentTimeMillis(); | |||||
ArrayList<String> res = dbplook.lookForEntityNames(searchString, null); | |||||
long t2 = System.currentTimeMillis(); | |||||
System.out.println(res); | |||||
System.out.println("time=" + (t2-t1) + "ms"); | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} catch (IOException e) { | |||||
e.printStackTrace(); | |||||
} | |||||
return; | |||||
} | |||||
} |
@@ -0,0 +1,44 @@ | |||||
package qa.mapping; | |||||
import java.util.HashMap; | |||||
//import lcn.EntityFragmentFields; | |||||
//import qa.Globals; | |||||
import fgmt.EntityFragment; | |||||
public class EntityFragmentDict { | |||||
//public HashMap<String, EntityFragment> entityFragmentDictionary = new HashMap<String, EntityFragment>(); | |||||
public HashMap<Integer, EntityFragment> entityFragmentDictionary = new HashMap<Integer, EntityFragment>(); | |||||
public EntityFragment getEntityFragmentByEid (Integer eid) | |||||
{ | |||||
if (!entityFragmentDictionary.containsKey(eid)) | |||||
{ | |||||
entityFragmentDictionary.put(eid, EntityFragment.getEntityFragmentByEntityId(eid)); | |||||
} | |||||
return entityFragmentDictionary.get(eid); | |||||
} | |||||
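// Simple memoization: each entity fragment is loaded from the underlying index at most once;
// repeated lookups for the same eid hit the in-memory map. Note that a missing fragment is
// cached as null as well, so the index is not queried again for it.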
/* | |||||
* Old version, search by name | |||||
* */ | |||||
// public EntityFragment getEntityFragmentByName (String name) { | |||||
// if (name.startsWith("?")) { | |||||
// return null; | |||||
// } | |||||
// if (!entityFragmentDictionary.containsKey(name)) { | |||||
// String fgmt = EntityFragment.getEntityFgmtStringByName(name); | |||||
// if (fgmt != null) | |||||
// { | |||||
// int eid = EntityFragmentFields.entityName2Id.get(name); | |||||
// entityFragmentDictionary.put(name, new EntityFragment(eid, fgmt)); | |||||
// } | |||||
// else { | |||||
// entityFragmentDictionary.put(name, null); | |||||
// } | |||||
// } | |||||
// return entityFragmentDictionary.get(name); | |||||
// | |||||
// } | |||||
} |
@@ -0,0 +1,811 @@ | |||||
package qa.mapping; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import java.util.Iterator; | |||||
import java.util.Map; | |||||
import nlp.ds.Word; | |||||
import nlp.ds.Sentence.SentenceType; | |||||
import fgmt.EntityFragment; | |||||
import fgmt.RelationFragment; | |||||
import fgmt.TypeFragment; | |||||
import log.QueryLogger; | |||||
import qa.Globals; | |||||
import rdf.EntityMapping; | |||||
import rdf.PredicateMapping; | |||||
import rdf.SemanticRelation; | |||||
import rdf.Sparql; | |||||
import rdf.Triple; | |||||
import rdf.TypeMapping; | |||||
public class SemanticItemMapping { | |||||
public HashMap<Word, ArrayList<EntityMapping>> entityDictionary = new HashMap<Word, ArrayList<EntityMapping>>(); | |||||
public static int k = 10; // useless now | |||||
public static int t = 10; // Depth of enumerating candidates of each node/edge. O(t^n). | |||||
ArrayList<Sparql> rankedSparqls = new ArrayList<Sparql>(); | |||||
HashSet<String> checkedSparqlStrs = new HashSet<String>(); | |||||
public ArrayList<ArrayList<EntityMapping>> entityPhrasesList = new ArrayList<ArrayList<EntityMapping>>(); | |||||
public ArrayList<Word> entityWordList = new ArrayList<Word>(); | |||||
public HashMap<Integer, EntityMapping> currentEntityMappings = new HashMap<Integer, EntityMapping>(); | |||||
public ArrayList<ArrayList<PredicateMapping>> predicatePhraseList = new ArrayList<ArrayList<PredicateMapping>>(); | |||||
public ArrayList<SemanticRelation> predicateSrList = new ArrayList<SemanticRelation>(); | |||||
public HashMap<Integer, PredicateMapping> currentPredicateMappings = new HashMap<Integer, PredicateMapping>(); | |||||
public HashMap<Integer, SemanticRelation> semanticRelations = null; | |||||
public QueryLogger qlog = null; | |||||
public EntityFragmentDict efd = new EntityFragmentDict(); | |||||
public boolean isAnswerFound = false; | |||||
public int tripleCheckCallCnt = 0; | |||||
public int sparqlCheckCallCnt = 0; | |||||
public int sparqlCheckId = 0; | |||||
SemanticRelation firstFalseSr = null; | |||||
long tripleCheckTime = 0; | |||||
long sparqlCheckTime = 0; | |||||
/* | |||||
* A best-first top-down method: enumerate all possible query graphs and sort them.
* Notice, we use fragment checking to simulate graph matching and generate the TOP-k SPARQL queries, which can then be executed via GStore or Virtuoso.
* */ | |||||
public void process(QueryLogger qlog, HashMap<Integer, SemanticRelation> semRltn) | |||||
{ | |||||
semanticRelations = semRltn; | |||||
this.qlog = qlog; | |||||
long t1; | |||||
t = 10; // Notice, t is adjustable. | |||||
entityPhrasesList.clear(); | |||||
entityWordList.clear(); | |||||
currentEntityMappings.clear(); | |||||
predicatePhraseList.clear(); | |||||
predicateSrList.clear(); | |||||
currentPredicateMappings.clear(); | |||||
// 1. Collect info of constant nodes (entities).
Iterator<Map.Entry<Integer, SemanticRelation>> it = semanticRelations.entrySet().iterator(); | |||||
while(it.hasNext()) | |||||
{ | |||||
Map.Entry<Integer, SemanticRelation> entry = it.next(); | |||||
SemanticRelation sr = entry.getValue(); | |||||
// We currently handle only Entity and Type constants. TODO: consider Literals.
if(sr.isArg1Constant && !sr.arg1Word.mayType && !sr.arg1Word.mayEnt || sr.isArg2Constant && !sr.arg2Word.mayType && !sr.arg2Word.mayEnt) | |||||
{ | |||||
it.remove(); | |||||
continue; | |||||
} | |||||
// Type constants are handled in scoringAndRanking().
if(sr.isArg1Constant && sr.arg1Word.mayEnt) | |||||
{ | |||||
if(!entityDictionary.containsKey(sr.arg1Word)) | |||||
entityDictionary.put(sr.arg1Word, sr.arg1Word.emList); | |||||
entityPhrasesList.add(sr.arg1Word.emList); | |||||
entityWordList.add(sr.arg1Word); | |||||
} | |||||
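// Note: after the filter above, isArg2Constant && !mayType implies mayEnt,
// so this branch mirrors the arg1 case.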
if(sr.isArg2Constant && !sr.arg2Word.mayType) | |||||
{ | |||||
if (!entityDictionary.containsKey(sr.arg2Word)) | |||||
entityDictionary.put(sr.arg2Word, sr.arg2Word.emList); | |||||
entityPhrasesList.add(sr.arg2Word.emList); | |||||
entityWordList.add(sr.arg2Word); | |||||
} | |||||
} | |||||
// 2. Collect info of edges (relations).
for (Integer key : semanticRelations.keySet()) | |||||
{ | |||||
SemanticRelation sr = semanticRelations.get(key); | |||||
predicatePhraseList.add(sr.predicateMappings); | |||||
predicateSrList.add(sr); | |||||
// Reduce t when structure enumeration is needed.
if(Globals.evaluationMethod > 1 && !sr.isSteadyEdge) | |||||
t = 5; | |||||
} | |||||
// 3. top-k join | |||||
t1 = System.currentTimeMillis(); | |||||
if(semanticRelations.size()>0) | |||||
topkJoin(semanticRelations); | |||||
else | |||||
System.out.println("No Valid SemanticRelations."); | |||||
qlog.timeTable.put("TopkJoin", (int)(System.currentTimeMillis()-t1)); | |||||
qlog.timeTable.put("TripleCheck", (int)tripleCheckTime); | |||||
qlog.timeTable.put("SparqlCheck", (int)sparqlCheckTime); | |||||
Collections.sort(rankedSparqls); | |||||
// Notice, use addAll because we may have more than one node recognition decision. | |||||
qlog.rankedSparqls.addAll(rankedSparqls); | |||||
qlog.entityDictionary = entityDictionary; | |||||
System.out.println("Check query graph count: " + tripleCheckCallCnt + "\nPass single check: " + sparqlCheckCallCnt + "\nPass final check: " + rankedSparqls.size()); | |||||
System.out.println("TopkJoin time=" + qlog.timeTable.get("TopkJoin")); | |||||
} | |||||
public void topkJoin (HashMap<Integer, SemanticRelation> semanticRelations) | |||||
{ | |||||
dfs_entityName(0); | |||||
} | |||||
// Each recursion level fixes the mapping of ONE certain entity.
public void dfs_entityName (int level_i) | |||||
{ | |||||
// All entities ready. | |||||
if (level_i == entityPhrasesList.size()) | |||||
{ | |||||
dfs_predicate(0); | |||||
return; | |||||
} | |||||
ArrayList<EntityMapping> list = entityPhrasesList.get(level_i); | |||||
Word w = entityWordList.get(level_i); | |||||
int tcount = 0; | |||||
for(EntityMapping em : list) | |||||
{ | |||||
if (tcount == t || isAnswerFound) break; | |||||
currentEntityMappings.put(w.hashCode(), em); | |||||
dfs_entityName(level_i+1); | |||||
currentEntityMappings.remove(w.hashCode()); | |||||
tcount ++; | |||||
} | |||||
} | |||||
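// Each level below fixes the predicate of one semantic relation. Relations
// that depend on another relation are skipped here and reuse that relation's
// mapping later in scoringAndRanking().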
public void dfs_predicate(int level_i) | |||||
{ | |||||
// All entities & predicates ready; start generating SPARQL.
if (level_i == predicatePhraseList.size()) | |||||
{ | |||||
scoringAndRanking(); | |||||
return; | |||||
} | |||||
ArrayList<PredicateMapping> list = predicatePhraseList.get(level_i); | |||||
SemanticRelation sr = predicateSrList.get(level_i); | |||||
if (sr.dependOnSemanticRelation != null) | |||||
{ | |||||
dfs_predicate(level_i+1); | |||||
} | |||||
else | |||||
{ | |||||
int tcount=0; | |||||
for (PredicateMapping pm : list) | |||||
{ | |||||
if (tcount==t || isAnswerFound) break; | |||||
currentPredicateMappings.put(sr.hashCode(), pm); | |||||
dfs_predicate(level_i+1); | |||||
currentPredicateMappings.remove(sr.hashCode()); | |||||
tcount++; | |||||
// Pruning: if we do not change the predicate of firstFalseSr, it will still fail, so just return.
if(firstFalseSr != null) | |||||
{ | |||||
if(firstFalseSr != sr) return; | |||||
else firstFalseSr = null; | |||||
} | |||||
} | |||||
// "null" means we drop this edge, this is how we enumerate structure. | |||||
if(Globals.evaluationMethod == 2 && sr.isSteadyEdge == false) | |||||
{ | |||||
currentPredicateMappings.put(sr.hashCode(), null); | |||||
dfs_predicate(level_i+1); | |||||
currentPredicateMappings.remove(sr.hashCode()); | |||||
tcount++; | |||||
} | |||||
} | |||||
} | |||||
/*
 * Run this function when all nodes/edges have been assigned values (through currentEntityMappings and currentPredicateMappings).
 * Generate SPARQL according to the current ENTs and RELATIONs, then run fragment checking.
 * Notice: add embedded type information:
 * eg, ?who <height> ?how --add--> ?who <type1> <Person> | ?book <author> <Tom> --add--> ?book <type1> <Book>
 * Notice: add constant type information:
 * eg, ask: <YaoMing> <type1> <BasketballPlayer>
 * Notice: add embedded triple information:
 * eg, ?Canadians <residence> <United_States> --add--> ?Canadians <birthPlace> <Canada>
 * */
public void scoringAndRanking() | |||||
{ | |||||
firstFalseSr = null; | |||||
Sparql sparql = new Sparql(semanticRelations); | |||||
// A simple way to judge connectivity (may be incorrect when the number of nodes >= 6).
//TODO: a standard method to judge CONNECTIVITY | |||||
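// Counterexample for the heuristic (hypothetical): two disjoint triangles give
// 6 nodes that all have degree 2, so neither the node-count test nor the
// "both endpoints have degree 1" test below fires, yet the graph is disconnected.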
HashMap<Integer, Integer> count = new HashMap<Integer, Integer>(); | |||||
int edgeCnt = 0; | |||||
for (Integer key : semanticRelations.keySet()) | |||||
{ | |||||
SemanticRelation sr = semanticRelations.get(key); | |||||
if(currentPredicateMappings.get(sr.hashCode()) == null) | |||||
continue; | |||||
edgeCnt++; | |||||
int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode(); | |||||
if(!count.containsKey(v1)) | |||||
count.put(v1, 1); | |||||
else | |||||
count.put(v1, count.get(v1)+1); | |||||
if(!count.containsKey(v2)) | |||||
count.put(v2, 1); | |||||
else | |||||
count.put(v2, count.get(v2)+1); | |||||
} | |||||
if(count.size() < qlog.semanticUnitList.size()) | |||||
return; | |||||
if(edgeCnt == 0) | |||||
return; | |||||
if(edgeCnt > 1) | |||||
{ | |||||
for (Integer key : semanticRelations.keySet()) | |||||
{ | |||||
SemanticRelation sr = semanticRelations.get(key); | |||||
if(currentPredicateMappings.get(sr.hashCode()) == null) | |||||
continue; | |||||
int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode(); | |||||
if(count.get(v1) == 1 && count.get(v2) == 1) | |||||
return; | |||||
} | |||||
} | |||||
// Now the graph is connected, start to generate SPARQL. | |||||
HashSet<String> typeSetFlag = new HashSet<String>(); | |||||
for (Integer key : semanticRelations.keySet()) | |||||
{ | |||||
SemanticRelation sr = semanticRelations.get(key); | |||||
String sub, obj; | |||||
int subjId = -1, objId = -1; | |||||
int pid; | |||||
double score = 1; | |||||
boolean isSubjObjOrderSameWithSemRltn = true; | |||||
// argument1 | |||||
if(sr.isArg1Constant && (sr.arg1Word.mayEnt || sr.arg1Word.mayType) ) // Constant | |||||
{ | |||||
// For subject, entity has higher priority. | |||||
if(sr.arg1Word.mayEnt) | |||||
{ | |||||
EntityMapping em = currentEntityMappings.get(sr.arg1Word.hashCode()); | |||||
subjId = em.entityID; | |||||
sub = em.entityName; | |||||
score *= em.score; | |||||
} | |||||
else | |||||
{ | |||||
TypeMapping tm = sr.arg1Word.tmList.get(0); | |||||
subjId = Triple.TYPE_ROLE_ID; | |||||
sub = tm.typeName; | |||||
score *= (tm.score*100); // Scale alignment: type scores lie in [0,1], entity scores in [0,100].
} | |||||
} | |||||
else // Variable | |||||
{ | |||||
subjId = Triple.VAR_ROLE_ID; | |||||
sub = "?" + sr.arg1Word.originalForm; | |||||
} | |||||
// Embedded Type info of argument1 (variable type) | eg, ?book <type> <Book>
// Notice, mayType & mayExtendVariable are mutually exclusive. (see constantVariableRecognition)
// Notice, we do NOT consider types of [?who, ?where...] now.
Triple subt = null; | |||||
if (!sr.isArg1Constant && sr.arg1Word.mayType && sr.arg1Word.tmList != null && sr.arg1Word.tmList.size() > 0 && !typeSetFlag.contains(sub)) | |||||
{ | |||||
StringBuilder type = new StringBuilder(""); | |||||
for (TypeMapping tm: sr.arg1Word.tmList) | |||||
{ | |||||
Integer tt = tm.typeID; | |||||
if(tt != -1) | |||||
type.append(TypeFragment.typeId2ShortName.get(tt)); | |||||
else | |||||
type.append(tm.typeName); | |||||
type.append('|'); | |||||
} | |||||
String ttt = type.substring(0, type.length()-1); | |||||
subt = new Triple(subjId, sub, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10); | |||||
subt.typeSubjectWord = sr.arg1Word; | |||||
if(sr.arg1Word.tmList.get(0).prefferdRelation == -1) | |||||
subt = null; | |||||
} | |||||
// predicate | |||||
SemanticRelation dep = sr.dependOnSemanticRelation; | |||||
PredicateMapping pm = null; | |||||
if (dep == null) | |||||
pm = currentPredicateMappings.get(sr.hashCode()); | |||||
else | |||||
pm = currentPredicateMappings.get(dep.hashCode()); | |||||
if(pm == null) | |||||
continue; | |||||
pid = pm.pid; | |||||
score *= pm.score; | |||||
// argument2 | |||||
if(sr.isArg2Constant && (sr.arg2Word.mayEnt || sr.arg2Word.mayType) ) | |||||
{ | |||||
if(!sr.arg2Word.mayType) | |||||
{ | |||||
EntityMapping em = currentEntityMappings.get(sr.arg2Word.hashCode()); | |||||
objId = em.entityID; | |||||
obj = em.entityName; | |||||
score *= em.score; | |||||
} | |||||
else | |||||
{ | |||||
TypeMapping tm = sr.arg2Word.tmList.get(0); | |||||
objId = Triple.TYPE_ROLE_ID; | |||||
obj = tm.typeName; | |||||
score *= (tm.score*100); | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
objId = Triple.VAR_ROLE_ID; | |||||
obj = "?" + sr.arg2Word.getFullEntityName(); | |||||
} | |||||
// Type info of argument2 | |||||
Triple objt = null; | |||||
if (sr.arg2Word.tmList != null && sr.arg2Word.tmList.size() > 0 && !typeSetFlag.contains(obj) && !sr.isArg2Constant) | |||||
{ | |||||
StringBuilder type = new StringBuilder(""); | |||||
for (TypeMapping tm : sr.arg2Word.tmList) | |||||
{ | |||||
Integer tt = tm.typeID; | |||||
if(tt != -1) | |||||
type.append(TypeFragment.typeId2ShortName.get(tt)); | |||||
else | |||||
type.append(tm.typeName); | |||||
type.append('|'); | |||||
} | |||||
String ttt = type.substring(0, type.length()-1); | |||||
objt = new Triple(objId, obj, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10); | |||||
objt.typeSubjectWord = sr.arg2Word; | |||||
if(sr.arg2Word.tmList.get(0).prefferdRelation == -1) | |||||
objt = null; | |||||
} | |||||
// Prune. | |||||
if(objId == Triple.TYPE_ROLE_ID && pid != Globals.pd.typePredicateID) | |||||
return; | |||||
// Reconsider subject/object order for LITERAL relations when at least one argument has TYPE info.
if (RelationFragment.isLiteral(pid) && (subt != null || objt != null)) | |||||
{ | |||||
if (sub.startsWith("?") && obj.startsWith("?")) // two variables | |||||
{ | |||||
// Either variable could be the literal object.
if (subt != null) { | |||||
subt.object += ("|" + "literal_HRZ"); | |||||
} | |||||
if (objt != null) { | |||||
objt.object += ("|" + "literal_HRZ"); | |||||
} | |||||
if (subt==null && objt!=null) | |||||
{ | |||||
// If the object has a type but the subject does not, swapping sub/obj is more likely correct, because a literal generally has no type [though it may have a yago:type].
String temp = sub; | |||||
int tmpId = subjId; | |||||
sub = obj; | |||||
subjId = objId; | |||||
obj = temp; | |||||
objId = tmpId; | |||||
isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn; | |||||
} | |||||
} | |||||
else if (sub.startsWith("?") && !obj.startsWith("?")) { | |||||
// need to change the subj/obj order
if (subt != null) { | |||||
subt.object += ("|" + "literal_HRZ"); | |||||
} | |||||
String temp = sub; | |||||
int tmpId = subjId; | |||||
sub = obj; | |||||
subjId = objId; | |||||
obj = temp; | |||||
objId = tmpId; | |||||
isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn; | |||||
//System.out.println("here: "+sub+obj); | |||||
} | |||||
else if (obj.startsWith("?") && !sub.startsWith("?")) { | |||||
if (objt != null) { | |||||
objt.object += ("|" + "literal_HRZ"); | |||||
} | |||||
} | |||||
} | |||||
Triple t = new Triple(subjId, sub, pid, objId, obj, sr, score,isSubjObjOrderSameWithSemRltn); | |||||
//System.out.println("triple: "+t+" "+isTripleCompatibleCanSwap(t)); | |||||
sparql.addTriple(t); | |||||
// The score of the subject/object's type triple should correlate with the score of the triple itself.
if (subt != null) | |||||
{ | |||||
subt.score += t.score*0.2; | |||||
sparql.addTriple(subt); | |||||
typeSetFlag.add(subt.subject); // Careful NOT to use sub here; the subj/obj order may have been swapped.
} | |||||
if (objt != null) | |||||
{ | |||||
objt.score += t.score*0.2; | |||||
sparql.addTriple(objt); | |||||
typeSetFlag.add(objt.subject); | |||||
} | |||||
// Add the argument's embedded triple, eg, ?canadian <birthPlace> <Canada>
if(!sr.isArg1Constant && sr.arg1Word.mayExtendVariable && sr.arg1Word.embbededTriple != null) | |||||
{ | |||||
sparql.addTriple(sr.arg1Word.embbededTriple); | |||||
} | |||||
if(!sr.isArg2Constant && sr.arg2Word.mayExtendVariable && sr.arg2Word.embbededTriple != null) | |||||
{ | |||||
sparql.addTriple(sr.arg2Word.embbededTriple); | |||||
} | |||||
sparql.adjustTriplesOrder(); | |||||
} | |||||
// deduplicate | |||||
sparql.deduplicate(); | |||||
if(checkedSparqlStrs.contains(sparql.toStringForGStore2())) | |||||
return; | |||||
checkedSparqlStrs.add(sparql.toStringForGStore2()); | |||||
if (!qlog.MODE_fragment) { | |||||
// Method 1: do NOT check compatibility | |||||
rankedSparqls.add(sparql); | |||||
isAnswerFound = true; | |||||
} | |||||
else { | |||||
// Method 2: check compatibility by FRAGMENT (offline index) | |||||
//1. Single-triple check (a quick prune); subject and object may be swapped to reach the best order.
tripleCheckCallCnt++; | |||||
long t1 = System.currentTimeMillis(); | |||||
for (Triple t : sparql.tripleList) | |||||
if(t.predicateID!=Globals.pd.typePredicateID && !isTripleCompatibleCanSwap(t)) | |||||
{ | |||||
firstFalseSr = t.semRltn; | |||||
return; | |||||
} | |||||
tripleCheckTime += (System.currentTimeMillis()-t1); | |||||
//2. SPARQL check (considers the interaction between all triples); subject and object may be swapped.
t1 = System.currentTimeMillis(); | |||||
sparqlCheckCallCnt++; | |||||
enumerateSubjObjOrders(sparql, new Sparql(sparql.semanticRelations), 0); | |||||
sparqlCheckTime += (System.currentTimeMillis()-t1); | |||||
} | |||||
} | |||||
/*
 * Notice: typeId == -1 means there is no data fragment.
 * */
public static TypeFragment getTypeFragmentByWord(Word word) | |||||
{ | |||||
TypeFragment tf = null; | |||||
if(word!=null && word.tmList!=null && word.tmList.size()>0) | |||||
{ | |||||
int typeId = word.tmList.get(0).typeID; | |||||
if(typeId != -1) | |||||
tf = TypeFragment.typeFragments.get(typeId); | |||||
} | |||||
return tf; | |||||
} | |||||
/*
 * (Only a PRE-CHECK [single-triple check] happens in this function; the final check is in enumerateSubjObjOrders, which utilizes more INDEX information.)
 * Notice: triples with predicate = type must not enter this function.
 * */
public boolean isTripleCompatibleCanSwap (Triple t) { | |||||
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) | |||||
{ | |||||
if (fragmentCompatible2(t.subjId, t.predicateID, t.objId) > | |||||
fragmentCompatible2(t.objId, t.predicateID, t.subjId)) | |||||
t.swapSubjObjOrder(); | |||||
if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) | |||||
return true; | |||||
return false; | |||||
} | |||||
else | |||||
{ | |||||
//var & var | |||||
if(t.subject.startsWith("?") && t.object.startsWith("?")) | |||||
{ | |||||
Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord(); | |||||
TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord); | |||||
//Based on whether the two variables' type fragments' in/out edges contain the predicate, decide whether the order needs changing.
//Just vote.
int nowOrderCnt = 0, reverseOrderCnt = 0; | |||||
if(subjTf == null || subjTf.outEdges.contains(t.predicateID)) | |||||
nowOrderCnt ++; | |||||
if(objTf == null || objTf.inEdges.contains(t.predicateID)) | |||||
nowOrderCnt ++; | |||||
if(subjTf == null || subjTf.inEdges.contains(t.predicateID)) | |||||
reverseOrderCnt ++; | |||||
if(objTf == null || objTf.outEdges.contains(t.predicateID)) | |||||
reverseOrderCnt ++; | |||||
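// Illustrative vote (hypothetical predicate/type names): for ?film <director> ?person,
// if the Film fragment's outEdges and the Person fragment's inEdges both contain
// <director>, nowOrderCnt = 2; if neither fragment supports the reversed
// direction, reverseOrderCnt stays low and the current order is kept.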
if(nowOrderCnt<2 && reverseOrderCnt<2) | |||||
return false; | |||||
else if(nowOrderCnt > reverseOrderCnt) | |||||
{ | |||||
// do nothing | |||||
} | |||||
else if(reverseOrderCnt > nowOrderCnt) | |||||
{ | |||||
t.swapSubjObjOrder(); | |||||
} | |||||
else // Both the current and the reversed order pass type-fragment checking; need to SELECT one.
{ | |||||
//rule1: ?inventor <occupation> ?occupation || ... <name> ?name -> the argument whose name is more similar to the predicate is placed as the object.
String p = Globals.pd.getPredicateById(t.predicateID); | |||||
int ed1 = EntityFragment.calEditDistance(subjWord.baseForm, p); | |||||
int ed2 = EntityFragment.calEditDistance(objWord.baseForm, p); | |||||
if(ed1 < ed2) | |||||
{ | |||||
t.swapSubjObjOrder(); | |||||
} | |||||
} | |||||
return true; | |||||
} | |||||
// ent & ent || var & ent
else | |||||
{ | |||||
boolean flag = false; | |||||
if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) { | |||||
flag = true; | |||||
} | |||||
else if (fragmentCompatible(t.objId, t.predicateID, t.subjId)) { | |||||
t.swapSubjObjOrder(); | |||||
flag = true; | |||||
} | |||||
// Var & Ent | ?city <type1> <City> & <Chile_Route_68> <country> ?city : <country> is invalid for City | Notice: the data is often dirty, so this pruning may be inaccurate.
if(flag == true && (t.subject.startsWith("?") || t.object.startsWith("?"))) | |||||
{ | |||||
Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord(); | |||||
TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord); | |||||
if(subjTf != null) | |||||
{ | |||||
if(subjTf.outEdges.contains(t.predicateID)) | |||||
flag = true; | |||||
else if(subjTf.inEdges.contains(t.predicateID)) | |||||
{ | |||||
t.swapSubjObjOrder(); | |||||
flag = true; | |||||
} | |||||
else | |||||
flag = false; | |||||
} | |||||
else if(objTf != null) | |||||
{ | |||||
if(objTf.inEdges.contains(t.predicateID)) | |||||
flag = true; | |||||
else if(objTf.outEdges.contains(t.predicateID)) | |||||
{ | |||||
t.swapSubjObjOrder(); | |||||
flag = true; | |||||
} | |||||
else | |||||
flag = false; | |||||
} | |||||
} | |||||
return flag; | |||||
} | |||||
} | |||||
} | |||||
public boolean isTripleCompatibleNotSwap (Triple t) { | |||||
if (t.predicateID == Globals.pd.typePredicateID) { | |||||
return true; | |||||
} | |||||
else if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) { | |||||
return true; | |||||
} | |||||
else { | |||||
return false; | |||||
} | |||||
} | |||||
public boolean fragmentCompatible (int id1, int pid, int id2) { | |||||
EntityFragment ef1 = efd.getEntityFragmentByEid(id1); | |||||
EntityFragment ef2 = efd.getEntityFragmentByEid(id2); | |||||
// A valid entity MUST have a fragment.
if (id1!=Triple.TYPE_ROLE_ID && id1!=Triple.VAR_ROLE_ID && ef1 == null) return false; | |||||
if (id2!=Triple.TYPE_ROLE_ID && id2!=Triple.VAR_ROLE_ID && ef2 == null) return false; | |||||
boolean ef1_constant = (ef1==null)?false:true; | |||||
boolean ef2_constant = (ef2==null)?false:true; | |||||
int entityCnt=0,compatibleCnt=0; | |||||
if(ef1_constant) { | |||||
entityCnt++; | |||||
if (ef1.outEdges.contains(pid)) | |||||
compatibleCnt++; | |||||
// else // <e1,p> is a false pair
// { | |||||
// falseEntPres.add(new Pair(id1,pid)); | |||||
// } | |||||
} | |||||
if (ef2_constant) { | |||||
entityCnt++; | |||||
if (ef2.inEdges.contains(pid)) | |||||
compatibleCnt++; | |||||
// else // <p,e2> is a false pair
// { | |||||
// falsePreEnts.add(new Pair(pid,id2)); | |||||
// } | |||||
} | |||||
// For SELECT sparql, an EXACT match between the predicate and both subject and object is required; ASK sparql can be relaxed.
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) | |||||
return entityCnt-compatibleCnt<=1; | |||||
else | |||||
return entityCnt==compatibleCnt; | |||||
} | |||||
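// Returns the number of constant arguments whose fragment does NOT support pid
// in the given direction (0 = fully compatible). Used above to pick the better
// subject/object order for general questions.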
public int fragmentCompatible2 (int id1, int pid, int id2) { | |||||
EntityFragment ef1 = efd.getEntityFragmentByEid(id1); | |||||
EntityFragment ef2 = efd.getEntityFragmentByEid(id2); | |||||
int entityCnt=0,compatibleCnt=0; | |||||
if(id1 != Triple.VAR_ROLE_ID && id1 != Triple.TYPE_ROLE_ID) { | |||||
entityCnt++; | |||||
if (ef1!=null && ef1.outEdges.contains(pid)) | |||||
compatibleCnt++; | |||||
} | |||||
if (id2 != Triple.VAR_ROLE_ID && id2 != Triple.TYPE_ROLE_ID) { | |||||
entityCnt++; | |||||
if (ef2!=null && ef2.inEdges.contains(pid)) | |||||
compatibleCnt++; | |||||
} | |||||
return entityCnt-compatibleCnt; | |||||
} | |||||
public boolean checkConstantConsistency (Sparql spql) { | |||||
HashMap<String, String> constants = new HashMap<String, String>(); | |||||
for (Triple t : spql.tripleList) { | |||||
if (!t.subject.startsWith("?")) { | |||||
String e = t.getSubjectWord().getFullEntityName(); | |||||
if (!constants.containsKey(e)) | |||||
constants.put(e, t.subject); | |||||
else { | |||||
if (!constants.get(e).equals(t.subject)) | |||||
return false; | |||||
} | |||||
} | |||||
if (!t.object.startsWith("?")) { | |||||
String e = t.getObjectWord().getFullEntityName(); | |||||
if (!constants.containsKey(e)) | |||||
constants.put(e, t.object); | |||||
else { | |||||
if (!constants.get(e).equals(t.object)) | |||||
return false; | |||||
} | |||||
} | |||||
} | |||||
return true; | |||||
} | |||||
public void reviseScoreByTripleOrders(Sparql spq) | |||||
{ | |||||
Triple shouldDel = null; | |||||
for(Triple triple: spq.tripleList) | |||||
{ | |||||
// eg, ?who <president> <United_States_Navy> need punished (or dropped). | |||||
if(triple.subject.toLowerCase().equals("?who")) | |||||
{ | |||||
String rel = Globals.pd.id_2_predicate.get(triple.predicateID); | |||||
if(rel.equals("president") || rel.equals("starring") || rel.equals("producer")) | |||||
{ | |||||
spq.score -= triple.score; | |||||
triple.score /= 10; | |||||
spq.score += triple.score; | |||||
if(triple.semRltn!=null && triple.semRltn.isSteadyEdge == false) | |||||
shouldDel = triple; | |||||
} | |||||
} | |||||
} | |||||
if(shouldDel != null) | |||||
spq.delTriple(shouldDel); | |||||
} | |||||
// Enumerate subject/object orders and run fragment checks.
// Also adjusts the scores of single-triple ASK queries.
public boolean enumerateSubjObjOrders (Sparql originalSpq, Sparql currentSpq, int level) | |||||
{ | |||||
if (level == originalSpq.tripleList.size()) | |||||
{ | |||||
if(currentSpq.tripleList.size() == 0) | |||||
return false; | |||||
CompatibilityChecker cc = new CompatibilityChecker(efd); | |||||
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) // ASK-type sparql: kept regardless of the fragment check; the check only rewards the score.
{ | |||||
if(cc.isSparqlCompatible3(currentSpq)) //reward score for "TRUE" | |||||
{ | |||||
for(Triple triple: currentSpq.tripleList) | |||||
triple.addScore(triple.getScore()); | |||||
} | |||||
rankedSparqls.add(currentSpq.copy()); | |||||
return true; | |||||
} | |||||
try | |||||
{ | |||||
sparqlCheckId++; | |||||
if (cc.isSparqlCompatible3(currentSpq)) | |||||
{ | |||||
//eg, ?who <president> <United_States_Navy> | |||||
//When the query graph contains a cycle, we just prune this edge.
Sparql sparql = currentSpq.copy(); | |||||
reviseScoreByTripleOrders(sparql); | |||||
if(!rankedSparqls.contains(sparql)) | |||||
rankedSparqls.add(sparql); | |||||
return true; | |||||
} | |||||
} | |||||
catch (Exception e) { | |||||
System.out.println("[CompatibilityChecker ERROR]"+currentSpq); | |||||
e.printStackTrace(); | |||||
} | |||||
return false; | |||||
} | |||||
Triple cur_t = originalSpq.tripleList.get(level); | |||||
// first try default order | |||||
currentSpq.addTriple(cur_t); | |||||
boolean flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); | |||||
currentSpq.removeLastTriple(); | |||||
// !deprecated: do not change triple order for [literal relation]
// if (RelationFragment.isLiteral(cur_t.predicateID)) return false; | |||||
// Enumerate keeping/dropping the type info.
if (cur_t.predicateID == Globals.pd.typePredicateID) | |||||
{ | |||||
flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); | |||||
return flag; | |||||
} | |||||
else | |||||
{ | |||||
// single triple check after swap | |||||
Triple swapped_t = cur_t.copySwap(); | |||||
swapped_t.score = swapped_t.score*0.8; | |||||
if (isTripleCompatibleNotSwap(swapped_t)) | |||||
{ | |||||
currentSpq.addTriple(swapped_t); | |||||
flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); | |||||
currentSpq.removeLastTriple(); | |||||
} | |||||
return flag; | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,208 @@ | |||||
package qa.parsing; | |||||
import org.maltparser.core.exception.MaltChainedException; | |||||
import log.QueryLogger; | |||||
import nlp.ds.DependencyTree; | |||||
import nlp.ds.DependencyTreeNode; | |||||
import nlp.ds.Word; | |||||
import nlp.ds.Sentence.SentenceType; | |||||
import qa.Globals; | |||||
import rdf.Sparql; | |||||
import rdf.Triple; | |||||
public class QuestionParsing { | |||||
public void process(QueryLogger qlog) { | |||||
getDependenciesAndNER(qlog); | |||||
recognizeSentenceType(qlog); | |||||
} | |||||
public void getDependenciesAndNER (QueryLogger qlog) { | |||||
long t1 = System.currentTimeMillis(); | |||||
try { | |||||
qlog.s.dependencyTreeStanford = new DependencyTree(qlog.s, Globals.stanfordParser); | |||||
}catch(Exception e){ | |||||
e.printStackTrace(); | |||||
} | |||||
long t2 = System.currentTimeMillis(); | |||||
try{ | |||||
qlog.s.dependencyTreeMalt = new DependencyTree(qlog.s, Globals.maltParser); | |||||
}catch(Exception e){ | |||||
//if errors occur, abandon malt tree | |||||
qlog.s.dependencyTreeMalt = qlog.s.dependencyTreeStanford; | |||||
System.err.println("MALT parser error! Use stanford parser instead."); | |||||
} | |||||
try { | |||||
long t3 = System.currentTimeMillis(); | |||||
Globals.nerRecognizer.recognize(qlog.s); | |||||
long t4 = System.currentTimeMillis(); | |||||
System.out.println("====StanfordDependencies("+(t2-t1)+"ms)===="); | |||||
System.out.println(qlog.s.dependencyTreeStanford); | |||||
System.out.println("====MaltDependencies("+(t3-t2)+"ms)===="); | |||||
System.out.println(qlog.s.dependencyTreeMalt); | |||||
System.out.println("====NameEntityRecognition("+(t4-t3)+"ms)===="); | |||||
qlog.s.printNERResult(); | |||||
qlog.timeTable.put("StanfordParser", (int)(t2-t1)); | |||||
qlog.timeTable.put("MaltParser", (int)(t3-t2)); | |||||
qlog.timeTable.put("NER", (int)(t4-t3)); | |||||
} catch (Exception e) { | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
public void recognizeSentenceType(QueryLogger qlog) | |||||
{ | |||||
boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford)|| | |||||
recognizeImperativeSentence(qlog.s.dependencyTreeMalt); | |||||
if (IsImperativeSentence) | |||||
{ | |||||
qlog.s.sentenceType = SentenceType.ImperativeSentence; | |||||
//the ignored words of the two dependency trees should agree
for (DependencyTreeNode sNode : qlog.s.dependencyTreeStanford.nodesList) | |||||
for (DependencyTreeNode mNode : qlog.s.dependencyTreeMalt.nodesList) | |||||
if (sNode.equals(mNode) && (sNode.word.isIgnored||mNode.word.isIgnored)) | |||||
sNode.word.isIgnored = mNode.word.isIgnored = true; | |||||
return; | |||||
} | |||||
boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford)|| | |||||
recognizeSpecialQuestion(qlog.s.dependencyTreeMalt); | |||||
if (IsSpecialQuestion) | |||||
{ | |||||
qlog.s.sentenceType = SentenceType.SpecialQuestion; | |||||
return; | |||||
} | |||||
boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford)|| | |||||
recognizeGeneralQuestion(qlog.s.dependencyTreeMalt); | |||||
if (IsGeneralQuestion) | |||||
{ | |||||
qlog.s.sentenceType = SentenceType.GeneralQuestion; | |||||
return; | |||||
} | |||||
//default is special | |||||
qlog.s.sentenceType = SentenceType.SpecialQuestion; | |||||
} | |||||
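// Illustrative (hypothetical) examples of the three types:
//   Imperative: "List all games by GMT." (root verb with dobj)
//   Special:    "In which city was X born?" (W-word at/after a leading preposition)
//   General:    "Do submarines float?" (leading be/do auxiliary)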
//if imperative, omit the polite words
private boolean recognizeImperativeSentence(DependencyTree tree) { | |||||
if(tree.getRoot().word.posTag.startsWith("V") || tree.getRoot().word.posTag.startsWith("NN")) { | |||||
DependencyTreeNode dobj = null; | |||||
DependencyTreeNode iobj = null; | |||||
for (DependencyTreeNode n : tree.getRoot().childrenList) { | |||||
if (n.dep_father2child.equals("dobj")) { | |||||
dobj = n; | |||||
} | |||||
else if (n.dep_father2child.equals("iobj")) { | |||||
iobj = n; | |||||
} | |||||
} | |||||
if (dobj != null && iobj != null) { | |||||
tree.getRoot().word.isIgnored = true; | |||||
iobj.word.isIgnored = true; | |||||
// give me a list of .. | |||||
if (dobj.word.baseForm.equals("list")) | |||||
{ | |||||
dobj.word.isIgnored = true; | |||||
} | |||||
return true; | |||||
} | |||||
//start with "List": List all games by GMT. | |||||
if (dobj != null && tree.getRoot().word.baseForm.equals("list")) | |||||
{ | |||||
//System.out.println("isListSentence!"); | |||||
tree.getRoot().word.isIgnored = true; | |||||
return true; | |||||
} | |||||
} | |||||
return false; | |||||
} | |||||
private boolean recognizeSpecialQuestion(DependencyTree tree) | |||||
{ | |||||
DependencyTreeNode firstNode = null; | |||||
for (DependencyTreeNode dtn : tree.nodesList) | |||||
if (dtn.word.position == 1) | |||||
{ | |||||
firstNode = dtn; | |||||
break; | |||||
} | |||||
//eg. In which city... | |||||
if (firstNode!=null && | |||||
(firstNode.word.posTag.equals("IN")||firstNode.word.posTag.equals("TO"))&& | |||||
firstNode.dep_father2child.startsWith("prep")) | |||||
{ | |||||
firstNode = null; | |||||
for (DependencyTreeNode dtn : tree.nodesList) | |||||
if (dtn.word.position == 2) | |||||
{ | |||||
firstNode = dtn; | |||||
break; | |||||
} | |||||
} | |||||
if (firstNode != null) | |||||
{ | |||||
if (firstNode.word.posTag.startsWith("W")) | |||||
return true; | |||||
} | |||||
return false; | |||||
} | |||||
private boolean recognizeGeneralQuestion(DependencyTree tree) | |||||
{ | |||||
DependencyTreeNode firstNode = null; | |||||
for (DependencyTreeNode dtn : tree.nodesList) | |||||
if (dtn.word.position == 1) | |||||
{ | |||||
firstNode = dtn; | |||||
break; | |||||
} | |||||
if (firstNode != null) | |||||
{ | |||||
String dep = firstNode.dep_father2child; | |||||
String pos = firstNode.word.posTag; | |||||
String baseform = firstNode.word.baseForm; | |||||
if ((baseform.equals("be")||baseform.equals("do")) && | |||||
pos.startsWith("VB") && | |||||
(dep.equals("root")||dep.equals("cop")||dep.startsWith("aux"))) | |||||
return true; | |||||
} | |||||
return false; | |||||
} | |||||
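// Heuristic: among all variables in the SPARQL, take the one whose word
// appears earliest in the sentence as the question focus (answer variable).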
public static String detectQuestionFocus(Sparql spq) { | |||||
String ret = null; | |||||
int posi = Integer.MAX_VALUE; | |||||
for (Triple t : spq.tripleList) { | |||||
if (!t.isSubjConstant()) { | |||||
Word subj = t.getSubjectWord(); | |||||
if (subj!=null && subj.position < posi) { | |||||
posi = subj.position; | |||||
ret = t.subject; | |||||
} | |||||
} | |||||
if (!t.isObjConstant()) { | |||||
Word obj = t.getObjectWord(); | |||||
if (obj!=null && obj.position < posi) { | |||||
posi = obj.position; | |||||
ret = t.object; | |||||
} | |||||
} | |||||
} | |||||
if (ret != null) return ret.replace(' ', '_'); | |||||
else return null; | |||||
} | |||||
} |
@@ -0,0 +1,40 @@ | |||||
package rdf; | |||||
import fgmt.EntityFragment; | |||||
public class EntityMapping implements Comparable<EntityMapping> { | |||||
public int entityID = -1; | |||||
public String entityName = null; | |||||
public double score = 0; | |||||
public EntityFragment entityFragment = null; | |||||
public EntityMapping(int eid, String en, double sco) { | |||||
entityID = eid; | |||||
entityName = en; | |||||
score = sco; | |||||
//penalty if the entity name starts with "?"
if (entityName.startsWith("?")) | |||||
score *=0.5; | |||||
} | |||||
// In descending order: big --> small | |||||
public int compareTo(EntityMapping o) { | |||||
double diff = this.score - o.score; | |||||
if (diff > 0) return -1; | |||||
else if (diff < 0) return 1; | |||||
else return 0; | |||||
} | |||||
public int hashCode() | |||||
{ | |||||
return Integer.hashCode(entityID);
} | |||||
public String toString() | |||||
{ | |||||
return entityName + "(" + score + ")";
} | |||||
} |
@@ -0,0 +1,77 @@ | |||||
package rdf; | |||||
import fgmt.TypeFragment; | |||||
import qa.Globals; | |||||
import lcn.EntityFragmentFields; | |||||
public class ImplicitRelation { | |||||
public String subj = null; | |||||
public String obj = null; | |||||
public int pId = -1; | |||||
public double score = 0; | |||||
//Role: ENTITY, TYPE_CONSTANT, TYPE_VARIABLE, VARIABLE (see roleEnum below)
public enum roleEnum {ENTITY, TYPE_CONSTANT, TYPE_VARIABLE, VARIABLE}; | |||||
public int subjRole = -1; | |||||
public int objRole = -1; | |||||
public int subjId = -1; | |||||
public int objId = -1; | |||||
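// Note: the name-based constructor below assumes s and o are known entity/type
// names; a lookup miss in entityName2Id / typeShortName2IdList would throw here.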
public ImplicitRelation(String s, String o, int pid, double sc) | |||||
{ | |||||
pId = pid; | |||||
subj = s; | |||||
obj = o; | |||||
score = sc; | |||||
subjId = EntityFragmentFields.entityName2Id.get(s); | |||||
if(pId != Globals.pd.typePredicateID) | |||||
objId = EntityFragmentFields.entityName2Id.get(o); | |||||
else | |||||
objId = TypeFragment.typeShortName2IdList.get(o).get(0); | |||||
} | |||||
public ImplicitRelation(Integer sId, Integer oId, int pid, double sc) | |||||
{ | |||||
pId = pid; | |||||
subjId = sId; | |||||
objId = oId; | |||||
score = sc; | |||||
} | |||||
public void setSubjectId(Integer s) | |||||
{ | |||||
subjId = s; | |||||
} | |||||
public void setObjectId(Integer o) | |||||
{ | |||||
objId = o; | |||||
} | |||||
public void setSubject(String s) | |||||
{ | |||||
subj = s; | |||||
} | |||||
public void setObject(String o) | |||||
{ | |||||
obj = o; | |||||
} | |||||
public int hashCode() | |||||
{ | |||||
return Integer.hashCode(pId) ^ Integer.hashCode(subjId) ^ Integer.hashCode(objId);
} | |||||
@Override
public boolean equals(Object ir)
{
if (!(ir instanceof ImplicitRelation)) // guard against foreign types instead of casting blindly
return false;
ImplicitRelation tmpIr = (ImplicitRelation) ir;
return pId == tmpIr.pId && subjId == tmpIr.subjId && objId == tmpIr.objId;
}
} |
@@ -0,0 +1,41 @@ | |||||
package rdf; | |||||
import java.util.ArrayList; | |||||
import rdf.EntityMapping; | |||||
import rdf.TypeMapping; | |||||
public class MergedWord implements Comparable<MergedWord> | |||||
{ | |||||
//original position | |||||
public int st,ed; | |||||
//position after merge (unselected is -1) | |||||
public int mergedPos = -1; | |||||
public String name; | |||||
public boolean mayCategory = false; | |||||
public boolean mayLiteral = false; | |||||
public boolean mayEnt = false; | |||||
public boolean mayType = false; | |||||
public ArrayList<EntityMapping> emList = null; | |||||
public ArrayList<TypeMapping> tmList = null; | |||||
public String category = null; | |||||
public MergedWord(int s,int e,String n) | |||||
{ | |||||
st = s; | |||||
ed = e; | |||||
name = n; | |||||
} | |||||
@Override | |||||
//long to short (longer spans sort first)
public int compareTo(MergedWord o) | |||||
{ | |||||
int lenDiff = (this.ed-this.st) - (o.ed-o.st); | |||||
if (lenDiff > 0) return -1; | |||||
else if (lenDiff < 0) return 1; | |||||
return 0; | |||||
} | |||||
} |
@@ -0,0 +1,24 @@ | |||||
package rdf; | |||||
import java.util.ArrayList; | |||||
public class NodeSelectedWithScore implements Comparable<NodeSelectedWithScore> | |||||
{ | |||||
public ArrayList<Integer> selected; | |||||
int size; //split key to st and ed | |||||
public double score = 0; | |||||
public NodeSelectedWithScore(ArrayList<Integer> a, double b) | |||||
{ | |||||
selected = a; | |||||
score = b; | |||||
} | |||||
// In descending order: big --> small | |||||
public int compareTo(NodeSelectedWithScore o) { | |||||
double diff = this.score - o.score; | |||||
if (diff > 0) return -1; | |||||
else if (diff < 0) return 1; | |||||
else return 0; | |||||
} | |||||
} |
@@ -0,0 +1,28 @@ | |||||
package rdf; | |||||
public class PredicateMapping implements Comparable<PredicateMapping> { | |||||
public int pid = -1; | |||||
public double score = 0; | |||||
public String parapharase = null; | |||||
public PredicateMapping (int pid, double sco, String para) { | |||||
this.pid = pid; | |||||
score = sco; | |||||
parapharase = para; | |||||
} | |||||
// In descending order: big --> small | |||||
public int compareTo(PredicateMapping o) { | |||||
double diff = this.score - o.score; | |||||
if (diff > 0) return -1; | |||||
else if (diff < 0) return 1; | |||||
else return 0; | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
String ret = ""; | |||||
ret = "<"+pid+" : "+parapharase+" : "+score+">"; | |||||
return ret; | |||||
} | |||||
} |
@@ -0,0 +1,180 @@ | |||||
package rdf; | |||||
import java.util.ArrayList; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import qa.Globals; | |||||
import nlp.ds.Word; | |||||
public class SemanticQueryGraph implements Comparable<SemanticQueryGraph> | |||||
{ | |||||
public ArrayList<SemanticUnit> semanticUnitList = null; | |||||
public HashMap<Integer, SemanticRelation> semanticRelations = new HashMap<>(); | |||||
public double score = 0; | |||||
public SemanticQueryGraph(ArrayList<SemanticUnit> suList) | |||||
{ | |||||
semanticUnitList = suList; //TODO: need copy? | |||||
// Calculate Score by a reward function (TODO: using SVM-Rank) | |||||
} | |||||
public SemanticQueryGraph(SemanticQueryGraph head) | |||||
{ | |||||
semanticUnitList = new ArrayList<>(); | |||||
for(SemanticUnit su: head.semanticUnitList) | |||||
semanticUnitList.add(su.copy()); | |||||
score = head.score; | |||||
} | |||||
public void connect(SemanticUnit u, SemanticUnit v) | |||||
{ | |||||
if(u.equals(v)) | |||||
return; | |||||
SemanticUnit su1 = null, su2 = null; | |||||
for(SemanticUnit su: this.semanticUnitList) | |||||
if(su.equals(u)) | |||||
su1 = su; | |||||
else if(su.equals(v)) | |||||
su2 = su; | |||||
if(su1 != null && su2 != null) | |||||
if(!su1.neighborUnitList.contains(su2) && !su2.neighborUnitList.contains(su1)) | |||||
{ | |||||
su1.neighborUnitList.add(su2); | |||||
su2.neighborUnitList.add(su1); | |||||
} | |||||
} | |||||
public void merge(SemanticUnit u, SemanticUnit v) | |||||
{ | |||||
SemanticUnit su1 = null, su2 = null; | |||||
for(SemanticUnit su: this.semanticUnitList) | |||||
if(su.equals(u)) | |||||
su1 = su; | |||||
else if(su.equals(v)) | |||||
su2 = su; | |||||
if(su1 != null && su2 != null) | |||||
{ | |||||
for(SemanticUnit su: this.semanticUnitList) | |||||
if(su != su2 && su.neighborUnitList.contains(su1) && !su.neighborUnitList.contains(su2)) //TODO: Notice, now REJECT multi-edges; The hash function of SR should be modified to allow multi-edges. | |||||
su.neighborUnitList.add(su2); | |||||
this.semanticUnitList.remove(su1); | |||||
su2.neighborUnitList.remove(su1); | |||||
} | |||||
} | |||||
@Override | |||||
public int hashCode() { | |||||
int code = 0; | |||||
for(SemanticUnit su: this.semanticUnitList) | |||||
code ^= su.hashCode(); | |||||
return code; | |||||
} | |||||
@Override | |||||
public boolean equals(Object o) | |||||
{ | |||||
if (o instanceof SemanticQueryGraph) | |||||
{ | |||||
int matchCnt = 0; | |||||
for(SemanticUnit su1: ((SemanticQueryGraph) o).semanticUnitList) | |||||
for(SemanticUnit su2: this.semanticUnitList) | |||||
{ | |||||
if(su1.equals(su2)) | |||||
{ | |||||
if(su1.neighborUnitList.containsAll(su2.neighborUnitList) && su2.neighborUnitList.containsAll(su1.neighborUnitList)) | |||||
matchCnt++; | |||||
} | |||||
} | |||||
if(matchCnt == ((SemanticQueryGraph) o).semanticUnitList.size() && matchCnt == this.semanticUnitList.size()) | |||||
return true; | |||||
} | |||||
return false; | |||||
} | |||||
@Override | |||||
public int compareTo(SemanticQueryGraph o) | |||||
{ | |||||
double diff = this.score - o.score; | |||||
if (diff > 0) return -1; | |||||
else if (diff < 0) return 1; | |||||
else return 0; | |||||
} | |||||
public boolean isFinalState() | |||||
{ | |||||
if(semanticUnitList == null || semanticUnitList.isEmpty()) | |||||
return false; | |||||
// Basic assumption: a final Semantic Query Graph should be Connected. | |||||
HashSet<SemanticUnit> visited = new HashSet<>(); | |||||
SemanticUnit start = semanticUnitList.get(0); | |||||
visited.add(start); | |||||
dfs(start, visited); | |||||
if(visited.size() == semanticUnitList.size()) | |||||
return true; | |||||
return false; | |||||
} | |||||
private void dfs(SemanticUnit headNode, HashSet<SemanticUnit> visited) | |||||
{ | |||||
for(SemanticUnit curNode: headNode.neighborUnitList) | |||||
if(!visited.contains(curNode)) | |||||
{ | |||||
visited.add(curNode); | |||||
dfs(curNode, visited); | |||||
} | |||||
for(SemanticUnit curNode: semanticUnitList) | |||||
{ | |||||
if(curNode.neighborUnitList.contains(headNode) || headNode.neighborUnitList.contains(curNode)) | |||||
{ | |||||
if(!visited.contains(curNode)) | |||||
{ | |||||
visited.add(curNode); | |||||
dfs(curNode, visited); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
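// As implemented here: score = sum of each unit's top entity/type mapping score
// (entity scores scaled by 100) + the average of the top predicate mapping
// score over all edges between units.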
public void calculateScore(HashMap<Integer, SemanticRelation> potentialSemanticRelations) | |||||
{ | |||||
// 1. entity/type score | |||||
double entSco = 0; | |||||
for(SemanticUnit su: this.semanticUnitList) | |||||
{ | |||||
Word w = su.centerWord; | |||||
if(w.mayEnt && w.emList.size()>0) | |||||
entSco += w.emList.get(0).score * 100; | |||||
if(w.mayType && w.tmList.size()>0) | |||||
entSco += w.tmList.get(0).score; | |||||
} | |||||
// 2. relation score | |||||
double relSco = 0; | |||||
int relCnt = 0; | |||||
for(SemanticUnit su1: this.semanticUnitList) | |||||
for(SemanticUnit su2: su1.neighborUnitList) | |||||
{ | |||||
//Deduplicate | |||||
if(su1.centerWord.position > su2.centerWord.position) | |||||
continue; | |||||
relCnt++; | |||||
int key = su1.centerWord.getNnHead().hashCode() ^ su2.centerWord.getNnHead().hashCode(); | |||||
SemanticRelation sr = potentialSemanticRelations.get(key); | |||||
if(sr == null) | |||||
System.err.println("No semantic relation for: " + su1 + " & " + su2); | |||||
else | |||||
{ | |||||
relSco += sr.predicateMappings.get(0).score; | |||||
semanticRelations.put(key, sr); | |||||
} | |||||
} | |||||
if(relCnt > 0) // guard: a unit with no neighbors would otherwise yield NaN
relSco /= relCnt; //average
this.score = entSco + relSco; | |||||
} | |||||
} |
@@ -0,0 +1,171 @@ | |||||
package rdf; | |||||
import java.util.ArrayList; | |||||
import rdf.SimpleRelation; | |||||
import nlp.ds.Word; | |||||
public class SemanticRelation { | |||||
public Word arg1Word = null; | |||||
public Word arg2Word = null; | |||||
public String relationParaphrase = null; // longest match | |||||
public double LongestMatchingScore = 0; // longest match score | |||||
//distinguish copies when a semantic relation is duplicated from a special pattern
public int arg1SuffixId = 0; | |||||
public int arg2SuffixId = 0; | |||||
public Word arg1Word_beforeCRR = null; | |||||
public Word arg2Word_beforeCRR = null; | |||||
public ArrayList<PredicateMapping> predicateMappings = null; | |||||
public boolean isArg1Constant = false; | |||||
public boolean isArg2Constant = false; | |||||
public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules | |||||
public SemanticRelation dependOnSemanticRelation = null; | |||||
public Word preferredSubj = null; | |||||
public boolean isSteadyEdge = true; | |||||
public SemanticRelation(SemanticRelation r2) { | |||||
arg1Word = r2.arg1Word; | |||||
arg2Word = r2.arg2Word; | |||||
relationParaphrase = r2.relationParaphrase; | |||||
LongestMatchingScore = r2.LongestMatchingScore; | |||||
arg1SuffixId = r2.arg1SuffixId; | |||||
arg2SuffixId = r2.arg2SuffixId; | |||||
arg1Word_beforeCRR = r2.arg1Word_beforeCRR; | |||||
arg2Word_beforeCRR = r2.arg2Word_beforeCRR; | |||||
arg1Word.emList = r2.arg1Word.emList; | |||||
arg2Word.emList = r2.arg2Word.emList; | |||||
predicateMappings = r2.predicateMappings; | |||||
// arg1Types = r2.arg1Types; | |||||
// arg2Types = r2.arg2Types; | |||||
isArg1Constant = r2.isArg1Constant; | |||||
isArg2Constant = r2.isArg2Constant; | |||||
extractingMethod = r2.extractingMethod; | |||||
dependOnSemanticRelation = r2.dependOnSemanticRelation; | |||||
preferredSubj = r2.preferredSubj; | |||||
} | |||||
public void swapArg1Arg2() | |||||
{ | |||||
Word tmpWord = arg1Word; | |||||
arg1Word = arg2Word; | |||||
arg2Word = tmpWord; | |||||
int tmpSuffixId = arg1SuffixId; | |||||
arg1SuffixId = arg2SuffixId; | |||||
arg2SuffixId = tmpSuffixId; | |||||
tmpWord = arg1Word_beforeCRR; | |||||
arg1Word_beforeCRR = arg2Word_beforeCRR; | |||||
arg2Word_beforeCRR = tmpWord; | |||||
boolean tmpBool = isArg1Constant; | |||||
isArg1Constant = isArg2Constant; | |||||
isArg2Constant = tmpBool; | |||||
} | |||||
public SemanticRelation (SimpleRelation simr) { | |||||
if (simr.preferredSubj == null) { | |||||
if (simr.arg1Word.compareTo(simr.arg2Word) < 0) { | |||||
this.arg1Word = simr.arg1Word; | |||||
this.arg2Word = simr.arg2Word; | |||||
this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR; | |||||
this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR; | |||||
} | |||||
else { | |||||
this.arg1Word = simr.arg2Word; | |||||
this.arg2Word = simr.arg1Word; | |||||
this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR; | |||||
this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR; | |||||
} | |||||
this.extractingMethod = simr.extractingMethod; | |||||
} | |||||
else { | |||||
if (simr.arg1Word == simr.preferredSubj) { | |||||
this.arg1Word = simr.arg1Word; | |||||
this.arg2Word = simr.arg2Word; | |||||
this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR; | |||||
this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR; | |||||
this.preferredSubj = simr.preferredSubj; | |||||
} | |||||
else { | |||||
this.arg1Word = simr.arg2Word; | |||||
this.arg2Word = simr.arg1Word; | |||||
this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR; | |||||
this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR; | |||||
this.preferredSubj = simr.preferredSubj; | |||||
} | |||||
this.extractingMethod = simr.extractingMethod; | |||||
} | |||||
} | |||||
@Override | |||||
public int hashCode() { | |||||
return arg1Word.hashCode() ^ (arg2Word.hashCode() + arg1SuffixId + arg2SuffixId); // parentheses make the existing precedence explicit ('+' binds tighter than '^')
} | |||||
@Override | |||||
public boolean equals(Object o) { | |||||
if (o instanceof SemanticRelation) { | |||||
SemanticRelation sr2 = (SemanticRelation) o; | |||||
if (this.arg1Word.equals(sr2.arg1Word) | |||||
&& this.arg2Word.equals(sr2.arg2Word) | |||||
&& this.arg1SuffixId == sr2.arg1SuffixId | |||||
&& this.arg2SuffixId == sr2.arg2SuffixId | |||||
&& this.relationParaphrase.equals(sr2.relationParaphrase) | |||||
&& this.LongestMatchingScore == sr2.LongestMatchingScore) { | |||||
return true; | |||||
} | |||||
} | |||||
return false; | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]"; | |||||
// return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]"; | |||||
} | |||||
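// Rescale this relation's entity and predicate mapping scores into (0,1]
// by dividing each list by its own maximum score.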
public void normalizeScore() | |||||
{ | |||||
double maxScore; | |||||
if (arg1Word.emList!=null && !arg1Word.emList.isEmpty()) | |||||
{ | |||||
maxScore=0.0; | |||||
for (EntityMapping em : arg1Word.emList) | |||||
maxScore = Math.max(maxScore, em.score); | |||||
for (EntityMapping em : arg1Word.emList) | |||||
em.score = em.score/maxScore; | |||||
} | |||||
if (arg2Word.emList!=null && !arg2Word.emList.isEmpty()) | |||||
{ | |||||
maxScore=0.0; | |||||
for (EntityMapping em : arg2Word.emList) | |||||
maxScore = Math.max(maxScore, em.score); | |||||
for (EntityMapping em : arg2Word.emList) | |||||
em.score = em.score/maxScore; | |||||
} | |||||
if (predicateMappings!=null && !predicateMappings.isEmpty()) | |||||
{ | |||||
maxScore=0.0; | |||||
for (PredicateMapping pm : predicateMappings) | |||||
maxScore = Math.max(maxScore, pm.score); | |||||
for (PredicateMapping pm : predicateMappings) | |||||
pm.score = pm.score/maxScore; | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,61 @@ | |||||
package rdf; | |||||
import java.util.ArrayList; | |||||
import java.util.HashMap; | |||||
import rdf.SemanticRelation; | |||||
import nlp.ds.DependencyTreeNode; | |||||
import nlp.ds.Word; | |||||
public class SemanticUnit | |||||
{ | |||||
public Word centerWord = null; | |||||
public ArrayList<DependencyTreeNode> describeNodeList = new ArrayList<DependencyTreeNode>(); | |||||
public ArrayList<SemanticUnit> neighborUnitList = new ArrayList<SemanticUnit>(); | |||||
public HashMap<Word, SemanticRelation> RelationList = new HashMap<Word, SemanticRelation>(); | |||||
public boolean isSubj = true; | |||||
public Integer prefferdType = null; | |||||
public SemanticUnit(Word center, boolean isSubJ) | |||||
{ | |||||
centerWord = center; | |||||
isSubj = isSubJ; | |||||
} | |||||
public SemanticUnit copy() | |||||
{ | |||||
SemanticUnit su = new SemanticUnit(this.centerWord, this.isSubj); | |||||
su.describeNodeList = (ArrayList<DependencyTreeNode>) this.describeNodeList.clone(); | |||||
su.neighborUnitList = (ArrayList<SemanticUnit>) this.neighborUnitList.clone(); | |||||
su.RelationList = (HashMap<Word, SemanticRelation>) this.RelationList.clone(); | |||||
return su; | |||||
} | |||||
@Override | |||||
public int hashCode() { | |||||
return centerWord.hashCode(); | |||||
} | |||||
@Override | |||||
public boolean equals(Object o) { | |||||
if (o instanceof SemanticUnit) { | |||||
SemanticUnit su2 = (SemanticUnit) o; | |||||
if(this.centerWord.equals(su2.centerWord)) | |||||
return true; | |||||
} | |||||
return false; | |||||
} | |||||
@Override | |||||
public String toString() | |||||
{ | |||||
String ret = "<" + centerWord + ", {"; | |||||
for(SemanticUnit su: neighborUnitList) | |||||
ret += su.centerWord + ", "; | |||||
ret += "}>"; | |||||
return ret; | |||||
} | |||||
} |
@@ -0,0 +1,88 @@ | |||||
package rdf; | |||||
import java.util.ArrayList; | |||||
import java.util.HashMap; | |||||
import paradict.PredicateIDAndSupport; | |||||
import qa.Globals; | |||||
import nlp.ds.DependencyTree; | |||||
import nlp.ds.DependencyTreeNode; | |||||
import nlp.ds.Word; | |||||
// allow repetition | |||||
public class SimpleRelation { | |||||
public Word arg1Word = null; | |||||
public Word arg2Word = null; | |||||
public String relationParaphrase = null; | |||||
public double matchingScore = 0; | |||||
public Word arg1Word_beforeCRR = null; | |||||
public Word arg2Word_beforeCRR = null; | |||||
public HashMap<Integer, Double> pasList = new HashMap<Integer, Double>(); | |||||
public Word preferredSubj = null; | |||||
public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules | |||||
public SimpleRelation() | |||||
{ | |||||
} | |||||
public SimpleRelation(SimpleRelation sr) | |||||
{ | |||||
arg1Word = sr.arg1Word; | |||||
arg2Word = sr.arg2Word; | |||||
relationParaphrase = sr.relationParaphrase; | |||||
matchingScore = sr.matchingScore; | |||||
arg1Word_beforeCRR = sr.arg1Word_beforeCRR; | |||||
arg2Word_beforeCRR = sr.arg2Word_beforeCRR; | |||||
pasList = sr.pasList; | |||||
preferredSubj = sr.preferredSubj; | |||||
extractingMethod = 'R'; | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]"; | |||||
//return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]"; | |||||
} | |||||
public int getHashCode() { | |||||
return arg1Word.hashCode() ^ arg2Word.hashCode(); | |||||
} | |||||
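// For every predicate supported by this paraphrase pattern, accumulate
// score = matchingScore * (sum of selectivities of the matched words) * support,
// keeping the maximum per predicate id in pasList.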
public void setPasList (String pattern, double matchingScore, boolean[] matchedFlag) { | |||||
ArrayList<PredicateIDAndSupport> list = Globals.pd.nlPattern_2_predicateList.get(pattern); | |||||
for (PredicateIDAndSupport pidsup : list) { | |||||
double sumSelectivity = 0; | |||||
for (int i = 0; i < matchedFlag.length; i ++) { | |||||
if (matchedFlag[i]) { | |||||
sumSelectivity += pidsup.wordSelectivity[i]; | |||||
} | |||||
} | |||||
sumSelectivity = matchingScore*sumSelectivity*pidsup.support; | |||||
int pid = pidsup.predicateID; | |||||
if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; // boost predicates in the dbo namespace (a pid may not be in dbo)
if (!pasList.containsKey(pid)) | |||||
pasList.put(pid, sumSelectivity); | |||||
else if (sumSelectivity > pasList.get(pid)) | |||||
pasList.put(pid, sumSelectivity); | |||||
} | |||||
} | |||||
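// Prefer as subject the argument that is the prepositional object of "of"
// (dep = pobj under an "of" node) in the given dependency tree's NN-head form.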
public void setPreferedSubjObjOrder(DependencyTree tree) { | |||||
DependencyTreeNode n1 = tree.getNodeByIndex(this.arg1Word.position).getNNTopTreeNode(tree); | |||||
DependencyTreeNode n2 = tree.getNodeByIndex(this.arg2Word.position).getNNTopTreeNode(tree); | |||||
if (n1.father != null && n1.father.word.baseForm.equals("of") && n1.dep_father2child.equals("pobj")) { | |||||
this.preferredSubj = this.arg1Word; | |||||
} | |||||
else if (n2.father != null && n2.father.word.baseForm.equals("of") && n2.dep_father2child.equals("pobj")) { | |||||
this.preferredSubj = this.arg2Word; | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,305 @@ | |||||
package rdf; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
//import java.util.Comparator; | |||||
import java.util.HashMap; | |||||
import java.util.HashSet; | |||||
import log.QueryLogger; | |||||
import nlp.ds.Sentence; | |||||
import nlp.ds.Sentence.SentenceType; | |||||
import qa.Globals; | |||||
public class Sparql implements Comparable<Sparql> | |||||
{ | |||||
public ArrayList<Triple> tripleList = new ArrayList<Triple>(); | |||||
public boolean countTarget = false; | |||||
public String mostStr = null; | |||||
public String moreThanStr = null; | |||||
public double score = 0; | |||||
public String questionFocus = null; // The answer variable | |||||
public HashSet<String> variables = new HashSet<String>(); | |||||
public enum QueryType {Select,Ask} | |||||
public QueryType queryType = QueryType.Select; | |||||
public HashMap<Integer, SemanticRelation> semanticRelations = null; | |||||
public void addTriple(Triple t) | |||||
{ | |||||
if(!tripleList.contains(t)) | |||||
{ | |||||
tripleList.add(t); | |||||
score += t.score; | |||||
} | |||||
} | |||||
public void delTriple(Triple t) | |||||
{ | |||||
if(tripleList.contains(t)) | |||||
{ | |||||
tripleList.remove(t); | |||||
score -= t.score; | |||||
} | |||||
} | |||||
@Override | |||||
public String toString() | |||||
{ | |||||
String ret = ""; | |||||
for (Triple t : tripleList) { | |||||
ret += t.toString(); | |||||
ret += '\n'; | |||||
} | |||||
return ret; | |||||
} | |||||
public void deduplicate() | |||||
{ | |||||
HashSet<String> set = new HashSet<String>(); | |||||
ArrayList<Triple> list = new ArrayList<Triple>(); | |||||
for(Triple t: tripleList) | |||||
{ | |||||
String st = t.toStringWithoutScore(); | |||||
if(set.contains(st)) | |||||
list.add(t); | |||||
set.add(st); | |||||
} | |||||
for(Triple t: list) | |||||
this.delTriple(t); | |||||
} | |||||
// Is it a Basic Graph Pattern without filter and aggregation? | |||||
public boolean isBGP() | |||||
{ | |||||
if(moreThanStr != null || mostStr != null || countTarget) | |||||
return false; | |||||
return true; | |||||
} | |||||
//Used for display (cannot be executed)
public String toStringForGStore() | |||||
{ | |||||
String ret = ""; | |||||
for (Triple t : tripleList) | |||||
{ | |||||
// !Omit obvious LITERAL | |||||
if(t.object.equals("literal_HRZ")) | |||||
continue; | |||||
// !Omit some bad TYPEs | |||||
if(t.predicateID==Globals.pd.typePredicateID && Globals.pd.bannedTypes.contains(t.object)) | |||||
continue; | |||||
ret += t.toStringForGStore(); | |||||
ret += '\n'; | |||||
} | |||||
return ret; | |||||
} | |||||
/** | |||||
* @description:
* 1. Select all variables for BGP queries, so specific information can be displayed.
* 2. Do NOT select all variables when aggregations such as "HAVING" or "COUNT" are involved.
* (They may involve too many results; e.g., for "Which countries have more than 1000 caves?" the caves need not be displayed.)
* @param: NULL.
* @return: A SPARQL query that can be executed by GStore (NO prefix for entities/predicates).
*/ | |||||
public String toStringForGStore2() | |||||
{ | |||||
String ret = ""; | |||||
variables.clear(); | |||||
for(Triple t: tripleList) | |||||
{ | |||||
if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_")); | |||||
if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_")); | |||||
} | |||||
if(variables.size() == 0) | |||||
queryType = QueryType.Ask; | |||||
// part1: select / ask ... | |||||
if (queryType==QueryType.Ask) | |||||
ret += "ask"; | |||||
else if(countTarget) | |||||
ret += ("select COUNT(DISTINCT " + questionFocus + ")"); | |||||
else | |||||
{ | |||||
if(!isBGP()) // AGG: select question focus | |||||
ret += ("select DISTINCT " + questionFocus); | |||||
else // BGP: select all variables | |||||
{ | |||||
ret += "select DISTINCT "; | |||||
for (String v : variables) | |||||
ret += v + " "; | |||||
} | |||||
} | |||||
// part2: triples | |||||
ret += " where\n{\n"; | |||||
for(Triple t : tripleList) | |||||
{ | |||||
if (!t.object.equals("literal_HRZ")) { // need not display literal | |||||
ret += t.toStringForGStore(); | |||||
ret += " .\n"; | |||||
} | |||||
} | |||||
ret += "}\n"; | |||||
// part3: order by / group by ... | |||||
if(moreThanStr != null) | |||||
ret += moreThanStr+"\n"; | |||||
if(mostStr != null) | |||||
ret += mostStr+"\n"; | |||||
// part4: limit | |||||
if(queryType != QueryType.Ask && (mostStr == null || !mostStr.contains("LIMIT"))) | |||||
ret += "LIMIT " + Globals.MaxAnswerNum; | |||||
return ret; | |||||
} | |||||
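// A minimal usage sketch (hypothetical predicate id; assumes Globals.pd is initialized):
//   Sparql spq = new Sparql();
//   spq.questionFocus = "?uri";
//   spq.addTriple(new Triple(Triple.VAR_ROLE_ID, "?uri", subjectPid, Triple.CAT_ROLE_ID, "Thirteen_Colonies", null, 100));
//   String query = spq.toStringForGStore2(); // e.g., "select DISTINCT ?uri  where\n{ ... }\nLIMIT ..."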
// Used for execution (selects all variables; formats 'aggregation' and 'ask').
public String toStringForVirtuoso() | |||||
{ | |||||
String ret = ""; | |||||
HashSet<String> variables = new HashSet<String>(); | |||||
// prefix | |||||
if (queryType==QueryType.Ask) | |||||
ret += "ask where"; | |||||
else if(countTarget) | |||||
ret += ("select COUNT(DISTINCT " + questionFocus + ") where"); | |||||
else | |||||
{ | |||||
// AGG: select question focus | |||||
if(moreThanStr != null || mostStr != null) | |||||
ret += ("select DISTINCT " + questionFocus + " where"); | |||||
// BGP: select all variables | |||||
else | |||||
{ | |||||
for (Triple t: tripleList) | |||||
{ | |||||
if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_")); | |||||
if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_")); | |||||
} | |||||
ret += "select "; | |||||
for (String v : variables) | |||||
ret += v + " "; | |||||
ret += "where"; | |||||
} | |||||
} | |||||
ret += "\n{\n"; | |||||
if(variables.size() == 0) | |||||
variables.add(questionFocus); | |||||
// triples | |||||
for (Triple t : tripleList) | |||||
{ | |||||
if (!t.object.equals("literal_HRZ")) { | |||||
ret += t.toStringForGStore(); | |||||
ret += " .\n"; | |||||
} | |||||
} | |||||
ret += "}\n"; | |||||
// suffix | |||||
if(moreThanStr != null) | |||||
{ | |||||
ret += moreThanStr+"\n"; | |||||
} | |||||
if(mostStr != null) | |||||
{ | |||||
ret += mostStr+"\n"; | |||||
} | |||||
return ret; | |||||
} | |||||
public int getVariableNumber() | |||||
{ | |||||
int res = 0; | |||||
for (Triple t: tripleList) | |||||
{ | |||||
if (!t.isSubjConstant()) res++; | |||||
if (!t.isObjConstant()) res++; | |||||
} | |||||
return res; | |||||
} | |||||
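// Sort triples by Triple.compareTo: type triples first, then triples with more constant arguments.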
public void adjustTriplesOrder() | |||||
{ | |||||
Collections.sort(this.tripleList); | |||||
} | |||||
public int compareTo(Sparql o) | |||||
{ | |||||
double diff = this.score - o.score; | |||||
if (diff > 0) | |||||
return -1; | |||||
else if (diff < 0) | |||||
return 1; | |||||
else | |||||
return 0; | |||||
} | |||||
@Override | |||||
public int hashCode() | |||||
{ | |||||
int key = 0; | |||||
for(Triple t: this.tripleList) | |||||
key ^= t.hashCode(); | |||||
return key; | |||||
} | |||||
@Override
public boolean equals(Object o)
{
if (this == o)
return true;
if (!(o instanceof Sparql))
return false;
return this.toStringForGStore2().equals(((Sparql) o).toStringForGStore2());
}
public Sparql(){} | |||||
public Sparql(HashMap<Integer, SemanticRelation> semanticRelations) | |||||
{ | |||||
this.semanticRelations = semanticRelations; | |||||
} | |||||
public Sparql copy() | |||||
{ | |||||
Sparql spq = new Sparql(this.semanticRelations); | |||||
for (Triple t : this.tripleList) | |||||
spq.addTriple(t); | |||||
return spq; | |||||
} | |||||
public void removeLastTriple() | |||||
{ | |||||
int idx = tripleList.size()-1; | |||||
score -= tripleList.get(idx).score; | |||||
tripleList.remove(idx); | |||||
} | |||||
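// Drop every rdf:type triple and recompute the score from the remaining triples.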
public Sparql removeAllTypeInfo () | |||||
{ | |||||
score = 0; | |||||
ArrayList<Triple> newTripleList = new ArrayList<Triple>(); | |||||
for (Triple t : tripleList) | |||||
{ | |||||
if (t.predicateID != Globals.pd.typePredicateID) | |||||
{ | |||||
newTripleList.add(t); | |||||
score += t.score; | |||||
} | |||||
} | |||||
tripleList = newTripleList; | |||||
return this; | |||||
} | |||||
}
@@ -0,0 +1,257 @@ | |||||
package rdf; | |||||
import nlp.ds.Word; | |||||
import qa.Globals; | |||||
public class Triple implements Comparable<Triple>{ | |||||
public String subject = null; // subject/object after disambiguation. | |||||
public String object = null; | |||||
static public int TYPE_ROLE_ID = -5; | |||||
static public int VAR_ROLE_ID = -2; | |||||
static public int CAT_ROLE_ID = -8; // Category | |||||
static public String VAR_NAME = "?xxx"; | |||||
// subjId/objId: entity id | TYPE_ROLE_ID | VAR_ROLE_ID | |||||
public int subjId = -1; | |||||
public int objId = -1; | |||||
public int predicateID = -1; | |||||
public Word subjWord = null; // only used when semRltn == null
public Word objWord = null; | |||||
public SemanticRelation semRltn = null; | |||||
public double score = 0; | |||||
public boolean isSubjObjOrderSameWithSemRltn = true; | |||||
public boolean isSubjObjOrderPrefered = false; | |||||
public Word typeSubjectWord = null; // for "type" triples only | |||||
public Triple (Triple t) { | |||||
subject = t.subject; | |||||
object = t.object; | |||||
subjId = t.subjId; | |||||
objId = t.objId; | |||||
predicateID = t.predicateID; | |||||
semRltn = t.semRltn; | |||||
score = t.score; | |||||
isSubjObjOrderSameWithSemRltn = t.isSubjObjOrderSameWithSemRltn; | |||||
isSubjObjOrderPrefered = t.isSubjObjOrderPrefered; | |||||
} | |||||
// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); built from one word (type variable | embedded info)
public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco) { | |||||
subjId = sId; | |||||
objId = oId; | |||||
subject = s; | |||||
predicateID = p; | |||||
object = o; | |||||
semRltn = sr; | |||||
score = sco; | |||||
} | |||||
// A triple translated from a semantic relation (subject/object order can be changed later)
public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, boolean isSwap) { | |||||
subjId = sId; | |||||
objId = oId; | |||||
subject = s; | |||||
predicateID = p; | |||||
object = o; | |||||
semRltn = sr; | |||||
score = sco; | |||||
isSubjObjOrderSameWithSemRltn = isSwap; | |||||
} | |||||
// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); built from two words (implicit relation of a modifier)
public Triple(int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, Word subj, Word obj) { | |||||
subjId = sId; | |||||
objId = oId; | |||||
subject = s; | |||||
predicateID = p; | |||||
object = o; | |||||
semRltn = sr; | |||||
score = sco; | |||||
subjWord = subj; | |||||
objWord = obj; | |||||
} | |||||
public Triple copy() { | |||||
Triple t = new Triple(this); | |||||
return t; | |||||
} | |||||
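// Return a copy with subject/object swapped and both order flags flipped accordingly.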
public Triple copySwap() { | |||||
Triple t = new Triple(this); | |||||
String temp; | |||||
int tmpId; | |||||
tmpId = t.subjId; | |||||
t.subjId = t.objId; | |||||
t.objId = tmpId; | |||||
temp = t.subject; | |||||
t.subject = t.object; | |||||
t.object = temp; | |||||
t.isSubjObjOrderSameWithSemRltn = !this.isSubjObjOrderSameWithSemRltn; | |||||
t.isSubjObjOrderPrefered = !this.isSubjObjOrderPrefered; | |||||
return t; | |||||
} | |||||
public void addScore(double s) { | |||||
score += s; | |||||
} | |||||
public double getScore() { | |||||
return score; | |||||
} | |||||
@Override | |||||
public int hashCode() | |||||
{ | |||||
return Integer.hashCode(subjId) ^ Integer.hashCode(objId) ^ Integer.hashCode(predicateID);
} | |||||
@Override | |||||
public String toString() { | |||||
return subjId+":<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> "+objId+":<" + object + ">" + " : " + score; | |||||
} | |||||
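// Serialize for GStore: variables stay bare, constants are wrapped in <>;
// for type triples, strip any "|..." suffix from the object; spaces become underscores.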
public String toStringForGStore() { | |||||
StringBuilder sb = new StringBuilder(""); | |||||
String _subject = subject; | |||||
if(_subject.startsWith("?")) | |||||
sb.append(_subject+"\t"); | |||||
else | |||||
sb.append("<" + _subject + ">\t"); | |||||
sb.append("<" + Globals.pd.getPredicateById(predicateID) + ">\t"); | |||||
String _object; | |||||
if(predicateID == Globals.pd.typePredicateID && object.contains("|")) | |||||
_object = object.substring(0, object.indexOf('|')); | |||||
else | |||||
_object = object; | |||||
if(_object.startsWith("?")) | |||||
sb.append(_object); | |||||
else | |||||
sb.append("<" + _object + ">"); | |||||
return sb.toString().replace(' ', '_'); | |||||
} | |||||
public String toStringWithoutScore() { | |||||
return "<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> <" + object + ">"; | |||||
} | |||||
public Word getSubjectWord () { | |||||
if (predicateID == Globals.pd.typePredicateID) { | |||||
return typeSubjectWord; | |||||
} | |||||
else if(semRltn == null) | |||||
{ | |||||
return subjWord; | |||||
} | |||||
else { | |||||
if (isSubjObjOrderSameWithSemRltn) return semRltn.arg1Word; | |||||
else return semRltn.arg2Word; | |||||
} | |||||
} | |||||
public Word getObjectWord () { | |||||
if (predicateID == Globals.pd.typePredicateID) { | |||||
return typeSubjectWord; | |||||
} | |||||
else if(semRltn == null) | |||||
{ | |||||
return objWord; | |||||
} | |||||
else { | |||||
if (isSubjObjOrderSameWithSemRltn) return semRltn.arg2Word; | |||||
else return semRltn.arg1Word; | |||||
} | |||||
} | |||||
public boolean isSubjConstant () { | |||||
if (predicateID == Globals.pd.typePredicateID) { | |||||
return !subject.startsWith("?"); | |||||
} | |||||
else { | |||||
// Triple from semantic (obvious) relation | |||||
if(semRltn != null) | |||||
{ | |||||
if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg1Constant; | |||||
else return semRltn.isArg2Constant; | |||||
} | |||||
// Triple from an implicit relation (no semantic relation); it is a final triple
else
{
return subjId != Triple.VAR_ROLE_ID && subjId != Triple.TYPE_ROLE_ID;
}
} | |||||
} | |||||
public boolean isObjConstant () { | |||||
if (predicateID == Globals.pd.typePredicateID) { | |||||
return !object.startsWith("?"); | |||||
} | |||||
else { | |||||
if(semRltn != null) | |||||
{ | |||||
if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg2Constant; | |||||
else return semRltn.isArg1Constant; | |||||
} | |||||
else
{
return objId != Triple.VAR_ROLE_ID && objId != Triple.TYPE_ROLE_ID;
}
} | |||||
} | |||||
public int compareTo(Triple o) | |||||
{ | |||||
// Order: Type, Ent&Ent, Ent&Var, Var&Var
if(this.predicateID == Globals.pd.typePredicateID)
{
if(o.predicateID == Globals.pd.typePredicateID)
return 0;
else
return -1;
}
else if(o.predicateID == Globals.pd.typePredicateID)
return 1; // type triples always come first
int cnt1 = 0, cnt2 = 0; | |||||
if(!this.subject.startsWith("?")) | |||||
cnt1++; | |||||
if(!this.object.startsWith("?")) | |||||
cnt1++; | |||||
if(!o.subject.startsWith("?")) | |||||
cnt2++; | |||||
if(!o.object.startsWith("?")) | |||||
cnt2++; | |||||
if(cnt1 == cnt2) | |||||
return 0; | |||||
else if(cnt1 > cnt2) | |||||
return -1; | |||||
else | |||||
return 1; | |||||
} | |||||
public void swapSubjObjOrder() { | |||||
String temp = subject; | |||||
int tmpId = subjId; | |||||
subject = object; | |||||
subjId = objId; | |||||
object = temp; | |||||
objId = tmpId; | |||||
isSubjObjOrderSameWithSemRltn = !isSubjObjOrderSameWithSemRltn; | |||||
} | |||||
}
@@ -0,0 +1,53 @@ | |||||
package rdf; | |||||
import qa.Globals; | |||||
public class TypeMapping implements Comparable<TypeMapping> | |||||
{ | |||||
public Integer typeID = null; | |||||
public String typeName = null; | |||||
public double score = 0; | |||||
/*
* 1. For a standard type (a DBO type in DBpedia), relation = typePredicateID (rdf:type)
* 2. For a nonstandard type, typeID = -1
* 3. If the type must be added into triples, a relation is needed | e.g., "Which professional surfers were born in Australia?" (?uri dbo:occupation res:Surfing) relation = dbo:occupation
* 4. If no type needs to be added, relation = -1 | e.g., "Who was the father of [Queen] Elizabeth II?"
* */
public int prefferdRelation = Globals.pd.typePredicateID; | |||||
public TypeMapping(Integer tid, String type, double sco) | |||||
{ | |||||
typeID = tid; | |||||
typeName = type; | |||||
score = sco; | |||||
} | |||||
public TypeMapping(Integer tid, String type, Integer relation, double sco) | |||||
{ | |||||
typeID = tid; | |||||
typeName = type.replace("_", ""); | |||||
score = sco; | |||||
prefferdRelation = relation; | |||||
} | |||||
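// Example (hypothetical predicate id): new TypeMapping(-1, "Surfer", occupationPredicateID, 1.0)
// encodes the nonstandard type of case 3 above.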
// In descending order: big --> small | |||||
public int compareTo(TypeMapping o) | |||||
{ | |||||
double diff = this.score - o.score; | |||||
if (diff > 0) return -1; | |||||
else if (diff < 0) return 1; | |||||
else return 0; | |||||
} | |||||
public int hashCode() | |||||
{ | |||||
return typeID.hashCode(); | |||||
} | |||||
public String toString()
{
return typeName + "(" + score + ")";
}
} |
@@ -0,0 +1,91 @@ | |||||
package utils; | |||||
import java.io.*; | |||||
import java.util.ArrayList; | |||||
import java.util.HashSet; | |||||
import java.util.List; | |||||
import java.util.Set; | |||||
public class FileUtil { | |||||
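// Minimal usage sketch (hypothetical paths):
//   List<String> lines = FileUtil.readFile("input.txt");
//   FileUtil.writeFile(lines, "output.txt");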
public static List<String> readFile(String filePath){
List<String> lines = new ArrayList<String>();
// try-with-resources closes the reader even when an exception is thrown;
// returning from a finally block (as before) would silently swallow exceptions.
try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
String line = null;
while( (line = br.readLine()) != null ){
lines.add(line);
}
}catch(Exception e){
e.printStackTrace();
}
return lines;
}
public static Set<String> readFileAsSet(String filePath){
Set<String> lines = new HashSet<String>();
try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
String line = null;
while( (line = br.readLine()) != null ){
lines.add(line);
}
}catch(Exception e){
e.printStackTrace();
}
return lines;
}
public static List<String> readFile(InputStream is){
List<String> lines = new ArrayList<String>();
try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
String line = null;
while( (line = br.readLine()) != null ){
lines.add(line);
}
}catch(Exception e){
e.printStackTrace();
}
return lines;
}
public static String readFileAsALine(InputStream is){
List<String> lines = readFile(is);
StringBuilder buffer = new StringBuilder();
for(String line : lines){
buffer.append(line);
}
return buffer.toString();
}
public static void writeFile(List<String> lines, String filePath){ | |||||
try{ | |||||
BufferedWriter bw = new BufferedWriter(new FileWriter(filePath)); | |||||
for(String line : lines){ | |||||
bw.write(line+"\n"); | |||||
} | |||||
bw.close(); | |||||
}catch(Exception e){ | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
public static void writeFile(List<String> lines, String filePath, boolean ifContinueWrite){ | |||||
try{ | |||||
BufferedWriter bw = new BufferedWriter(new FileWriter(filePath, ifContinueWrite)); | |||||
for(String line : lines){ | |||||
bw.write(line+"\n"); | |||||
} | |||||
bw.close(); | |||||
}catch(Exception e){ | |||||
e.printStackTrace(); | |||||
} | |||||
} | |||||
} |
@@ -0,0 +1,114 @@ | |||||
package utils; | |||||
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException; | |||||
import java.io.InputStream; | |||||
import java.io.InputStreamReader; | |||||
import java.io.PrintWriter; | |||||
import java.net.URL; | |||||
import java.net.URLConnection; | |||||
import java.util.List; | |||||
import java.util.Map; | |||||
public class HttpRequest | |||||
{ | |||||
public static String sendGet(String url, String param) { | |||||
String result = ""; | |||||
BufferedReader in = null; | |||||
try { | |||||
String urlNameString = url + "?" + param; | |||||
URL realUrl = new URL(urlNameString); | |||||
URLConnection connection = realUrl.openConnection(); | |||||
connection.setRequestProperty("accept", "*/*"); | |||||
connection.setRequestProperty("connection", "Keep-Alive"); | |||||
connection.setRequestProperty("user-agent", | |||||
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); | |||||
connection.connect(); | |||||
Map<String, List<String>> map = connection.getHeaderFields(); | |||||
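// Debug: print each response header and its values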
for (String key : map.keySet()) { | |||||
System.out.println(key + "--->" + map.get(key)); | |||||
} | |||||
in = new BufferedReader(new InputStreamReader( | |||||
connection.getInputStream())); | |||||
String line; | |||||
while ((line = in.readLine()) != null) { | |||||
result += line; | |||||
} | |||||
} catch (Exception e) { | |||||
System.out.println("Error when sending GET request: " + e); | |||||
e.printStackTrace(); | |||||
} | |||||
finally { | |||||
try { | |||||
if (in != null) { | |||||
in.close(); | |||||
} | |||||
} catch (Exception e2) { | |||||
e2.printStackTrace(); | |||||
} | |||||
} | |||||
return result; | |||||
} | |||||
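// Minimal usage sketch (hypothetical endpoint and parameters):
//   String body = HttpRequest.sendGet("http://localhost:8080/api", "q=test");
//   String resp = HttpRequest.sendPost("http://localhost:8080/api", "q=test");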
public static String sendPost(String url, String param) { | |||||
PrintWriter out = null; | |||||
BufferedReader in = null; | |||||
String result = ""; | |||||
try { | |||||
URL realUrl = new URL(url); | |||||
URLConnection conn = realUrl.openConnection(); | |||||
conn.setRequestProperty("accept", "*/*"); | |||||
conn.setRequestProperty("connection", "Keep-Alive"); | |||||
conn.setRequestProperty("user-agent", | |||||
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); | |||||
conn.setDoOutput(true); | |||||
conn.setDoInput(true); | |||||
out = new PrintWriter(conn.getOutputStream()); | |||||
out.print(param); | |||||
out.flush(); | |||||
in = new BufferedReader( | |||||
new InputStreamReader(conn.getInputStream())); | |||||
String line; | |||||
while ((line = in.readLine()) != null) { | |||||
result += line; | |||||
} | |||||
} catch (Exception e) { | |||||
System.out.println("Error when sending POST request: "+e); | |||||
e.printStackTrace(); | |||||
} | |||||
finally{ | |||||
try{ | |||||
if(out!=null){ | |||||
out.close(); | |||||
} | |||||
if(in!=null){ | |||||
in.close(); | |||||
} | |||||
} | |||||
catch(IOException ex){ | |||||
ex.printStackTrace(); | |||||
} | |||||
} | |||||
return result; | |||||
} | |||||
public static String getPostData(InputStream in, int size, String charset) {
if (in != null && size > 0) {
byte[] buf = new byte[size];
try {
// InputStream.read() may return fewer bytes than requested;
// readFully loops until the whole buffer is filled.
new DataInputStream(in).readFully(buf);
if (charset == null || charset.length() == 0)
return new String(buf);
else {
return new String(buf, charset);
}
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
} |