@@ -0,0 +1,238 @@ | |||
package addition; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import paradict.PredicateIDAndSupport; | |||
import log.QueryLogger; | |||
//import nlp.ds.DependencyTree; | |||
//import nlp.ds.DependencyTreeNode; | |||
import nlp.ds.Word; | |||
import nlp.ds.Sentence.SentenceType; | |||
import qa.Globals; | |||
//import qa.extract.TypeRecognition; | |||
//import qa.mapping.SemanticItemMapping; | |||
//import rdf.EntityMapping; | |||
import rdf.SemanticUnit; | |||
import rdf.Sparql; | |||
import rdf.Sparql.QueryType; | |||
import rdf.Triple; | |||
//import fgmt.TypeFragment; | |||
public class AddtionalFix | |||
{ | |||
public HashMap<String, String> pattern2category = new HashMap<String, String>(); | |||
public AddtionalFix() | |||
{ | |||
		// Some hand-written category mappings (keyed by base form) for DBpedia; try automatic linking methods later.
pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters"); | |||
pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World"); | |||
pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus"); | |||
pattern2category.put("13_british_colony", "Thirteen_Colonies"); | |||
} | |||
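	/**
	 * Entry point of the additional fixes: category patching, one-triple and
	 * one-node handling, aggregation recognition, and query-type decision.
	 */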
public void process(QueryLogger qlog) | |||
{ | |||
fixCategory(qlog); | |||
oneTriple(qlog); | |||
oneNode(qlog); | |||
//aggregation | |||
AggregationRecognition ar = new AggregationRecognition(); | |||
ar.recognize(qlog); | |||
//query type | |||
decideQueryType(qlog); | |||
} | |||
public void decideQueryType(QueryLogger qlog) | |||
{ | |||
for(Sparql spq: qlog.rankedSparqls) | |||
if(qlog.s.sentenceType == SentenceType.GeneralQuestion) | |||
spq.queryType = QueryType.Ask; | |||
} | |||
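	// If a node was recognized as a CATEGORY, append a "subject" triple binding its variable to that category in each ranked SPARQL.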
public void fixCategory(QueryLogger qlog) | |||
{ | |||
if(qlog == null || qlog.semanticUnitList == null) | |||
return; | |||
String var = null, category = null; | |||
for(SemanticUnit su: qlog.semanticUnitList) | |||
{ | |||
if(su.centerWord.mayCategory) | |||
{ | |||
var = "?"+su.centerWord.originalForm; | |||
category = su.centerWord.category; | |||
} | |||
} | |||
if(category != null && var != null) | |||
for(Sparql spq: qlog.rankedSparqls) | |||
{ | |||
				boolean occurred = false;
				for(Triple tri: spq.tripleList)
				{
					if(tri.subject.equals(var))
					{
						occurred = true;
						break;
					}
				}
				// Note: 'occurred' is computed but currently not used to guard the triple insertion below.
String oName = category; | |||
String pName = "subject"; | |||
int pid = Globals.pd.predicate_2_id.get(pName); | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100); | |||
spq.addTriple(triple); | |||
} | |||
} | |||
	/* Recognize one-node queries.
	 * Two cases: 1. Special question | Imperative sentence; 2. General question.
	 * 1-1: how many [], highest [] ... | For a single variable, add a constraint (aggregation).
	 * 1-2: What is backgammon? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...).
	 * 1-3: Give me all Seven Wonders of the Ancient World. | Notice, "Seven Wonders of the Ancient World" should be recognized as ENT before. (in fact it is CATEGORY in DBpedia)
	 * 2-1: Are there any [castles_in_the_United_States] (yago:type)?
	 * 2-2: Was Sigmund Freud married? | Lacks a variable node.
	 * 2-3: Are penguins endangered? | No suitable relation matching; transition needed.
	 */
public void oneNode(QueryLogger qlog) | |||
{ | |||
if(qlog == null || qlog.semanticUnitList == null || qlog.semanticUnitList.size()>1) | |||
return; | |||
Word target = qlog.target; | |||
Word[] words = qlog.s.words; | |||
if(qlog.s.sentenceType != SentenceType.GeneralQuestion) | |||
{ | |||
//1-1: how many [type] are there | List all [type] | |||
if(target.mayType && target.tmList != null) | |||
{ | |||
String subName = "?"+target.originalForm; | |||
String typeName = target.tmList.get(0).typeName; | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
//1-2: What is [ent]? | |||
else if(target.mayEnt && target.emList != null) | |||
{ | |||
if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be")) | |||
{ | |||
int eid = target.emList.get(0).entityID; | |||
String subName = target.emList.get(0).entityName; | |||
Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+target.originalForm, null, target.emList.get(0).score); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
//1-3: Give me all Seven Wonders of the Ancient World. | |||
else if(target.mayCategory && target.category != null) | |||
{ | |||
String oName = target.category; | |||
String pName = "subject"; | |||
int pid = Globals.pd.predicate_2_id.get(pName); | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
else | |||
{ | |||
if(target.mayEnt && target.emList != null) | |||
{ | |||
//2-2:Was Sigmund Freud married? | |||
String relMention = ""; | |||
for(Word word: words) | |||
if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?")) | |||
relMention += word.baseForm+" "; | |||
if(relMention.length() > 1) | |||
relMention = relMention.substring(0, relMention.length()-1); | |||
ArrayList<PredicateIDAndSupport> pmList = null; | |||
if(Globals.pd.nlPattern_2_predicateList.containsKey(relMention)) | |||
pmList = Globals.pd.nlPattern_2_predicateList.get(relMention); | |||
if(pmList != null && pmList.size() > 0) | |||
{ | |||
int pid = pmList.get(0).predicateID; | |||
int eid = target.emList.get(0).entityID; | |||
String subName = target.emList.get(0).entityName; | |||
Triple triple = new Triple(eid, subName, pid, Triple.VAR_ROLE_ID, "?x", null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
//2-3:Are penguins endangered? | |||
else | |||
{ | |||
if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm)) | |||
{ | |||
String oName = pattern2category.get(words[target.position].baseForm); | |||
String pName = "subject"; | |||
int pid = Globals.pd.predicate_2_id.get(pName); | |||
int eid = target.emList.get(0).entityID; | |||
String subName = target.emList.get(0).entityName; | |||
Triple triple = new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
} | |||
//2-1: Are there any [castles_in_the_United_States](yago:type) | |||
else if(target.mayType && target.tmList != null) | |||
{ | |||
String typeName = target.tmList.get(0).typeName; | |||
String subName = "?" + target.originalForm; | |||
//System.out.println("typeName="+typeName+" subName="+subName); | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
} | |||
/* | |||
* One triple recognized but no suitable relation. | |||
* */ | |||
public void oneTriple (QueryLogger qlog) | |||
{ | |||
if(qlog == null || qlog.semanticUnitList == null) | |||
return; | |||
if(qlog.s.sentenceType == SentenceType.SpecialQuestion) | |||
{ | |||
Word[] words = qlog.s.words; | |||
if(qlog.semanticUnitList.size() == 2) | |||
{ | |||
Word entWord = null, whWord = null; | |||
for(int i=0;i<qlog.semanticUnitList.size();i++) | |||
{ | |||
if(qlog.semanticUnitList.get(i).centerWord.baseForm.startsWith("wh")) | |||
whWord = qlog.semanticUnitList.get(i).centerWord; | |||
if(qlog.semanticUnitList.get(i).centerWord.mayEnt) | |||
entWord = qlog.semanticUnitList.get(i).centerWord; | |||
} | |||
// 1-1: (what) is [ent] | we guess users may want the type of ent. | |||
if(entWord!=null && whWord!= null && words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be")) | |||
{ | |||
int eid = entWord.emList.get(0).entityID; | |||
String subName = entWord.emList.get(0).entityName; | |||
Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+whWord.originalForm, null, entWord.emList.get(0).score); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
} | |||
} | |||
} | |||
@@ -0,0 +1,155 @@ | |||
package addition; | |||
import nlp.ds.DependencyTree; | |||
import nlp.ds.DependencyTreeNode; | |||
import nlp.ds.Word; | |||
import qa.Globals; | |||
import rdf.SemanticRelation; | |||
import rdf.Sparql; | |||
import rdf.Triple; | |||
import log.QueryLogger; | |||
public class AggregationRecognition { | |||
// Numbers | |||
static String x[]={"zero","one","two","three","four","five","six","seven","eight","nine"}; | |||
static String y[]={"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen"}; | |||
static String z[]={"twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"}; | |||
	public static Integer translateNumbers(String str) // integer literals, or English number words 0~99
	{
		// Case 1: a plain integer literal, e.g., "42".
		try {
			return Integer.valueOf(str);
		}
		catch (NumberFormatException e) {
			// Not a literal; fall through and parse number words.
		}
		// Case 2: number words.
		str = str.toLowerCase(); // normalize case so that e.g. "Twelve" is also handled
		int i, j;
		for(i=0;i<8;i++) // 20~99
		{
			if(str.equals(z[i])){
				return i*10+20; // e.g., "twenty" -> 20
			}
			for(j=1;j<10;j++)
			{
				if(str.equals(z[i]+" "+x[j])){
					return i*10+j+20; // e.g., "thirty two" -> 32
				}
			}
		}
		for(i=0;i<10;i++){ // 0~19
			if(str.equals(x[i])){
				return i;
			}
			else if(str.equals(y[i])){
				return 10+i;
			}
		}
		System.out.println("Warning: Can not Translate Number: " + str);
		return 1; // fallback
	}
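	// Recognize aggregation constructs: counting ("how many"/"how often"), numeric comparisons ("more than [num]"), and superlatives ("most").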
public void recognize(QueryLogger qlog) | |||
{ | |||
DependencyTree ds = qlog.s.dependencyTreeStanford; | |||
if(qlog.isMaltParserUsed) | |||
ds = qlog.s.dependencyTreeMalt; | |||
Word[] words = qlog.s.words; | |||
// how often | how many | |||
		String plainLower = qlog.s.plainText.toLowerCase();
		if(plainLower.contains("how many") || plainLower.contains("how often"))
{ | |||
for(Sparql sp: qlog.rankedSparqls) | |||
{ | |||
sp.countTarget = true; | |||
// How many pages does War and Peace have? --> res:War_and_Peace dbo:numberOfPages ?n . | |||
// ?uri dbo:populationTotal ?inhabitants . | |||
for(Triple triple: sp.tripleList) | |||
{ | |||
String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase(); | |||
if(p.contains("number") || p.contains("total") || p.contains("calories") || p.contains("satellites")) | |||
{ | |||
sp.countTarget = false; | |||
} | |||
} | |||
} | |||
} | |||
// more than [num] [node] | |||
for(DependencyTreeNode dtn: ds.nodesList) | |||
{ | |||
if(dtn.word.baseForm.equals("more")) | |||
{ | |||
if(dtn.father!=null && dtn.father.word.baseForm.equals("than")) | |||
{ | |||
DependencyTreeNode tmp = dtn.father; | |||
if(tmp.father!=null && tmp.father.word.posTag.equals("CD") && tmp.father.father!=null && tmp.father.father.word.posTag.startsWith("N")) | |||
{ | |||
DependencyTreeNode target = tmp.father.father; | |||
// Which caves have more than 3 entrances | entranceCount | filter | |||
for(Sparql sp: qlog.rankedSparqls) | |||
{ | |||
						if(target.father !=null && target.father.word.baseForm.equals("have"))
						{
							int num = translateNumbers(tmp.father.word.baseForm);
							sp.moreThanStr = "GROUP BY ?" + qlog.target.originalForm + "\nHAVING (COUNT(?"+target.word.originalForm + ") > "+num+")";
						}
else | |||
{ | |||
int num = translateNumbers(tmp.father.word.baseForm); | |||
sp.moreThanStr = "FILTER (?"+target.word.originalForm+"> " + num + ")"; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
// most | |||
for(Word word: words) | |||
{ | |||
if(word.baseForm.equals("most")) | |||
{ | |||
Word modifiedWord = word.modifiedWord; | |||
if(modifiedWord != null) | |||
{ | |||
for(Sparql sp: qlog.rankedSparqls) | |||
{ | |||
// Which Indian company has the most employees? --> ... dbo:numberOfEmployees ?n . || ?employees dbo:company ... | |||
sp.mostStr = "ORDER BY DESC(COUNT(?"+modifiedWord.originalForm+"))\nOFFSET 0 LIMIT 1"; | |||
for(Triple triple: sp.tripleList) | |||
{ | |||
String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase(); | |||
if(p.contains("number") || p.contains("total")) | |||
{ | |||
sp.mostStr = "ORDER BY DESC(?"+modifiedWord.originalForm+")\nOFFSET 0 LIMIT 1"; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
public static void main(String[] args) { | |||
System.out.println(translateNumbers("Twelve")); | |||
System.out.println(translateNumbers("thirty two")); | |||
} | |||
} |
@@ -0,0 +1,312 @@ | |||
package fgmt; | |||
import java.io.IOException; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import rdf.EntityMapping; | |||
import lcn.EntityFragmentFields; | |||
import lcn.EntityNameAndScore; | |||
import lcn.SearchInEntityFragments; | |||
public class EntityFragment extends Fragment { | |||
public int eId; | |||
public HashSet<Integer> inEdges = new HashSet<Integer>(); | |||
public HashSet<Integer> outEdges = new HashSet<Integer>(); | |||
public HashSet<Integer> types = new HashSet<Integer>(); | |||
	// in/out entities and the connecting edges. E.g., given <eId><director><tom> and <eId><star><tom>, the outEntMap of eId contains <tom, <director, star>>
	public HashMap<Integer, ArrayList<Integer>> inEntMap = new HashMap<Integer, ArrayList<Integer>>();	// note: the input file should contain no redundant triples.
public HashMap<Integer, ArrayList<Integer>> outEntMap = new HashMap<Integer, ArrayList<Integer>>(); | |||
static double thres1 = 0.4; | |||
static double thres2 = 0.8; | |||
static int thres3 = 3; | |||
static int k = 50; | |||
	/**
	 * Map a mention to entities using the Lucene index.
	 * 
	 * rule:
	 * select the top-k results for each phrase.
	 * (1) if the current lowest score < thres1, drop the results whose score < thres1.
	 * (2) if the current lowest score > thres2, also keep the results beyond top-k whose score > thres2.
	 * 
	 * exact match:
	 * (1) Lucene score = 1.
	 * (2) string match (lowercase): edit distance <= thres3.
	 * 
	 * score:
	 * use the Lucene score directly.
	 * 
	 * @param phrase
	 * @return
	 */
public static HashMap<Integer, Double> getCandEntityNames2(String phrase) { | |||
HashMap<Integer, Double> ret = new HashMap<Integer, Double>(); | |||
ArrayList<EntityNameAndScore> list1 = getCandEntityNames_subject(phrase, thres1, thres2, k); | |||
if(list1 == null) | |||
return ret; | |||
int iter_size = 0; | |||
if (list1.size() <= k) { | |||
iter_size = list1.size(); | |||
} | |||
else if (list1.size() > k) { | |||
if (list1.get(k-1).score >= thres2) { | |||
iter_size = list1.size(); | |||
} | |||
else { | |||
iter_size = k; | |||
} | |||
} | |||
for(int i = 0; i < iter_size; i ++) { | |||
if (i < k) { | |||
ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score)); | |||
} | |||
else if (list1.get(i).score >= thres2) { | |||
ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score)); | |||
} | |||
else { | |||
break; | |||
} | |||
} | |||
return ret; | |||
} | |||
public static ArrayList<EntityMapping> getEntityMappingList (String n) | |||
{ | |||
HashMap<Integer, Double> map = getCandEntityNames2(n); | |||
ArrayList<EntityMapping> ret = new ArrayList<EntityMapping>(); | |||
for (int eid : map.keySet()) | |||
{ | |||
String s = EntityFragmentFields.entityId2Name.get(eid); | |||
ret.add(new EntityMapping(eid, s, map.get(eid))); | |||
} | |||
Collections.sort(ret); | |||
return ret; | |||
} | |||
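	// Combine the Lucene score with an edit-distance penalty: an exact match keeps the Lucene score scaled by 100, and the score decays as the edit distance grows.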
public static double getScore (String s1, String s2, double luceneScore) { | |||
double ret = luceneScore*100.0/(Math.log(calEditDistance(s1, s2)*1.5+1)+1); | |||
return ret; | |||
} | |||
/** | |||
* Edit distance (all lowercase) | |||
* @param s1 | |||
* @param s2 | |||
* @return | |||
*/ | |||
public static int calEditDistance (String s1, String s2) { | |||
s1 = s1.toLowerCase(); | |||
s2 = s2.toLowerCase(); | |||
int d[][]; | |||
int n = s1.length(); | |||
int m = s2.length(); | |||
int i, j, temp; | |||
char ch1, ch2; | |||
if(n == 0) { | |||
return m; | |||
} | |||
if(m == 0) { | |||
return n; | |||
} | |||
d = new int[n+1][m+1]; | |||
for(i=0; i<=n; i++) { | |||
d[i][0] = i; | |||
} | |||
for(j=0; j<=m; j++) { | |||
d[0][j] = j; | |||
} | |||
for(i=1; i<=n; i++) { | |||
ch1 = s1.charAt(i-1); | |||
for(j=1; j<=m; j++) { | |||
ch2 = s2.charAt(j-1); | |||
if(ch1 == ch2) { | |||
temp = 0; | |||
} else { | |||
temp = 1; | |||
} | |||
d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp); | |||
} | |||
} | |||
return d[n][m]; | |||
} | |||
private static int min(int a, int b, int c) { | |||
int ab = a<b?a:b; | |||
return ab<c?ab:c; | |||
} | |||
public static ArrayList<EntityNameAndScore> getCandEntityNames_subject(String phrase, double thres1, double thres2, int k) { | |||
SearchInEntityFragments sf = new SearchInEntityFragments(); | |||
//System.out.println("EntityFragment.getCandEntityNames_subject() ..."); | |||
ArrayList<EntityNameAndScore> ret_sf = null; | |||
try { | |||
ret_sf = sf.searchName(phrase, thres1, thres2, k); | |||
} catch (IOException e) { | |||
//e.printStackTrace(); | |||
System.err.println("Reading lcn index error"); | |||
} | |||
return ret_sf; | |||
} | |||
public static EntityFragment getEntityFragmentByEntityId(Integer entityId) | |||
{ | |||
if(!EntityFragmentFields.entityFragmentString.containsKey(entityId)) | |||
return null; | |||
String fgmt = EntityFragmentFields.entityFragmentString.get(entityId); | |||
EntityFragment ef = new EntityFragment(entityId, fgmt); | |||
return ef; | |||
} | |||
public static String getEntityFgmtStringByName(String entityName) | |||
{ | |||
int id = EntityFragmentFields.entityName2Id.get(entityName); | |||
String fgmt = EntityFragmentFields.entityFragmentString.get(id); | |||
return fgmt; | |||
} | |||
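	/*
	 * Deserialize an entity fragment from its string encoding, which has five '|'-separated fields:
	 *   inEntMap | outEntMap | inEdges | outEdges | types
	 * Each inEntMap/outEntMap entry looks like "entId:edgeId;edgeId;," and the last three fields
	 * are comma-separated id lists. A minimal round-trip sketch (hypothetical ids):
	 *   EntityFragment ef = new EntityFragment(11, "3961112:2881;410;,|4641020:2330;,|8432,|8432,|5,");
	 *   String encoded = ef.toString(); // re-serializes into the same encoding
	 */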
public EntityFragment(int eid, String fgmt) | |||
{ | |||
eId = eid; | |||
fragmentType = typeEnum.ENTITY_FRAGMENT; | |||
//eg: 11 |3961112:2881;410;,4641020:2330;, | |||
fgmt = fgmt.replace('|', '#'); | |||
String[] fields = fgmt.split("#"); | |||
if(fields.length > 0 && fields[0].length() > 0) | |||
{ | |||
String[] entEdgesArr = fields[0].split(","); | |||
for(int i = 0; i < entEdgesArr.length; i ++) | |||
{ | |||
String[] nums = entEdgesArr[i].split(":"); | |||
if(nums.length != 2) | |||
continue; | |||
int intEntId = Integer.valueOf(nums[0]); | |||
String[] intEdges = nums[1].split(";"); | |||
ArrayList<Integer> intEdgeList = new ArrayList<Integer>(); | |||
for(String outEdge: intEdges) | |||
{ | |||
intEdgeList.add(Integer.valueOf(outEdge)); | |||
} | |||
if(intEdgeList.size()>0) | |||
inEntMap.put(intEntId, intEdgeList); | |||
} | |||
} | |||
if(fields.length > 1 && fields[1].length() > 0) | |||
{ | |||
String[] entEdgesArr = fields[1].split(","); | |||
for(int i = 0; i < entEdgesArr.length; i ++) | |||
{ | |||
String[] nums = entEdgesArr[i].split(":"); | |||
if(nums.length != 2) | |||
continue; | |||
int outEntId = Integer.valueOf(nums[0]); | |||
String[] outEdges = nums[1].split(";"); | |||
ArrayList<Integer> outEdgeList = new ArrayList<Integer>(); | |||
for(String outEdge: outEdges) | |||
{ | |||
outEdgeList.add(Integer.valueOf(outEdge)); | |||
} | |||
if(outEdgeList.size()>0) | |||
outEntMap.put(outEntId, outEdgeList); | |||
} | |||
} | |||
if(fields.length > 2 && fields[2].length() > 0) { | |||
String[] nums = fields[2].split(","); | |||
for(int i = 0; i < nums.length; i ++) { | |||
if (nums[i].length() > 0) { | |||
inEdges.add(Integer.parseInt(nums[i])); | |||
} | |||
} | |||
} | |||
if(fields.length > 3 && fields[3].length() > 0) { | |||
String[] nums = fields[3].split(","); | |||
for(int i = 0; i < nums.length; i ++) { | |||
if (nums[i].length() > 0) { | |||
outEdges.add(Integer.parseInt(nums[i])); | |||
} | |||
} | |||
} | |||
if(fields.length > 4 && fields[4].length() > 0) { | |||
String[] nums = fields[4].split(","); | |||
for(int i = 0; i < nums.length; i ++) { | |||
if (nums[i].length() > 0) { | |||
types.add(Integer.parseInt(nums[i])); | |||
} | |||
} | |||
} | |||
		//TODO: fix data for DBpedia 2014 (should be removed when the dataset is updated)
if(eid==2640237) //Barack_Obama | |||
{ | |||
inEdges.add(8432); //spouse | |||
outEdges.add(8432); | |||
ArrayList<Integer> outEdgeList = new ArrayList<Integer>(); | |||
outEdgeList.add(8432); | |||
inEntMap.put(4953443, outEdgeList); | |||
outEntMap.put(4953443, outEdgeList); | |||
} | |||
} | |||
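	// Serialize this fragment back into the same '|'-separated encoding that the constructor parses.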
@Override | |||
public String toString() | |||
{ | |||
StringBuilder ret = new StringBuilder(""); | |||
for(Integer inEnt: inEntMap.keySet()) | |||
{ | |||
ArrayList<Integer> inEdgeList = inEntMap.get(inEnt); | |||
if(inEdgeList==null || inEdgeList.size()==0) | |||
continue; | |||
ret.append(inEnt+":"); | |||
for(int inEdge: inEdgeList) | |||
ret.append(inEdge+";"); | |||
ret.append(","); | |||
} | |||
ret.append('|'); | |||
for(Integer outEnt: outEntMap.keySet()) | |||
{ | |||
ArrayList<Integer> outEdgeList = outEntMap.get(outEnt); | |||
if(outEdgeList==null || outEdgeList.size()==0) | |||
continue; | |||
ret.append(outEnt+":"); | |||
for(int outEdge: outEdgeList) | |||
ret.append(outEdge+";"); | |||
ret.append(","); | |||
} | |||
ret.append('|'); | |||
for(Integer p : inEdges) { | |||
ret.append(p); | |||
ret.append(','); | |||
} | |||
ret.append('|'); | |||
for(Integer p : outEdges) { | |||
ret.append(p); | |||
ret.append(','); | |||
} | |||
ret.append('|'); | |||
for(Integer t : types) { | |||
ret.append(t); | |||
ret.append(','); | |||
} | |||
return ret.toString(); | |||
} | |||
} |
@@ -0,0 +1,8 @@ | |||
package fgmt; | |||
public abstract class Fragment { | |||
public enum typeEnum {ENTITY_FRAGMENT, RELATION_FRAGMENT, TYPE_FRAGMENT, VAR_FRAGMENT}; | |||
public typeEnum fragmentType; | |||
public int fragmentId; | |||
}; |
@@ -0,0 +1,105 @@ | |||
package fgmt; | |||
import java.io.IOException; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import java.util.List; | |||
import qa.Globals; | |||
import utils.FileUtil; | |||
public class RelationFragment extends Fragment | |||
{ | |||
public static HashMap<Integer, ArrayList<RelationFragment>> relFragments = null; | |||
public static HashMap<String, ArrayList<Integer>> relationShortName2IdList = null; | |||
public static HashSet<Integer> literalRelationSet = null; | |||
public HashSet<Integer> inTypes = new HashSet<Integer>(); | |||
public HashSet<Integer> outTypes = new HashSet<Integer>(); | |||
public static final int literalTypeId = -176; | |||
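	// inFgmt/outFgmt are comma-separated type-id lists; the special out-string "itera" marks a relation whose objects are literals (see load()).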
public RelationFragment(String inFgmt, String outFgmt, int fid) | |||
{ | |||
fragmentId = fid; | |||
fragmentType = typeEnum.RELATION_FRAGMENT; | |||
String[] nums; | |||
// in | |||
nums = inFgmt.split(","); | |||
for(String s: nums) | |||
if(s.length() > 0) | |||
inTypes.add(Integer.parseInt(s)); | |||
// out | |||
if(outFgmt.equals("itera")) | |||
outTypes.add(literalTypeId); | |||
else | |||
{ | |||
nums = outFgmt.split(","); | |||
for(String s: nums) | |||
if(s.length() > 0) | |||
outTypes.add(Integer.parseInt(s)); | |||
} | |||
} | |||
public static void load() throws Exception | |||
{ | |||
String filename = Globals.localPath + "data/DBpedia2016/fragments/predicate_RDF_fragment/predicate_fragment.txt"; | |||
List<String> inputs = FileUtil.readFile(filename); | |||
relFragments = new HashMap<Integer, ArrayList<RelationFragment>>(); | |||
literalRelationSet = new HashSet<Integer>(); | |||
for(String line: inputs) | |||
{ | |||
String[] lines = line.split("\t"); | |||
String inString = lines[0].substring(1, lines[0].length()-1); | |||
int pid = Integer.parseInt(lines[1]); | |||
String outString = lines[2].substring(1, lines[2].length()-1); | |||
			// Record which relations can connect LITERAL objects.
			if(outString.equals("itera")) // "itera" is "literal" after substring(1, length()-1) stripped its first and last characters
literalRelationSet.add(pid); | |||
if(!relFragments.containsKey(pid)) | |||
relFragments.put(pid, new ArrayList<RelationFragment>()); | |||
relFragments.get(pid).add(new RelationFragment(inString, outString, pid)); | |||
} | |||
loadId(); | |||
} | |||
public static void loadId() throws IOException | |||
{ | |||
String filename = Globals.localPath + "data/DBpedia2016/fragments/id_mappings/16predicate_id.txt"; | |||
List<String> inputs = FileUtil.readFile(filename); | |||
relationShortName2IdList = new HashMap<String, ArrayList<Integer>>(); | |||
for(String line: inputs) | |||
{ | |||
String[] lines = line.split("\t"); | |||
String rlnShortName = lines[0]; | |||
if (!relationShortName2IdList.containsKey(rlnShortName)) | |||
relationShortName2IdList.put(rlnShortName, new ArrayList<Integer>()); | |||
relationShortName2IdList.get(rlnShortName).add(Integer.parseInt(lines[1])); | |||
} | |||
} | |||
public static boolean isLiteral (String p) | |||
{ | |||
for (Integer i : relationShortName2IdList.get(p)) | |||
if (literalRelationSet.contains(i)) | |||
return true; | |||
return false; | |||
} | |||
public static boolean isLiteral (int pid) | |||
{ | |||
if (literalRelationSet.contains(pid)) | |||
return true; | |||
else | |||
return false; | |||
} | |||
} |
@@ -0,0 +1,179 @@ | |||
package fgmt; | |||
import java.io.BufferedReader; | |||
import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import qa.Globals; | |||
public class TypeFragment extends Fragment { | |||
public static HashMap<Integer, TypeFragment> typeFragments = null; | |||
public static HashMap<String, ArrayList<Integer>> typeShortName2IdList = null; | |||
public static HashMap<Integer, String> typeId2ShortName = null; | |||
public static final int NO_RELATION = -24232; | |||
public static HashSet<String> yagoTypeList = null; | |||
public HashSet<Integer> inEdges = new HashSet<Integer>(); | |||
public HashSet<Integer> outEdges = new HashSet<Integer>(); | |||
public HashSet<Integer> entSet = new HashSet<Integer>(); | |||
	/*
	 * Eliminate some bad YAGO types which conflict with:
	 * 1. ENT: amazon, earth, the_hunger_game, sparkling_wine
	 * 2. TYPE: type
	 * 3. RELATION: flow, owner, series, shot, part, care
	 * 4. others: peace, vice
	 */
public static ArrayList<String> stopYagoTypeList = null; | |||
static void loadStopYagoTypeList() | |||
{ | |||
stopYagoTypeList = new ArrayList<String>(); | |||
stopYagoTypeList.add("Amazon"); | |||
stopYagoTypeList.add("Earth"); | |||
stopYagoTypeList.add("TheHungerGames"); | |||
stopYagoTypeList.add("SparklingWine"); | |||
stopYagoTypeList.add("Type"); | |||
stopYagoTypeList.add("Flow"); | |||
stopYagoTypeList.add("Owner"); | |||
stopYagoTypeList.add("Series"); | |||
stopYagoTypeList.add("Shot"); | |||
stopYagoTypeList.add("Part"); | |||
stopYagoTypeList.add("Care"); | |||
stopYagoTypeList.add("Peace"); | |||
stopYagoTypeList.add("Vice"); | |||
stopYagoTypeList.add("Dodo"); | |||
stopYagoTypeList.add("CzechFilms"); | |||
stopYagoTypeList.add("ChineseFilms"); | |||
} | |||
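	// Deserialize a type fragment: three '|'-separated fields holding the comma-separated in-edge ids, out-edge ids, and entity ids.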
public TypeFragment(String fgmt, int fid) | |||
{ | |||
fragmentId = fid; | |||
fragmentType = typeEnum.TYPE_FRAGMENT; | |||
fgmt = fgmt.replace('|', '#'); | |||
String[] ss = fgmt.split("#"); | |||
String[] nums; | |||
if (ss[0].length() > 0) { | |||
nums = ss[0].split(","); | |||
for(int i = 0; i < nums.length; i ++) { | |||
if (nums[i].length() > 0) { | |||
inEdges.add(Integer.parseInt(nums[i])); | |||
} | |||
} | |||
} | |||
else { | |||
inEdges.add(NO_RELATION); | |||
} | |||
if (ss.length > 1 && ss[1].length() > 0) { | |||
nums = ss[1].split(","); | |||
for(int i = 0; i < nums.length; i ++) { | |||
if (nums[i].length() > 0) { | |||
outEdges.add(Integer.parseInt(nums[i])); | |||
} | |||
} | |||
} | |||
else { | |||
outEdges.add(NO_RELATION); | |||
} | |||
if(ss.length > 2 && ss[2].length() > 0) | |||
{ | |||
nums = ss[2].split(","); | |||
for(int i = 0; i < nums.length; i ++) { | |||
if (nums[i].length() > 0) { | |||
entSet.add(Integer.parseInt(nums[i])); | |||
} | |||
} | |||
} | |||
} | |||
public static void load() throws Exception | |||
{ | |||
String filename = Globals.localPath+"data/DBpedia2016/fragments/class_RDF_fragment/16type_fragment.txt"; | |||
File file = new File(filename); | |||
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); | |||
BufferedReader br = new BufferedReader(in); | |||
typeFragments = new HashMap<Integer, TypeFragment>(); | |||
System.out.println("Loading type IDs and Fragments ..."); | |||
String line; | |||
while((line = br.readLine()) != null) { | |||
String[] lines = line.split("\t"); | |||
TypeFragment tfgmt = null; | |||
if(lines[0].length() > 0 && !lines[0].equals("literal")) { | |||
int tid = Integer.parseInt(lines[0]); | |||
				try {
					tfgmt = new TypeFragment(lines[1], tid);
				}
				catch(Exception e) {
					// malformed fragment line: keep a null placeholder for this type id
				}
typeFragments.put(tid, tfgmt); | |||
} | |||
} | |||
br.close(); | |||
// can fix some data there | |||
// load Type Id | |||
loadId(); | |||
		System.out.println("Loaded "+typeId2ShortName.size()+" basic types and "+yagoTypeList.size()+" yago types.");
} | |||
public static void loadId() throws IOException | |||
{ | |||
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16basic_types_id.txt"; | |||
String yagoFileName = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16yago_types_list.txt"; | |||
File file = new File(filename); | |||
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); | |||
BufferedReader br = new BufferedReader(in); | |||
typeShortName2IdList = new HashMap<String, ArrayList<Integer>>(); | |||
typeId2ShortName = new HashMap<Integer, String>(); | |||
String line; | |||
while((line = br.readLine()) != null) { | |||
String[] lines = line.split("\t"); | |||
String typeShortName = lines[0]; | |||
			// preserve typeShortName's capitalization
if (!typeShortName2IdList.containsKey(typeShortName)) { | |||
typeShortName2IdList.put(typeShortName, new ArrayList<Integer>()); | |||
} | |||
typeShortName2IdList.get(typeShortName).add(Integer.parseInt(lines[1])); | |||
typeId2ShortName.put(Integer.parseInt(lines[1]), typeShortName); | |||
} | |||
// literalType | |||
typeShortName2IdList.put("literal_HRZ", new ArrayList<Integer>()); | |||
typeShortName2IdList.get("literal_HRZ").add(RelationFragment.literalTypeId); | |||
typeId2ShortName.put(RelationFragment.literalTypeId, "literal_HRZ"); | |||
br.close(); | |||
//load YAGO types | |||
in = new InputStreamReader(new FileInputStream(yagoFileName),"utf-8"); | |||
br = new BufferedReader(in); | |||
yagoTypeList = new HashSet<String>(); | |||
while((line = br.readLine())!=null) | |||
{ | |||
String[] lines = line.split("\t"); | |||
String typeName = lines[0]; | |||
yagoTypeList.add(typeName); | |||
} | |||
loadStopYagoTypeList(); | |||
yagoTypeList.removeAll(stopYagoTypeList); | |||
} | |||
} |
@@ -0,0 +1,56 @@ | |||
package fgmt; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashSet; | |||
import java.util.Iterator; | |||
public class VariableFragment extends Fragment { | |||
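	// magic_number marks a wildcard candidate type set: see containsAll(), where such a set matches on any overlap instead of exact equality.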
public static final int magic_number = -265; | |||
public ArrayList<HashSet<Integer>> candTypes = null; | |||
public HashSet<Integer> candEntities = null; | |||
public boolean mayLiteral = false; | |||
public VariableFragment() | |||
{ | |||
fragmentType = typeEnum.VAR_FRAGMENT; | |||
candTypes = new ArrayList<HashSet<Integer>>(); | |||
candEntities = new HashSet<Integer>(); | |||
} | |||
@Override | |||
public String toString() | |||
{ | |||
return "("+ candEntities.size() +")"; | |||
} | |||
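	// Return true iff some recorded candidate type set matches s1: wildcard sets (containing magic_number) match on any overlap; ordinary sets require exact set equality.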
public boolean containsAll(HashSet<Integer> s1) { | |||
Iterator<HashSet<Integer>> it = candTypes.iterator(); | |||
while(it.hasNext()) { | |||
HashSet<Integer> s2 = it.next(); | |||
if (s2.contains(magic_number)) { | |||
if (!Collections.disjoint(s1, s2)) { | |||
return true; | |||
} | |||
} | |||
else { | |||
if (s1.containsAll(s2) && s2.containsAll(s1)) { | |||
return true; | |||
} | |||
} | |||
} | |||
return false; | |||
} | |||
public boolean contains(Integer i) { | |||
Iterator<HashSet<Integer>> it = candTypes.iterator(); | |||
while(it.hasNext()) { | |||
HashSet<Integer> s = it.next(); | |||
if (s.contains(i)) { | |||
return true; | |||
} | |||
} | |||
return false; | |||
} | |||
} |
@@ -0,0 +1,489 @@ | |||
package jgsc; | |||
import java.io.*; | |||
import java.net.*; | |||
import java.lang.*; | |||
import java.net.URLEncoder; | |||
import java.net.URLDecoder; | |||
import java.io.UnsupportedEncodingException; | |||
import java.util.List; | |||
import java.util.Map; | |||
public class GstoreConnector { | |||
public static final String defaultServerIP = "127.0.0.1"; | |||
public static final int defaultServerPort = 9000; | |||
private String serverIP; | |||
private int serverPort; | |||
//private Socket socket = null; | |||
public GstoreConnector() { | |||
this.serverIP = GstoreConnector.defaultServerIP; | |||
this.serverPort = GstoreConnector.defaultServerPort; | |||
} | |||
public GstoreConnector(int _port) { | |||
this.serverIP = GstoreConnector.defaultServerIP; | |||
this.serverPort = _port; | |||
} | |||
public GstoreConnector(String _ip, int _port) { | |||
this.serverIP = _ip; | |||
this.serverPort = _port; | |||
} | |||
	//PERFORMANCE: what if the query result is too large? Receive it and save it to a file directly instead.
	//In addition, set -Xmx larger (maybe on the scale of GBs) if the query result could be very large;
	//this may help to reduce the GC cost.
public String sendGet(String param) { | |||
String url = "http://" + this.serverIP + ":" + this.serverPort; | |||
StringBuffer result = new StringBuffer(); | |||
BufferedReader in = null; | |||
System.out.println("parameter: "+param); | |||
try { | |||
param = URLEncoder.encode(param, "UTF-8"); | |||
} | |||
catch (UnsupportedEncodingException ex) { | |||
throw new RuntimeException("Broken VM does not support UTF-8"); | |||
} | |||
try { | |||
String urlNameString = url + "/" + param; | |||
System.out.println("request: "+urlNameString); | |||
URL realUrl = new URL(urlNameString); | |||
			// open the connection to the URL
URLConnection connection = realUrl.openConnection(); | |||
			// set the common request properties
connection.setRequestProperty("accept", "*/*"); | |||
connection.setRequestProperty("connection", "Keep-Alive"); | |||
			//set the agent to avoid speed-limiting by the server if it thinks the client is not a browser
connection.setRequestProperty("user-agent", | |||
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); | |||
			// establish the actual connection
connection.connect(); | |||
long t0 = System.currentTimeMillis(); //ms | |||
			// get all the response header fields
Map<String, List<String>> map = connection.getHeaderFields(); | |||
			// iterate over all the response header fields (debug)
//for (String key : map.keySet()) { | |||
// System.out.println(key + "--->" + map.get(key)); | |||
//} | |||
long t1 = System.currentTimeMillis(); //ms | |||
//System.out.println("Time to get header: "+(t1 - t0)+" ms"); | |||
//System.out.println("============================================"); | |||
			// create a BufferedReader to read the response from the URL
in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8")); | |||
String line; | |||
while ((line = in.readLine()) != null) { | |||
//PERFORMANCE: this can be very costly if result is very large, because many temporary Strings are produced | |||
				//In this case, just printing the line directly would be much faster
result.append(line+"\n"); | |||
//System.out.println("get data size: " + line.length()); | |||
//System.out.println(line); | |||
} | |||
long t2 = System.currentTimeMillis(); //ms | |||
//System.out.println("Time to get data: "+(t2 - t1)+" ms"); | |||
} catch (Exception e) { | |||
System.out.println("error in get request: " + e); | |||
e.printStackTrace(); | |||
} | |||
		// use a finally block to close the input stream
finally { | |||
try { | |||
if (in != null) { | |||
in.close(); | |||
} | |||
} catch (Exception e2) { | |||
e2.printStackTrace(); | |||
} | |||
} | |||
return result.toString(); | |||
} | |||
public void sendGet(String param, String filename) { | |||
String url = "http://" + this.serverIP + ":" + this.serverPort; | |||
BufferedReader in = null; | |||
System.out.println("parameter: "+param); | |||
if (filename == null) | |||
return; | |||
FileWriter fw = null; | |||
try { | |||
fw = new FileWriter(filename); | |||
} catch (IOException e) { | |||
System.out.println("can not open " + filename + "!"); | |||
} | |||
try { | |||
param = URLEncoder.encode(param, "UTF-8"); | |||
} catch (UnsupportedEncodingException ex) { | |||
throw new RuntimeException("Broken VM does not support UTF-8"); | |||
} | |||
try { | |||
String urlNameString = url + "/" + param; | |||
System.out.println("request: "+urlNameString); | |||
URL realUrl = new URL(urlNameString); | |||
			// open the connection to the URL
URLConnection connection = realUrl.openConnection(); | |||
			// set the common request properties
connection.setRequestProperty("accept", "*/*"); | |||
connection.setRequestProperty("connection", "Keep-Alive"); | |||
			//set the agent to avoid speed-limiting by the server if it thinks the client is not a browser
connection.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); | |||
			// establish the actual connection
connection.connect(); | |||
long t0 = System.currentTimeMillis(); //ms | |||
			// get all the response header fields
Map<String, List<String>> map = connection.getHeaderFields(); | |||
			// iterate over all the response header fields (debug)
//for (String key : map.keySet()) { | |||
// System.out.println(key + "--->" + map.get(key)); | |||
//} | |||
long t1 = System.currentTimeMillis(); // ms | |||
//System.out.println("Time to get header: "+(t1 - t0)+" ms"); | |||
			// create a BufferedReader to read the response from the URL
in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8")); | |||
			char chars[] = new char[2048];
			int b;
			while ((b = in.read(chars, 0, 2048)) != -1) {
				if (fw != null)
					fw.write(chars, 0, b); // write only the characters actually read, not the whole buffer
			}
long t2 = System.currentTimeMillis(); //ms | |||
//System.out.println("Time to get data: "+(t2 - t1)+" ms"); | |||
} catch (Exception e) { | |||
//System.out.println("error in get request: " + e); | |||
e.printStackTrace(); | |||
} | |||
		// use a finally block to close the input stream
finally { | |||
try { | |||
if (in != null) { | |||
in.close(); | |||
} | |||
if (fw != null) { | |||
fw.close(); | |||
} | |||
} catch (Exception e2) { | |||
e2.printStackTrace(); | |||
} | |||
} | |||
return; | |||
} | |||
//NOTICE: no need to connect now, HTTP connection is kept by default | |||
public boolean load(String _db_name, String _username, String _password) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.load"); | |||
return false; | |||
} | |||
String cmd = "?operation=load&db_name=" + _db_name + "&username=" + _username + "&password=" + _password; | |||
String msg = this.sendGet(cmd); | |||
//if (!send_return) { | |||
//System.err.println("send load command error. @GstoreConnector.load"); | |||
//return false; | |||
//} | |||
this.disconnect(); | |||
System.out.println(msg); | |||
if (msg.equals("load database done.")) { | |||
return true; | |||
} | |||
return false; | |||
} | |||
public boolean unload(String _db_name,String _username, String _password) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.unload"); | |||
return false; | |||
} | |||
String cmd = "?operation=unload&db_name=" + _db_name + "&username=" + _username + "&password=" + _password; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
System.out.println(msg); | |||
if (msg.equals("unload database done.")) { | |||
return true; | |||
} | |||
return false; | |||
} | |||
public boolean build(String _db_name, String _rdf_file_path, String _username, String _password) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.build"); | |||
return false; | |||
} | |||
//TODO: also use encode to support spaces? | |||
//Consider change format into ?name=DBname | |||
		String cmd = "?operation=build&db_name=" + _db_name + "&ds_path=" + _rdf_file_path + "&username=" + _username + "&password=" + _password;
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
System.out.println(msg); | |||
if (msg.equals("import RDF file to database done.")) { | |||
return true; | |||
} | |||
return false; | |||
} | |||
//TODO: not implemented | |||
public boolean drop(String _db_name) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.drop"); | |||
return false; | |||
} | |||
String cmd = "drop/" + _db_name; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
System.out.println(msg); | |||
return msg.equals("drop database done."); | |||
} | |||
public String query(String _username, String _password, String _db_name, String _sparql) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.query"); | |||
return "connect to server error."; | |||
} | |||
//URL encode should be used here | |||
//try { | |||
//_sparql = URLEncoder.encode("\""+_sparql+"\"", "UTF-8"); | |||
//} | |||
//catch (UnsupportedEncodingException ex) { | |||
//throw new RuntimeException("Broken VM does not support UTF-8"); | |||
//} | |||
String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=txt&sparql=" + _sparql; | |||
//String cmd = "query/\"" + _sparql + "\""; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
return msg; | |||
} | |||
public void query(String _username, String _password, String _db_name, String _sparql, String _filename) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.query"); | |||
} | |||
String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=json&sparql=" + _sparql; | |||
this.sendGet(cmd, _filename); | |||
this.disconnect(); | |||
return; | |||
} | |||
// public String show() { | |||
// return this.show(false); | |||
// } | |||
//show all databases | |||
public String show() { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.show"); | |||
return "connect to server error."; | |||
} | |||
String cmd = "?operation=show"; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
return msg; | |||
} | |||
public String user(String type, String username1, String password1, String username2, String addtion) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.show"); | |||
return "connect to server error."; | |||
} | |||
String cmd = "?operation=user&type=" + type + "&username1=" + username1 + "&password1=" + password1 + "&username2=" + username2 + "&addtion=" + addtion; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
return msg; | |||
} | |||
public String showUser() { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.show"); | |||
return "connect to server error."; | |||
} | |||
String cmd = "?operation=showUser"; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
return msg; | |||
} | |||
public String monitor(String db_name) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.show"); | |||
return "connect to server error."; | |||
} | |||
String cmd = "?operation=monitor&db_name=" + db_name; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
return msg; | |||
} | |||
public String checkpoint(String db_name) { | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.show"); | |||
return "connect to server error."; | |||
} | |||
String cmd = "?operation=checkpoint&db_name=" + db_name; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
return msg; | |||
} | |||
public String test_download(String filepath) | |||
{ | |||
boolean connect_return = this.connect(); | |||
if (!connect_return) { | |||
System.err.println("connect to server error. @GstoreConnector.query"); | |||
return "connect to server error."; | |||
} | |||
//TEST: a small file, a large file | |||
String cmd = "?operation=delete&download=true&filepath=" + filepath; | |||
String msg = this.sendGet(cmd); | |||
this.disconnect(); | |||
return msg; | |||
} | |||
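	// The HTTP connection is kept by default (see the NOTICE above), so connect()/disconnect() remain only as no-op placeholders for API compatibility.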
private boolean connect() { | |||
return true; | |||
} | |||
private boolean disconnect() { | |||
return true; | |||
} | |||
private static byte[] packageMsgData(String _msg) { | |||
//byte[] data_context = _msg.getBytes(); | |||
byte[] data_context = null; | |||
try { | |||
data_context = _msg.getBytes("utf-8"); | |||
} catch (UnsupportedEncodingException e) { | |||
// TODO Auto-generated catch block | |||
e.printStackTrace(); | |||
System.err.println("utf-8 charset is unsupported."); | |||
data_context = _msg.getBytes(); | |||
} | |||
int context_len = data_context.length + 1; // 1 byte for '\0' at the end of the context. | |||
int data_len = context_len + 4; // 4 byte for one int(data_len at the data's head). | |||
byte[] data = new byte[data_len]; | |||
// padding head(context_len). | |||
byte[] head = GstoreConnector.intToByte4(context_len); | |||
for (int i = 0; i < 4; i++) { | |||
data[i] = head[i]; | |||
} | |||
// padding context. | |||
for (int i = 0; i < data_context.length; i++) { | |||
data[i + 4] = data_context[i]; | |||
} | |||
		// in C, there should be '\0' as the terminator at the end of a char array, so we need to add '\0' at the end of the message being sent.
data[data_len - 1] = 0; | |||
return data; | |||
} | |||
private static byte[] intToByte4(int _x) // with Little Endian format. | |||
{ | |||
byte[] ret = new byte[4]; | |||
ret[0] = (byte) (_x); | |||
ret[1] = (byte) (_x >>> 8); | |||
ret[2] = (byte) (_x >>> 16); | |||
ret[3] = (byte) (_x >>> 24); | |||
return ret; | |||
} | |||
private static int byte4ToInt(byte[] _b) // with Little Endian format. | |||
{ | |||
int byte0 = _b[0] & 0xFF, byte1 = _b[1] & 0xFF, byte2 = _b[2] & 0xFF, byte3 = _b[3] & 0xFF; | |||
int ret = (byte0) | (byte1 << 8) | (byte2 << 16) | (byte3 << 24); | |||
return ret; | |||
} | |||
public static void main(String[] args) { | |||
// initialize the GStore server's IP address and port. | |||
GstoreConnector gc = new GstoreConnector("172.31.222.90", 9001); | |||
// build a new database by a RDF file. | |||
// note that the relative path is related to gserver. | |||
//gc.build("db_LUBM10", "example/rdf_triple/LUBM_10_GStore.n3"); | |||
String sparql = "select ?x where {" | |||
+ "<Area_51> <location> ?x" | |||
+ "}"; | |||
sparql = "select ?countries where { ?countries <type> <Country> . ?caves <type> <Cave> . ?caves <location> ?countries . } " | |||
+ "GROUP BY ?countries HAVING(COUNT(?caves) > 1000)"; | |||
sparql = "ASK where { <Proinsulin> <type> <Protein> .}"; | |||
sparql = "select DISTINCT ?film ?budget where { ?film <type> <Film> . ?film <director> <Paul_W._S._Anderson> . ?film <budget> ?budget . }"; | |||
// boolean flag = gc.load("dbpedia16", "root", "123456"); | |||
//System.out.println(flag); | |||
String answer = gc.query("root", "123456", "dbpedia16", sparql); | |||
System.out.println(answer); | |||
//To count the time cost | |||
//long startTime=System.nanoTime(); //ns | |||
//long startTime=System.currentTimeMillis(); //ms | |||
		//doSomeThing(); // the task to be timed
		//long endTime=System.currentTimeMillis(); // get the end time
		//System.out.println("Run time: "+(endTime-startTime)+"ms");
} | |||
} | |||
@@ -0,0 +1,133 @@ | |||
package lcn; | |||
import java.io.BufferedReader; | |||
import java.io.File; | |||
import java.io.FileInputStream; | |||
//import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.Date; | |||
import org.apache.lucene.analysis.Analyzer; | |||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||
import org.apache.lucene.document.Document; | |||
import org.apache.lucene.document.Field; | |||
import org.apache.lucene.index.IndexWriter; | |||
import qa.Globals; | |||
/**
 * The basic indexing unit in Lucene is the Document; Fields can be added to it as needed.
 * 
 * A Document is one record, i.e., one entry of the inverted index built for searching, analogous to a row in a database.
 * E.g., to search the files on your own computer, you can create Fields (analogous to database columns),
 * combine the Fields into Documents, and the Documents finally become a set of index files.
 * This Document is a different concept from a file-system document.
 * 
 * StandardAnalyzer is Lucene's built-in "standard analyzer". It can:
 * 1. tokenize the original sentence on whitespace;
 * 2. convert all capital letters to lowercase;
 * 3. remove useless words such as "is", "the", "are", and also remove all punctuation.
 */
public class BuildIndexForEntityFragments{ | |||
public void indexforentity() throws Exception | |||
{ | |||
if(EntityFragmentFields.entityId2Name == null) | |||
EntityFragmentFields.load(); | |||
long startTime = new Date().getTime(); | |||
//Try update KB index to DBpedia2015. by husen 2016-04-08 | |||
//Try update KB index to DBpedia2016. by husen 2018-8-22 | |||
File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index"); | |||
File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"); | |||
Analyzer luceneAnalyzer_en = new StandardAnalyzer(); | |||
IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true); | |||
int mergeFactor = 100000; //default 10 | |||
int maxBufferedDoc = 1000; //default 10 | |||
int maxMergeDoc = Integer.MAX_VALUE; //INF | |||
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; | |||
indexWriter_en.setMergeFactor(mergeFactor); | |||
indexWriter_en.setMaxBufferedDocs(maxBufferedDoc); | |||
indexWriter_en.setMaxMergeDocs(maxMergeDoc); | |||
FileInputStream file = new FileInputStream(sourceDir_en); | |||
InputStreamReader in = new InputStreamReader(file,"UTF-8"); | |||
BufferedReader br = new BufferedReader(in); | |||
int count = 0; | |||
		String line;
		while((line = br.readLine()) != null)
		{
			count++;
			if(count % 100000 == 0)
				System.out.println(count);
String temp[] = line.split("\t"); | |||
if(temp.length != 2) | |||
continue; | |||
else | |||
{ | |||
int entity_id = Integer.parseInt(temp[0]); | |||
if(!EntityFragmentFields.entityId2Name.containsKey(entity_id)) | |||
continue; | |||
String entity_name = EntityFragmentFields.entityId2Name.get(entity_id); | |||
String entity_fragment = temp[1]; | |||
entity_name = entity_name.replace("____", " "); | |||
entity_name = entity_name.replace("__", " "); | |||
entity_name = entity_name.replace("_", " "); | |||
Document document = new Document(); | |||
Field EntityName = new Field("EntityName", entity_name, Field.Store.YES, | |||
Field.Index.TOKENIZED, | |||
Field.TermVector.WITH_POSITIONS_OFFSETS); | |||
Field EntityId = new Field("EntityId", String.valueOf(entity_id), | |||
Field.Store.YES, Field.Index.NO); | |||
Field EntityFragment = new Field("EntityFragment", entity_fragment, | |||
Field.Store.YES, Field.Index.NO); | |||
document.add(EntityName); | |||
document.add(EntityId); | |||
document.add(EntityFragment); | |||
indexWriter_en.addDocument(document); | |||
} | |||
} | |||
indexWriter_en.optimize(); | |||
indexWriter_en.close(); | |||
br.close(); | |||
		// print the time taken to build the index
		long endTime = new Date().getTime();
		System.out.println("entity_name index has been built -> " + count + " entries, Time: " + (endTime - startTime) + "ms");
} | |||
public static void main(String[] args) | |||
{ | |||
BuildIndexForEntityFragments bef = new BuildIndexForEntityFragments(); | |||
try | |||
{ | |||
Globals.localPath="D:/husen/gAnswer/"; | |||
bef.indexforentity(); | |||
} | |||
catch (Exception e) | |||
{ | |||
e.printStackTrace(); | |||
} | |||
} | |||
} | |||
@@ -0,0 +1,107 @@ | |||
package lcn; | |||
import java.io.File; | |||
import java.util.ArrayList; | |||
import java.util.Date; | |||
import java.util.HashMap; | |||
import java.util.Iterator; | |||
import org.apache.lucene.analysis.Analyzer; | |||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||
import org.apache.lucene.document.Document; | |||
import org.apache.lucene.document.Field; | |||
import org.apache.lucene.index.IndexWriter; | |||
import qa.Globals; | |||
import fgmt.TypeFragment; | |||
public class BuildIndexForTypeShortName { | |||
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception | |||
{ | |||
long startTime = new Date().getTime(); | |||
File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index"); | |||
Analyzer luceneAnalyzer_li = new StandardAnalyzer(); | |||
IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true); | |||
int mergeFactor = 100000; | |||
int maxBufferedDoc = 1000; | |||
int maxMergeDoc = Integer.MAX_VALUE; | |||
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; | |||
indexWriter_li.setMergeFactor(mergeFactor); | |||
indexWriter_li.setMaxBufferedDocs(maxBufferedDoc); | |||
indexWriter_li.setMaxMergeDocs(maxMergeDoc); | |||
int count = 0; | |||
Iterator<String> it = typeShortName2IdList.keySet().iterator(); | |||
while (it.hasNext()) | |||
{ | |||
String sn = it.next(); | |||
if (sn.length() == 0) { | |||
continue; | |||
} | |||
count ++; | |||
StringBuilder splittedSn = new StringBuilder(""); | |||
if(sn.contains("_")) | |||
{ | |||
String nsn = sn.replace("_", " "); | |||
splittedSn.append(nsn.toLowerCase()); | |||
} | |||
else | |||
{ | |||
int last = 0, i = 0; | |||
for(i = 0; i < sn.length(); i ++) | |||
{ | |||
					// split before any character that is not a lowercase letter
if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z')) | |||
{ | |||
splittedSn.append(sn.substring(last, i).toLowerCase()); | |||
splittedSn.append(' '); | |||
last = i; | |||
} | |||
} | |||
splittedSn.append(sn.substring(last, i).toLowerCase()); | |||
while(splittedSn.charAt(0) == ' ') { | |||
splittedSn.deleteCharAt(0); | |||
} | |||
} | |||
			System.out.println("SplittedType: "+splittedSn);
Document document = new Document(); | |||
Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(), | |||
Field.Store.YES, | |||
Field.Index.TOKENIZED, | |||
Field.TermVector.WITH_POSITIONS_OFFSETS); | |||
Field TypeShortName = new Field("TypeShortName", sn, | |||
Field.Store.YES, Field.Index.NO); | |||
document.add(SplittedTypeShortName); | |||
document.add(TypeShortName); | |||
indexWriter_li.addDocument(document); | |||
} | |||
indexWriter_li.optimize(); | |||
indexWriter_li.close(); | |||
		// print the time taken to build the index
		long endTime = new Date().getTime();
		System.out.println("TypeShortName index has been built -> " + count + " entries, Time: " + (endTime - startTime) + "ms");
} | |||
public static void main (String[] args) { | |||
try { | |||
Globals.localPath="D:/husen/gAnswer/"; | |||
TypeFragment.load(); | |||
BuildIndexForTypeShortName.buildIndex(TypeFragment.typeShortName2IdList); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,64 @@ | |||
package lcn; | |||
import java.io.BufferedReader; | |||
import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.HashMap; | |||
import qa.Globals; | |||
public class EntityFragmentFields { | |||
// entity dictionary | |||
public static HashMap<String, Integer> entityName2Id = null; | |||
public static HashMap<Integer, String> entityId2Name = null; | |||
public static HashMap<Integer, String> entityFragmentString = null; | |||
public static void load() throws IOException | |||
{ | |||
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16entity_id.txt"; | |||
String fragmentFileName = Globals.localPath+"data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"; | |||
File file = new File(filename); | |||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8")); | |||
entityName2Id = new HashMap<String, Integer>(); | |||
entityId2Name = new HashMap<Integer, String>(); | |||
long t1, t2, t3; | |||
t1 = System.currentTimeMillis(); | |||
// load entity id | |||
System.out.println("Loading entity id ..."); | |||
String line; | |||
while((line = br.readLine()) != null) | |||
{ | |||
String[] lines = line.split("\t"); | |||
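// lines[0] looks like "<Entity_Name>"; strip the surrounding angle brackets. | |||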
String entName = lines[0].substring(1, lines[0].length()-1); | |||
entityName2Id.put(entName, Integer.parseInt(lines[1])); | |||
entityId2Name.put(Integer.parseInt(lines[1]), entName); | |||
} | |||
br.close(); | |||
t2 = System.currentTimeMillis(); | |||
System.out.println("Load "+entityId2Name.size()+" entity ids in "+ (t2-t1) + "ms."); | |||
// load entity fragment | |||
System.out.println("Loading entity fragments ..."); | |||
br = new BufferedReader(new InputStreamReader(new FileInputStream(fragmentFileName),"utf-8")); | |||
entityFragmentString = new HashMap<Integer, String>(); | |||
while((line = br.readLine()) != null) | |||
{ | |||
String[] lines = line.split("\t"); | |||
if(lines.length != 2) | |||
continue; | |||
int eId = Integer.parseInt(lines[0]); | |||
entityFragmentString.put(eId, lines[1]); | |||
} | |||
t3 = System.currentTimeMillis(); | |||
System.out.println("Load "+entityFragmentString.size()+" entity fragments in "+ (t3-t2) + "ms."); | |||
br.close(); | |||
} | |||
} |
@@ -0,0 +1,31 @@ | |||
package lcn; | |||
public class EntityNameAndScore implements Comparable<EntityNameAndScore> { | |||
public int entityID; | |||
public String entityName; | |||
public double score; | |||
public EntityNameAndScore(int id, String n, double s) { | |||
entityID = id; | |||
entityName = n; | |||
score = s; | |||
} | |||
@Override | |||
public String toString() { | |||
return entityID + ":<" + entityName + ">\t" + score; | |||
} | |||
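// Order candidates by descending score, so the best match comes first. | |||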
public int compareTo(EntityNameAndScore o) { | |||
if(this.score < o.score) { | |||
return 1; | |||
} | |||
else if (this.score > o.score) { | |||
return -1; | |||
} | |||
else { | |||
return 0; | |||
} | |||
} | |||
} |
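// A minimal usage sketch (hypothetical ids, names and scores): | |||
// | |||
//   ArrayList<EntityNameAndScore> cands = new ArrayList<EntityNameAndScore>(); | |||
//   cands.add(new EntityNameAndScore(1, "Berlin", 3.2)); | |||
//   cands.add(new EntityNameAndScore(2, "Berlin_Wall", 5.7)); | |||
//   Collections.sort(cands);   // best-first: Berlin_Wall (5.7), then Berlin (3.2) | |||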
@@ -0,0 +1,58 @@ | |||
package lcn; | |||
//import java.io.IOException; | |||
//import java.util.ArrayList; | |||
import java.util.ArrayList; | |||
import java.util.Scanner; | |||
import fgmt.EntityFragment; | |||
import qa.Globals; | |||
import qa.mapping.EntityFragmentDict; | |||
public class Main { | |||
//Test: searching Entities and Types through Lucene Index. | |||
public static void main(String[] aStrings) throws Exception{ | |||
//SearchInLiteralSubset se = new SearchInLiteralSubset(); | |||
SearchInTypeShortName st = new SearchInTypeShortName(); | |||
SearchInEntityFragments sf = new SearchInEntityFragments(); | |||
EntityFragmentDict efd = new EntityFragmentDict(); | |||
EntityFragmentFields eff = null; | |||
Globals.localPath = "D:/husen/gAnswer/"; | |||
Scanner sc = new Scanner(System.in); | |||
System.out.print("input name: "); | |||
while(sc.hasNextLine()) | |||
{ | |||
String literal = sc.nextLine(); | |||
System.out.println(literal); | |||
//literal = cnlp.getBaseFormOfPattern(literal); | |||
//search Type | |||
ArrayList<String> result = st.searchType(literal, 0.4, 0.8, 10); | |||
System.out.println("TypeShortName-->RESULT:"); | |||
for (String s : result) { | |||
System.out.println("<"+s + ">"); | |||
} | |||
//search Ent Fragment | |||
// int eId = EntityFragmentFields.entityName2Id.get(literal); | |||
// EntityFragment ef = EntityFragment.getEntityFragmentByEntityId(eId); | |||
// System.out.println(ef); | |||
//search Ent Name | |||
// ArrayList<EntityNameAndScore> result = sf.searchName(literal, 0.4, 0.8, 50); | |||
// System.out.println("EntityName-->RESULT:"); | |||
// for(EntityNameAndScore enas: result) | |||
// { | |||
// System.out.println(enas); | |||
// } | |||
System.out.print("input name: "); | |||
} | |||
sc.close(); | |||
} | |||
} |
@@ -0,0 +1,84 @@ | |||
package lcn; | |||
import java.io.IOException; | |||
import java.util.ArrayList; | |||
import org.apache.lucene.analysis.Analyzer; | |||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||
import org.apache.lucene.queryParser.ParseException; | |||
import org.apache.lucene.queryParser.QueryParser; | |||
import org.apache.lucene.search.Hits; | |||
import org.apache.lucene.search.IndexSearcher; | |||
import org.apache.lucene.search.Query; | |||
import qa.Globals; | |||
public class SearchInEntityFragments { | |||
/* | |||
 * Search entity names in the Lucene index. | |||
 * Hits come back in descending score order; the first k hits are kept if | |||
 * they score at least thres1, later hits must reach thres2. Scanning stops | |||
 * at the first hit below the applicable threshold. | |||
 * */ | |||
public ArrayList<EntityNameAndScore> searchName(String literal, double thres1, double thres2, int k) throws IOException { | |||
Hits hits = null; | |||
String queryString = null; | |||
Query query = null; | |||
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/entity_fragment_index"); | |||
ArrayList<EntityNameAndScore> result = new ArrayList<EntityNameAndScore>(); | |||
queryString = literal; | |||
Analyzer analyzer = new StandardAnalyzer(); | |||
try | |||
{ | |||
QueryParser qp = new QueryParser("EntityName", analyzer); | |||
query = qp.parse(queryString); | |||
} catch (ParseException e) | |||
{ | |||
e.printStackTrace(); | |||
} | |||
if (searcher != null && query != null) // query is null if parsing failed above | |||
{ | |||
hits = searcher.search(query); | |||
//System.out.println("search for entity fragment hits.length=" + hits.length()); | |||
if (hits.length() > 0) | |||
{ | |||
//System.out.println("find " + hits.length() + " result!"); | |||
for (int i=0; i<hits.length(); i++) { | |||
//System.out.println(i+": <"+hits.doc(i).get("EntityName") +">;" | |||
// +hits.doc(i).get("EntityFragment") | |||
// + "; Score: " + hits.score(i) | |||
// + "; Score2: " + hits.score(i)*(literalLength/hits.doc(i).get("EntityName").length())); | |||
if(i<k) { | |||
if (hits.score(i) >= thres1) { | |||
String en = hits.doc(i).get("EntityName"); | |||
int id = Integer.parseInt(hits.doc(i).get("EntityId")); | |||
result.add(new EntityNameAndScore(id, en, hits.score(i))); | |||
} | |||
else { | |||
break; | |||
} | |||
} | |||
else { | |||
if (hits.score(i) >= thres2) { | |||
String en = hits.doc(i).get("EntityName"); | |||
int id = Integer.parseInt(hits.doc(i).get("EntityId")); | |||
result.add(new EntityNameAndScore(id, en, hits.score(i))); | |||
} | |||
else { | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
//Collections.sort(result); | |||
return result; | |||
} | |||
} |
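// A usage sketch mirroring the commented-out code in lcn.Main (thresholds | |||
// and k are the values used there; the literal is a hypothetical example): | |||
// | |||
//   SearchInEntityFragments sf = new SearchInEntityFragments(); | |||
//   ArrayList<EntityNameAndScore> cands = sf.searchName("julia roberts", 0.4, 0.8, 50); | |||
//   for (EntityNameAndScore enas : cands) | |||
//       System.out.println(enas); | |||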
@@ -0,0 +1,176 @@ | |||
package lcn; | |||
import java.util.ArrayList; | |||
import org.apache.lucene.analysis.Analyzer; | |||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||
import org.apache.lucene.queryParser.ParseException; | |||
import org.apache.lucene.queryParser.QueryParser; | |||
import org.apache.lucene.search.Hits; | |||
import org.apache.lucene.search.IndexSearcher; | |||
import org.apache.lucene.search.Query; | |||
import fgmt.TypeFragment; | |||
import qa.Globals; | |||
import rdf.TypeMapping; | |||
public class SearchInTypeShortName { | |||
// Return matched type ids with scores; the top-k hits must score >= thres1, later hits >= thres2. -- husen | |||
public ArrayList<TypeMapping> searchTypeScore(String s, double thres1, double thres2, int k) throws Exception | |||
{ | |||
Hits hits = null; | |||
String queryString = s; | |||
Query query = null; | |||
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index"); | |||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||
Analyzer analyzer = new StandardAnalyzer(); | |||
try { | |||
QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer); | |||
query = qp.parse(queryString); | |||
} catch (ParseException e) { | |||
e.printStackTrace(); | |||
} | |||
if (searcher != null && query != null) { // query is null if parsing failed above | |||
hits = searcher.search(query); | |||
//System.out.println("find " + hits.length() + " matched type."); | |||
if (hits.length() > 0) { | |||
for (int i=0; i<hits.length(); i++) { | |||
if (i < k) { | |||
//System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||
if(hits.score(i) >= thres1) | |||
{ | |||
//System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||
String type = hits.doc(i).get("TypeShortName"); | |||
System.out.println("Matched type: " + type + " : " + hits.score(i)); | |||
ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(type); | |||
if(ret_in!=null) | |||
{ | |||
for(Integer tid: ret_in) | |||
{ | |||
TypeMapping typeMapping = new TypeMapping(tid, hits.doc(i).get("TypeShortName"), hits.score(i)); | |||
tmList.add(typeMapping); | |||
} | |||
} | |||
} | |||
else { | |||
break; | |||
} | |||
} | |||
else { | |||
if(hits.score(i) >= thres2) | |||
{ | |||
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||
ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(s); | |||
if(ret_in!=null) | |||
{ | |||
for(Integer tid: ret_in) | |||
{ | |||
TypeMapping typeMapping = new TypeMapping(tid, hits.doc(i).get("TypeShortName"), hits.score(i)); | |||
tmList.add(typeMapping); | |||
} | |||
} | |||
} | |||
else { | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
return tmList; | |||
} | |||
public ArrayList<String> searchType(String s, double thres1, double thres2, int k) throws Exception | |||
{ | |||
Hits hits = null; | |||
String queryString = null; | |||
Query query = null; | |||
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index"); | |||
ArrayList<String> typeNames = new ArrayList<String>(); | |||
//String[] array = s.split(" "); | |||
//queryString = array[array.length-1]; | |||
queryString = s; | |||
Analyzer analyzer = new StandardAnalyzer(); | |||
try { | |||
QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer); | |||
query = qp.parse(queryString); | |||
} catch (ParseException e) { | |||
e.printStackTrace(); | |||
} | |||
if (searcher != null && query != null) { // query is null if parsing failed above | |||
hits = searcher.search(query); | |||
System.out.println("find " + hits.length() + " answars!"); | |||
if (hits.length() > 0) { | |||
for (int i=0; i<hits.length(); i++) { | |||
if (i < k) { | |||
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||
if(hits.score(i) >= thres1){ | |||
System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||
typeNames.add(hits.doc(i).get("TypeShortName")); | |||
//if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName")); | |||
} | |||
else { | |||
//break; | |||
} | |||
} | |||
else { | |||
if(hits.score(i) >= thres2){ | |||
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i)); | |||
typeNames.add(hits.doc(i).get("TypeShortName")); | |||
//if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName")); | |||
} | |||
else { | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
return typeNames; | |||
} | |||
private boolean satisfiedStrictly (String splittedTypeShortName, String queryString) | |||
{ | |||
String[] tnames = splittedTypeShortName.toLowerCase().split(" "); | |||
String[] qnames = queryString.toLowerCase().split(" "); | |||
for (int i = 0; i < tnames.length; i ++) { | |||
if (tnames[i].length() == 0) continue; | |||
boolean matched = false; | |||
for (int j = 0; j < qnames.length; j ++) { | |||
if (tnames[i].equals(qnames[j])) { | |||
matched = true; | |||
break; | |||
} | |||
} | |||
if (!matched && !Globals.stopWordsList.isStopWord(tnames[i])) { | |||
return false; | |||
} | |||
} | |||
String qlast = qnames[qnames.length-1]; | |||
boolean flag = false; | |||
for (int i = 0; i < tnames.length; i ++) { | |||
if (tnames[i].length() == 0) continue; | |||
if (tnames[i].equals(qlast)) { | |||
flag = true; | |||
break; | |||
} | |||
} | |||
return flag; | |||
} | |||
} |
@@ -0,0 +1,116 @@ | |||
package log; | |||
//import java.io.File; | |||
//import java.io.FileNotFoundException; | |||
//import java.io.FileOutputStream; | |||
//import java.io.OutputStreamWriter; | |||
//import java.io.UnsupportedEncodingException; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import javax.servlet.http.HttpServletRequest; | |||
//import qa.Globals; | |||
import qa.Matches; | |||
import qa.Query; | |||
import rdf.EntityMapping; | |||
import rdf.SemanticRelation; | |||
import rdf.Sparql; | |||
import rdf.MergedWord; | |||
import rdf.SemanticUnit; | |||
import qa.Answer; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Word; | |||
public class QueryLogger { | |||
public Sentence s = null; | |||
public String ipAdress = null; | |||
public Word target = null; | |||
public Sparql sparql = null; | |||
public Matches match = null; | |||
public ArrayList<Answer> answers = null; | |||
public boolean MODE_debug = false; | |||
public boolean MODE_log = true; | |||
public boolean MODE_fragment = true; | |||
public boolean isMaltParserUsed = true; // Notice: we use MaltParser as the default parser, unlike the older version. TODO: some coref rules need to be changed to fit MaltParser. | |||
public HashMap<String, Integer> timeTable = null; | |||
public ArrayList<MergedWord> mWordList = null; | |||
public ArrayList<SemanticUnit> semanticUnitList = null; | |||
public HashMap<Integer, SemanticRelation> semanticRelations = null; | |||
public HashMap<Integer, SemanticRelation> potentialSemanticRelations = null; | |||
public HashMap<Word, ArrayList<EntityMapping>> entityDictionary = null; | |||
public ArrayList<Sparql> rankedSparqls = null; | |||
public String NRlog = ""; | |||
public String SQGlog = ""; | |||
public int gStoreCallTimes = 0; | |||
public QueryLogger (Query query) | |||
{ | |||
timeTable = new HashMap<String, Integer>(); | |||
rankedSparqls = new ArrayList<Sparql>(); | |||
mWordList = query.mWordList; | |||
} | |||
public void reloadSentence(Sentence sentence) | |||
{ | |||
this.s = sentence; | |||
if(this.semanticUnitList != null) | |||
this.semanticUnitList.clear(); | |||
if(this.semanticRelations != null) | |||
this.semanticRelations.clear(); | |||
if(this.rankedSparqls != null) | |||
this.rankedSparqls.clear(); | |||
} | |||
// Source code: http://edu.21cn.com/java/g_189_755584-1.htm | |||
public static String getIpAddr(HttpServletRequest request) { | |||
String ip = request.getHeader("x-forwarded-for"); | |||
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { | |||
ip = request.getHeader("Proxy-Client-IP"); | |||
} | |||
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { | |||
ip = request.getHeader("WL-Proxy-Client-IP"); | |||
} | |||
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) { | |||
ip = request.getRemoteAddr(); | |||
} | |||
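// "x-forwarded-for" may hold a proxy chain like "client, proxy1, proxy2"; | |||
// keep only the first (client) address. | |||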
int idx; | |||
if((idx = ip.indexOf(',')) != -1) { | |||
ip = ip.substring(0, idx); | |||
} | |||
return ip; | |||
} | |||
public void reviseAnswers() | |||
{ | |||
System.out.println("Revise Answers:"); | |||
answers = new ArrayList<Answer>(); | |||
if (match == null || sparql == null || match.answers == null || sparql.questionFocus == null) | |||
return; | |||
HashSet<Answer> answerSet = new HashSet<Answer>(); | |||
String questionFocus = sparql.questionFocus; | |||
String sparqlString = sparql.toStringForGStore(); | |||
//System.out.println("mal="+match.answers.length); | |||
for (int i=0;i<match.answers.length;i++) | |||
{ | |||
Answer ans = new Answer(questionFocus, match.answers[i]); | |||
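// Drop trivial answers that already appear in the SPARQL string itself | |||
// (e.g., an entity the question mentioned); the HashSet removes duplicates. | |||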
if (!sparqlString.contains(ans.questionFocusValue)) | |||
answerSet.add(ans); | |||
} | |||
for (Answer ans : answerSet) | |||
answers.add(ans); | |||
Collections.sort(answers); | |||
} | |||
} |
@@ -0,0 +1,402 @@ | |||
package nlp.ds; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.List; | |||
import java.util.Stack; | |||
import nlp.tool.CoreNLP; | |||
import nlp.tool.MaltParser; | |||
import nlp.tool.StanfordParser; | |||
import org.maltparser.core.exception.MaltChainedException; | |||
import org.maltparser.core.syntaxgraph.DependencyStructure; | |||
import org.maltparser.core.syntaxgraph.node.DependencyNode; | |||
import edu.stanford.nlp.ling.IndexedWord; | |||
import edu.stanford.nlp.trees.GrammaticalStructure; | |||
import edu.stanford.nlp.trees.TypedDependency; | |||
import edu.stanford.nlp.trees.semgraph.SemanticGraph; | |||
public class DependencyTree { | |||
public DependencyTreeNode root = null; | |||
public ArrayList<DependencyTreeNode> nodesList = null; | |||
public SemanticGraph dependencies = null; // Method 1: CoreNLP (discarded) | |||
public GrammaticalStructure gs = null; // Method 2: Stanford Parser | |||
public DependencyStructure maltGraph = null; // Method 3: MaltParser | |||
public HashMap<String, ArrayList<DependencyTreeNode>> wordBaseFormIndex = null; | |||
public DependencyTree (Sentence sentence, CoreNLP coreNLPparser) { | |||
SemanticGraph dependencies = coreNLPparser.getBasicDependencies(sentence.plainText); | |||
this.dependencies = dependencies; | |||
Stack<IndexedWord> stack = new Stack<IndexedWord>(); | |||
IndexedWord iwRoot = dependencies.getFirstRoot(); | |||
HashMap<IndexedWord, DependencyTreeNode> map = new HashMap<IndexedWord, DependencyTreeNode>(); | |||
nodesList = new ArrayList<DependencyTreeNode>(); | |||
stack.push(iwRoot); | |||
root = this.setRoot(sentence.getWordByIndex(iwRoot.index())); | |||
map.put(iwRoot, root); | |||
while (!stack.empty()) | |||
{ | |||
IndexedWord curIWNode = stack.pop(); | |||
DependencyTreeNode curDTNode = map.get(curIWNode); | |||
for (IndexedWord iwChild : dependencies.getChildList(curIWNode)) { | |||
Word w = sentence.getWordByIndex(iwChild.index()); | |||
DependencyTreeNode newDTNode = this.insert( | |||
curDTNode, | |||
w, | |||
dependencies.reln(curIWNode, iwChild).getShortName()); | |||
map.put(iwChild, newDTNode); | |||
stack.push(iwChild); | |||
} | |||
curDTNode.sortChildrenList(); | |||
nodesList.add(curDTNode); | |||
} | |||
} | |||
public DependencyTree (Sentence sentence, StanfordParser stanfordParser) { | |||
this.gs = stanfordParser.getGrammaticalStructure(sentence.plainText); | |||
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>(); | |||
nodesList = new ArrayList<DependencyTreeNode>(); | |||
List<TypedDependency> tdl = gs.typedDependencies(false); | |||
// 1. generate all nodes. | |||
for (TypedDependency td : tdl) { | |||
// gov | |||
if (!map.containsKey(td.gov().index()) && !td.reln().getShortName().equals("root")) { | |||
Word w = sentence.getWordByIndex(td.gov().index()); | |||
DependencyTreeNode newNode = new DependencyTreeNode(w); | |||
map.put(td.gov().index(), newNode); | |||
nodesList.add(newNode); | |||
} | |||
// dep | |||
if (!map.containsKey(td.dep().index())) { | |||
Word w = sentence.getWordByIndex(td.dep().index()); | |||
DependencyTreeNode newNode = new DependencyTreeNode(w); | |||
map.put(td.dep().index(), newNode); | |||
nodesList.add(newNode); | |||
} | |||
} | |||
// 2. add edges. | |||
for (TypedDependency td : tdl) { | |||
if (td.reln().getShortName().equals("root")) { | |||
this.root = map.get(td.dep().index()); | |||
this.root.levelInTree = 0; | |||
this.root.dep_father2child = "root"; | |||
} | |||
else { | |||
DependencyTreeNode gov = map.get(td.gov().index()); | |||
DependencyTreeNode dep = map.get(td.dep().index()); | |||
dep.father = gov; | |||
gov.childrenList.add(dep); | |||
dep.dep_father2child = td.reln().getShortName(); | |||
} | |||
} | |||
// add levelInTree, sort childrenList & nodesList | |||
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>(); | |||
stack.push(this.root); | |||
while (!stack.empty()) { | |||
DependencyTreeNode dtn = stack.pop(); | |||
if (dtn.father != null) { | |||
dtn.levelInTree = dtn.father.levelInTree + 1; | |||
dtn.sortChildrenList(); | |||
} | |||
for (DependencyTreeNode chd : dtn.childrenList) { | |||
stack.push(chd); | |||
} | |||
} | |||
Collections.sort(nodesList, new DependencyTreeNodeComparator()); | |||
for (DependencyTreeNode dtn : nodesList) { | |||
dtn.linkNN(this); | |||
} | |||
} | |||
public DependencyTree (Sentence sentence, MaltParser maltParser)throws MaltChainedException { | |||
try { | |||
// the tokens are parsed in the following line | |||
DependencyStructure graph = maltParser.getDependencyStructure(sentence); | |||
this.maltGraph = graph; | |||
//System.out.println(graph); | |||
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>(); | |||
ArrayList<DependencyTreeNode> list = new ArrayList<DependencyTreeNode>(); | |||
Stack<DependencyNode> stack = new Stack<DependencyNode>(); | |||
DependencyNode nroot = graph.getDependencyRoot(); | |||
stack.add(nroot); | |||
// 1. generate all nodes. | |||
while (!stack.isEmpty()) { | |||
DependencyNode n = stack.pop(); | |||
DependencyNode sib = n.getRightmostDependent(); | |||
int key = n.getIndex(); | |||
//System.out.println("[current node][key="+key+"] "+n+" <"+n.getHeadEdge()+">"); | |||
boolean flag = true; | |||
while (sib != null) { | |||
flag = false; | |||
stack.push(sib); | |||
sib = sib.getLeftSibling(); | |||
} | |||
if (flag) { | |||
sib = n.getLeftmostDependent(); | |||
while (sib != null) { | |||
stack.push(sib); | |||
sib = sib.getRightSibling(); | |||
} | |||
} | |||
if (n.hasHead() && !map.containsKey(key)) { | |||
//String snode = n.toString(); | |||
String sedge = n.getHeadEdge().toString(); | |||
//System.out.println("[" + snode + "] <" + sedge + ">"); | |||
/*int position = 0; | |||
String wordOriginal = null; | |||
String wordBase; | |||
String postag = null;*/ | |||
String dep = null; | |||
int idx1, idx2; | |||
/*// position | |||
idx1 = snode.indexOf("ID:")+3; | |||
idx2 = snode.indexOf(' ', idx1); | |||
position = Integer.parseInt(snode.substring(idx1, idx2)); | |||
// word | |||
idx1 = snode.indexOf("FORM:", idx2)+5; | |||
idx2 = snode.indexOf(' ', idx1); | |||
wordOriginal = snode.substring(idx1, idx2); | |||
wordBase = Globals.coreNLP.getBaseFormOfPattern(wordOriginal.toLowerCase()); | |||
// postag | |||
idx1 = snode.indexOf("POSTAG:", idx2)+7; | |||
idx2 = snode.indexOf(' ', idx1); | |||
postag = snode.substring(idx1, idx2);*/ | |||
// dep | |||
idx1 = sedge.lastIndexOf(':')+1; | |||
idx2 = sedge.lastIndexOf(' '); | |||
dep = sedge.substring(idx1, idx2); | |||
if (dep.equals("null")) { | |||
dep = null; | |||
} | |||
else if (dep.equals("punct")) {// No consider about punctuation | |||
continue; | |||
} | |||
DependencyTreeNode newNode = new DependencyTreeNode(sentence.getWordByIndex(key)); | |||
newNode.dep_father2child = dep; | |||
map.put(key, newNode); | |||
list.add(newNode); | |||
} | |||
} | |||
// 2. add edges | |||
for (Integer k : map.keySet()) { | |||
DependencyNode n = graph.getDependencyNode(k); | |||
DependencyTreeNode dtn = map.get(k); | |||
if (dtn.dep_father2child == null) { | |||
this.setRoot(dtn); | |||
this.root.levelInTree = 0; | |||
this.root.dep_father2child = "root"; | |||
} | |||
else { | |||
DependencyTreeNode father = map.get(n.getHead().getIndex()); | |||
DependencyTreeNode child = map.get(n.getIndex()); | |||
child.father = father; | |||
father.childrenList.add(child); | |||
} | |||
} | |||
// Heuristic fix for a known mis-parse in questions like "In which movies ... was X starring?": reattach "star" under "film"/"movie" instead of "be". | |||
if(list.size() > 11) | |||
{ | |||
DependencyTreeNode dt1 = list.get(11), dt2 = list.get(5); | |||
if(dt1!=null && dt2!=null && dt1.word.baseForm.equals("star") && dt1.father.word.baseForm.equals("be")) | |||
{ | |||
if (dt2.word.baseForm.equals("film") || dt2.word.baseForm.equals("movie")) | |||
{ | |||
dt1.father.childrenList.remove(dt1); | |||
dt1.father = dt2; | |||
dt2.childrenList.add(dt1); | |||
} | |||
} | |||
} | |||
// add levelInTree, sort childrenList & nodesList | |||
for (DependencyTreeNode dtn : list) { | |||
if (dtn.father != null) { | |||
dtn.levelInTree = dtn.father.levelInTree + 1; | |||
dtn.sortChildrenList(); | |||
} | |||
} | |||
nodesList = list; | |||
Collections.sort(nodesList, new DependencyTreeNodeComparator()); | |||
for (DependencyTreeNode dtn : nodesList) { | |||
dtn.linkNN(this); | |||
} | |||
} catch (MaltChainedException e) { | |||
//e.printStackTrace(); | |||
//System.err.println("MaltParser exception: " + e.getMessage()); | |||
throw e; | |||
} | |||
} | |||
public DependencyTreeNode setRoot(Word w) { | |||
root = new DependencyTreeNode(w, "root", null); | |||
return root; | |||
} | |||
public DependencyTreeNode setRoot(DependencyTreeNode root) { | |||
this.root = root; | |||
return this.root; | |||
} | |||
public void buildWordBaseFormIndex () { | |||
wordBaseFormIndex = new HashMap<String, ArrayList<DependencyTreeNode>>(); | |||
for (DependencyTreeNode dtn: nodesList) { | |||
String w = dtn.word.baseForm; | |||
if (!wordBaseFormIndex.keySet().contains(w)) | |||
wordBaseFormIndex.put(w, new ArrayList<DependencyTreeNode>()); | |||
wordBaseFormIndex.get(w).add(dtn); | |||
} | |||
} | |||
public DependencyTreeNode insert(DependencyTreeNode father, Word w, String dep_father2child) { | |||
if (father == null || w == null) | |||
return null; | |||
DependencyTreeNode newNode = new DependencyTreeNode(w, dep_father2child, father); | |||
father.childrenList.add(newNode); | |||
return newNode; | |||
} | |||
public DependencyTreeNode getRoot() { | |||
return root; | |||
} | |||
public ArrayList<DependencyTreeNode> getNodesList(){ | |||
return nodesList; | |||
} | |||
public ArrayList<DependencyTreeNode> getShortestNodePathBetween(DependencyTreeNode n1, DependencyTreeNode n2) | |||
{ | |||
if(n1 == n2) { | |||
return new ArrayList<DependencyTreeNode>(); | |||
} | |||
ArrayList<DependencyTreeNode> path1 = getPath2Root(n1); | |||
ArrayList<DependencyTreeNode> path2 = getPath2Root(n2); | |||
int idx1 = path1.size()-1; | |||
int idx2 = path2.size()-1; | |||
DependencyTreeNode curNode1 = path1.get(idx1); | |||
DependencyTreeNode curNode2 = path2.get(idx2); | |||
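// Walk both root-paths top-down in lockstep until they diverge; the last | |||
// common node is the lowest common ancestor of n1 and n2. | |||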
while (curNode1 == curNode2) { | |||
idx1 --; | |||
idx2 --; | |||
if(idx1 < 0 || idx2 < 0) break; | |||
curNode1 = path1.get(idx1); | |||
curNode2 = path2.get(idx2); | |||
} | |||
ArrayList<DependencyTreeNode> shortestPath = new ArrayList<DependencyTreeNode>(); | |||
for (int i = 0; i <= idx1; i ++) { | |||
shortestPath.add(path1.get(i)); | |||
} | |||
for (int i = idx2+1; i >= 0; i --) { | |||
shortestPath.add(path2.get(i)); | |||
} | |||
System.out.println("Shortest Path between <" + n1 + "> and <" + n2 + ">:"); | |||
System.out.print("\t-"); | |||
for (DependencyTreeNode dtn : shortestPath) { | |||
System.out.print("<" + dtn + ">-"); | |||
} | |||
System.out.println(); | |||
return shortestPath; | |||
} | |||
public ArrayList<DependencyTreeNode> getPath2Root(DependencyTreeNode n1) { | |||
ArrayList<DependencyTreeNode> path = new ArrayList<DependencyTreeNode>(); | |||
DependencyTreeNode curNode = n1; | |||
path.add(curNode); | |||
while (curNode.father != null) { | |||
curNode = curNode.father; | |||
path.add(curNode); | |||
} | |||
return path; | |||
} | |||
public ArrayList<DependencyTreeNode> getTreeNodesListContainsWords(String words) { | |||
ArrayList<DependencyTreeNode> ret = new ArrayList<DependencyTreeNode>(); | |||
for (DependencyTreeNode dtn : nodesList) { | |||
if (dtn.word.originalForm.equalsIgnoreCase(words) | |||
|| dtn.word.baseForm.equalsIgnoreCase(words) | |||
|| words.contains(dtn.word.originalForm) | |||
|| words.contains(dtn.word.baseForm)) | |||
ret.add(dtn); | |||
} | |||
return ret; | |||
} | |||
public DependencyTreeNode getNodeByIndex (int posi) { | |||
for (DependencyTreeNode dt : nodesList) { | |||
if (dt.word.position == posi) { | |||
return dt; | |||
} | |||
} | |||
return null; | |||
} | |||
public DependencyTreeNode getFirstPositionNodeInList(ArrayList<DependencyTreeNode> list) { | |||
int firstPosi = Integer.MAX_VALUE; | |||
DependencyTreeNode firstNode = null; | |||
for (DependencyTreeNode dtn : list) { | |||
if (dtn.word.position < firstPosi) { | |||
firstPosi = dtn.word.position; | |||
firstNode = dtn; | |||
} | |||
} | |||
return firstNode; | |||
} | |||
@Override | |||
public String toString() { | |||
String ret = ""; | |||
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>(); | |||
stack.push(root); | |||
while(!stack.empty()) { | |||
DependencyTreeNode curNode = stack.pop(); | |||
for (int i = 0; i <= curNode.levelInTree; i ++) | |||
ret += " "; | |||
ret += "-> "; | |||
ret += curNode.word.baseForm; | |||
ret += "-"; | |||
ret += curNode.word.posTag; | |||
ret += " ("; | |||
ret += curNode.dep_father2child; | |||
ret += ")"; | |||
ret += "[" + curNode.word.position + "]\n"; | |||
for (DependencyTreeNode child : curNode.childrenList) { | |||
stack.push(child); | |||
} | |||
} | |||
return ret; | |||
} | |||
} |
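// A minimal sketch of building and printing a tree (mirrors nlp.tool.Main; | |||
// assumes Globals.init() has loaded the parsers; the question is hypothetical): | |||
// | |||
//   Sentence s = new Sentence("Who directed Batman?"); | |||
//   DependencyTree dt = new DependencyTree(s, Globals.stanfordParser); | |||
//   System.out.println(dt);   // indented nodes with POS tags and relations | |||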
@@ -0,0 +1,150 @@ | |||
package nlp.ds; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.Comparator; | |||
import java.util.Stack; | |||
public class DependencyTreeNode { | |||
public Word word = null; | |||
public String dep_father2child = null; | |||
public DependencyTreeNode father = null; | |||
public ArrayList<DependencyTreeNode> childrenList = null; | |||
public int levelInTree = -1; | |||
/** | |||
 * Constructor used when the father node is known. | |||
 * | |||
 * @param w | |||
 * @param dep_father2child | |||
 * @param father | |||
 */ | |||
public DependencyTreeNode(Word w, String dep_father2child, DependencyTreeNode father) | |||
{ | |||
word = w; | |||
this.dep_father2child = dep_father2child; | |||
this.father = father; | |||
this.childrenList = new ArrayList<DependencyTreeNode>(); | |||
if(father==null) levelInTree = 0; | |||
else levelInTree = father.levelInTree+1; | |||
} | |||
/** | |||
 * Constructor used when the father node is not yet known. | |||
 * | |||
 * @param w | |||
 */ | |||
public DependencyTreeNode(Word w) | |||
{ | |||
this.word = w; | |||
this.childrenList = new ArrayList<DependencyTreeNode>(); | |||
} | |||
public void sortChildrenList () { | |||
childrenList.trimToSize(); | |||
Collections.sort(childrenList, new DependencyTreeNodeComparator()); | |||
} | |||
@Override | |||
public String toString(){ | |||
return word.originalForm + "-" + word.posTag + "(" + dep_father2child + ")[" + word.position + "]"; | |||
} | |||
public static void sortArrayList(ArrayList<DependencyTreeNode> list) { | |||
Collections.sort(list, new DependencyTreeNodeComparator()); | |||
} | |||
public DependencyTreeNode containDependencyWithChildren (String dep) { | |||
for (DependencyTreeNode son : childrenList) { | |||
if (son.dep_father2child.equals(dep)) return son; | |||
} | |||
return null; | |||
} | |||
/** | |||
 * equal_or_startWith = true: the child's POS tag must equal posChild | |||
 * equal_or_startWith = false: the child's POS tag must start with posChild | |||
 * | |||
 * @param posChild | |||
 * @param equal_or_startWith | |||
 * @return | |||
 */ | |||
public DependencyTreeNode containPosInChildren (String posChild, boolean equal_or_startWith) { | |||
for (DependencyTreeNode son : childrenList) { | |||
if (equal_or_startWith) { | |||
if (son.word.posTag.equals(posChild)) return son; | |||
} | |||
else { | |||
if (son.word.posTag.startsWith(posChild)) return son; | |||
} | |||
} | |||
return null; | |||
} | |||
public DependencyTreeNode containWordBaseFormInChildren (String wordBaseFormChild) { | |||
for (DependencyTreeNode son : childrenList) { | |||
if (son.word.baseForm.equals(wordBaseFormChild)) return son; | |||
} | |||
return null; | |||
} | |||
public DependencyTreeNode getNNTopTreeNode (DependencyTree T) { | |||
if(this.father != null && (this.dep_father2child.equals("nn") || (this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep")))) { | |||
return this.father.getNNTopTreeNode(T); | |||
} | |||
else return this; | |||
} | |||
public Word linkNN(DependencyTree T) { | |||
// (Now useless) backtracking the NN connections. | |||
ArrayList<DependencyTreeNode> nn = new ArrayList<DependencyTreeNode>(); | |||
nn.add(this); | |||
if(this.father != null && (this.dep_father2child.equals("nn") | |||
|| (this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep") && this.father.word.posTag.startsWith("NN")))) { | |||
nn.add(this.father); | |||
for(DependencyTreeNode son : this.father.childrenList) { | |||
if (son != this && son.dep_father2child.equals("nn")) { | |||
nn.add(son); | |||
} | |||
} | |||
} | |||
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>(); | |||
stack.push(this); | |||
while (!stack.empty()) { | |||
DependencyTreeNode curNode = stack.pop(); | |||
for(DependencyTreeNode son : curNode.childrenList) { | |||
if (son.dep_father2child.equals("nn") | |||
|| (son.word.posTag.startsWith("NN") && son.dep_father2child.equals("dep") && son.father.word.posTag.startsWith("NN"))) { | |||
nn.add(son); | |||
stack.push(son); | |||
} | |||
} | |||
} | |||
DependencyTreeNode.sortArrayList(nn); | |||
int size = nn.size() - 1; | |||
for (int i = 0; i < size; i ++) { | |||
nn.get(i).word.nnNext = nn.get(i+1).word; | |||
nn.get(i+1).word.nnPrev = nn.get(i).word; | |||
} | |||
return this.word.getNnHead(); | |||
} | |||
} | |||
class DependencyTreeNodeComparator implements Comparator<DependencyTreeNode> { | |||
public int compare(DependencyTreeNode n1, DependencyTreeNode n2) { | |||
return n1.word.position - n2.word.position; | |||
} | |||
} |
@@ -0,0 +1,88 @@ | |||
package nlp.ds; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import qa.Globals; | |||
import qa.Query; | |||
import rdf.MergedWord; | |||
public class Sentence { | |||
public String plainText = null; | |||
public Word[] words = null; | |||
public HashMap<String, Word> map = null; | |||
public DependencyTree dependencyTreeStanford = null; | |||
public DependencyTree dependencyTreeMalt = null; | |||
public enum SentenceType {SpecialQuestion,GeneralQuestion,ImperativeSentence} | |||
public SentenceType sentenceType = SentenceType.SpecialQuestion; | |||
public Sentence (String s) | |||
{ | |||
plainText = s; | |||
words = Globals.coreNLP.getTaggedWords(plainText); | |||
map = new HashMap<String, Word>(); | |||
for (Word w : words) | |||
map.put(w.key, w); | |||
} | |||
public Sentence (Query query, String s) | |||
{ | |||
plainText = s; | |||
words = Globals.coreNLP.getTaggedWords(plainText); | |||
// inherit NodeRecognition's information | |||
for(Word word: words) | |||
{ | |||
for(MergedWord mWord: query.mWordList) | |||
{ | |||
if(word.originalForm.equals(mWord.name)) | |||
{ | |||
word.mayLiteral = mWord.mayLiteral; | |||
word.mayEnt = mWord.mayEnt; | |||
word.mayType = mWord.mayType; | |||
word.mayCategory = mWord.mayCategory; | |||
word.tmList = mWord.tmList; | |||
word.emList = mWord.emList; | |||
word.category = mWord.category; | |||
} | |||
} | |||
} | |||
map = new HashMap<String, Word>(); | |||
for (Word w : words) | |||
map.put(w.key, w); | |||
} | |||
public ArrayList<Word> getWordsByString (String w) { | |||
ArrayList<Word> ret = new ArrayList<Word>(); | |||
for (Word wo: words) { | |||
if (wo.originalForm.equals(w)) ret.add(wo); | |||
} | |||
return ret; | |||
} | |||
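// Note: idx is 1-based, matching Word.position (the first word has index 1). | |||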
public Word getWordByIndex (int idx) { | |||
return words[idx-1]; | |||
} | |||
public Word getWordByKey (String k) { | |||
return map.get(k); | |||
} | |||
public boolean hasModifier(Word w) | |||
{ | |||
for(Word word: words) | |||
if(word!=w && word.modifiedWord==w) | |||
return true; | |||
return false; | |||
} | |||
public void printNERResult () { | |||
for (Word word : words) { | |||
System.out.print(word + " "); | |||
System.out.println("ner=" + word.ner); | |||
} | |||
} | |||
} | |||
@@ -0,0 +1,126 @@ | |||
package nlp.ds; | |||
import java.util.ArrayList; | |||
import rdf.EntityMapping; | |||
import rdf.Triple; | |||
import rdf.TypeMapping; | |||
public class Word implements Comparable<Word> | |||
{ | |||
public boolean mayCategory = false; | |||
public boolean mayLiteral = false; | |||
public boolean mayEnt = false; | |||
public boolean mayType = false; | |||
public boolean mayExtendVariable = false; | |||
public String category = null; | |||
public ArrayList<EntityMapping> emList = null; | |||
public ArrayList<TypeMapping> tmList = null; | |||
public Triple embbededTriple = null; | |||
public String baseForm = null; | |||
public String originalForm = null; | |||
public String posTag = null; | |||
public int position = -1; // Notice the first word's position = 1 | |||
public String key = null; | |||
public boolean isCovered = false; | |||
public boolean isIgnored = false; | |||
//Notice: These variables are not used because we merge a phrase to a word if it is a node now. | |||
public String ner = null; // record NER result | |||
public Word nnNext = null; | |||
public Word nnPrev = null; | |||
public Word crr = null; // coreference resolution result | |||
public Word represent = null; // This word is represented by another word, e.g., in "which book is ...", "which" is represented by "book" | |||
public boolean omitNode = false; // This word cannot be a node | |||
public Word modifiedWord = null; // The word this word modifies (a word that is not a modifier points to itself) | |||
public Word (String base, String original, String pos, int posi) { | |||
baseForm = base; | |||
originalForm = original; | |||
posTag = pos; | |||
position = posi; | |||
key = originalForm + "[" + position + "]"; | |||
} | |||
@Override | |||
public String toString() { | |||
return key; | |||
} | |||
public int compareTo(Word another) { | |||
return this.position-another.position; | |||
} | |||
@Override | |||
public int hashCode() { | |||
return key.hashCode(); | |||
} | |||
@Override | |||
public boolean equals(Object o) { | |||
return (o instanceof Word) | |||
&& originalForm.equals(((Word)o).originalForm) | |||
&& position == ((Word)o).position; | |||
} | |||
// We now discard all NN information and return the word itself. | husen 2016 | |||
public Word getNnHead() { | |||
Word w = this; | |||
return w; | |||
// if(w.mayEnt || w.mayType) | |||
// return w; | |||
// | |||
// while (w.nnPrev != null) { | |||
// w = w.nnPrev; | |||
// } | |||
// return w; | |||
} | |||
public String getFullEntityName() { | |||
Word w = this.getNnHead(); | |||
return w.originalForm; | |||
// if(w.mayEnt || w.mayType) | |||
// return w.originalForm; | |||
// | |||
// StringBuilder sb = new StringBuilder(""); | |||
// while (w != null) { | |||
// sb.append(w.originalForm); | |||
// sb.append(' '); | |||
// w = w.nnNext; | |||
// } | |||
// sb.deleteCharAt(sb.length()-1); | |||
// return sb.toString(); | |||
} | |||
public String getBaseFormEntityName() { | |||
Word w = this.getNnHead(); | |||
if(w.mayEnt || w.mayType) | |||
return w.baseForm; | |||
StringBuilder sb = new StringBuilder(""); | |||
while (w != null) { | |||
sb.append(w.baseForm); | |||
sb.append(' '); | |||
w = w.nnNext; | |||
} | |||
sb.deleteCharAt(sb.length()-1); | |||
return sb.toString(); | |||
} | |||
public String isNER () { | |||
return this.getNnHead().ner; | |||
} | |||
public void setIsCovered () { | |||
Word w = this.getNnHead(); | |||
while (w != null) { | |||
w.isCovered = true; | |||
w = w.nnNext; | |||
} | |||
} | |||
} |
@@ -0,0 +1,202 @@ | |||
package nlp.tool; | |||
import java.util.List; | |||
import java.util.Properties; | |||
import nlp.ds.Word; | |||
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; | |||
import edu.stanford.nlp.ling.CoreLabel; | |||
import edu.stanford.nlp.pipeline.Annotation; | |||
import edu.stanford.nlp.pipeline.StanfordCoreNLP; | |||
import edu.stanford.nlp.trees.Tree; | |||
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; | |||
import edu.stanford.nlp.trees.semgraph.SemanticGraph; | |||
import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; | |||
import edu.stanford.nlp.util.CoreMap; | |||
public class CoreNLP { | |||
// CoreNLP can also recognize TIME and NUMBER (see SUTime) | |||
private StanfordCoreNLP pipeline_lemma; | |||
public CoreNLP () { | |||
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution | |||
/*Properties props_all = new Properties(); | |||
props_all.put("annotators", "tokenize, ssplit, pos, lemma, parse"); // full list: "tokenize, ssplit, pos, lemma, ner, parse, dcoref" | |||
pipeline_all = new StanfordCoreNLP(props_all);*/ | |||
Properties props_lemma = new Properties(); | |||
props_lemma.put("annotators", "tokenize, ssplit, pos, lemma"); | |||
pipeline_lemma = new StanfordCoreNLP(props_lemma); | |||
} | |||
// For more efficient usage, refer to "http://www.jarvana.com/jarvana/view/edu/stanford/nlp/stanford-corenlp/1.2.0/stanford-corenlp-1.2.0-javadoc.jar!/edu/stanford/nlp/process/Morphology.html" | |||
public String getBaseFormOfPattern (String text) { | |||
String ret = ""; | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(text); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
int count = 0; | |||
for(CoreMap sentence: sentences) { | |||
// traversing the words in the current sentence | |||
// a CoreLabel is a CoreMap with additional token-specific methods | |||
for (CoreLabel token: sentence.get(TokensAnnotation.class)) { | |||
// this is the base form (lemma) of the token | |||
String lemma = token.getString(LemmaAnnotation.class); | |||
ret += lemma; | |||
ret += " "; | |||
} | |||
count ++; | |||
if (count % 100 == 0) { | |||
System.out.println(count); | |||
} | |||
} | |||
return ret.substring(0, ret.length()-1); | |||
} | |||
public SemanticGraph getBasicDependencies (String s) { | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(s); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// this is the Stanford dependency graph of the current sentence | |||
SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class); | |||
return dependencies; | |||
} | |||
return null; | |||
} | |||
public Tree getParseTree (String text) { | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(text); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// this is the parse tree of the current sentence | |||
return sentence.get(TreeAnnotation.class); | |||
} | |||
return null; | |||
} | |||
/** | |||
* How to use: | |||
* for (CoreLabel token : sentence.get(TokensAnnotation.class)) { | |||
* // this is the text of the token | |||
* String word = token.get(TextAnnotation.class); | |||
* // this is the POS tag of the token | |||
* String pos = token.get(PartOfSpeechAnnotation.class); | |||
* } | |||
* @param s | |||
* @return | |||
*/ | |||
public CoreMap getPOS (String s) { | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(s); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// this is the sentence with POS Tags | |||
return sentence; | |||
} | |||
return null; | |||
} | |||
public Word[] getTaggedWords (String sentence) { | |||
CoreMap taggedSentence = getPOS(sentence); | |||
Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()]; | |||
int count = 0; | |||
for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) { | |||
// this is the text of the token | |||
String word = token.get(TextAnnotation.class); | |||
// this is the POS tag of the token | |||
String pos = token.get(PartOfSpeechAnnotation.class); | |||
//System.out.println(word+"["+pos+"]"); | |||
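// Note: getBaseFormOfPattern runs the annotation pipeline once per token; | |||
// acceptable for short questions, but costly for long documents. | |||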
ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1); | |||
count ++; | |||
} | |||
return ret; | |||
} | |||
/*public void demo () { | |||
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution | |||
Properties props = new Properties(); | |||
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); | |||
StanfordCoreNLP pipeline = new StanfordCoreNLP(props); | |||
// read some text in the text variable | |||
String text = ... // Add your text here! | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(text); | |||
// run all Annotators on this text | |||
pipeline.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// traversing the words in the current sentence | |||
// a CoreLabel is a CoreMap with additional token-specific methods | |||
for (CoreLabel token: sentence.get(TokensAnnotation.class)) { | |||
// this is the text of the token | |||
String word = token.get(TextAnnotation.class); | |||
// this is the POS tag of the token | |||
String pos = token.get(PartOfSpeechAnnotation.class); | |||
// this is the NER label of the token | |||
String ne = token.get(NamedEntityTagAnnotation.class); | |||
} | |||
// this is the parse tree of the current sentence | |||
Tree tree = sentence.get(TreeAnnotation.class); | |||
// this is the Stanford dependency graph of the current sentence | |||
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); | |||
} | |||
// This is the coreference link graph | |||
// Each chain stores a set of mentions that link to each other, | |||
// along with a method for getting the most representative mention | |||
// Both sentence and token offsets start at 1! | |||
Map<Integer, CorefChain> graph = | |||
document.get(CorefChainAnnotation.class); | |||
}*/ | |||
} |
@@ -0,0 +1,42 @@ | |||
package nlp.tool; | |||
import java.io.BufferedReader; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import nlp.ds.DependencyTree; | |||
import nlp.ds.Sentence; | |||
import qa.Globals; | |||
public class Main { | |||
public static void main (String[] args) { | |||
Globals.init(); | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
try { | |||
while (true) { | |||
System.out.println("Test maltparser."); | |||
System.out.print("Please input the NL question: "); | |||
String question = br.readLine(); | |||
if (question.length() <= 3) | |||
break; | |||
try { | |||
long t1 = System.currentTimeMillis(); | |||
Sentence s = new Sentence(question); | |||
DependencyTree dt = new DependencyTree(s, Globals.stanfordParser); | |||
System.out.println("====StanfordDependencies===="); | |||
System.out.println(dt); | |||
DependencyTree dt2 = new DependencyTree(s, Globals.maltParser); | |||
System.out.println("====MaltDependencies===="); | |||
System.out.println(dt2); | |||
long t2 = System.currentTimeMillis(); | |||
System.out.println("time=" + (t2-t1) + "ms"); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,70 @@ | |||
package nlp.tool; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Word; | |||
import org.maltparser.MaltParserService; | |||
import org.maltparser.core.exception.MaltChainedException; | |||
import org.maltparser.core.syntaxgraph.DependencyStructure; | |||
import qa.Globals; | |||
public class MaltParser { | |||
private MaltParserService service = null; | |||
public MaltParser() { | |||
try | |||
{ | |||
System.out.print("Loading MaltParser ..."); | |||
service = new MaltParserService(); | |||
// Initialize the parser model 'engmalt.linear-1.7', set the working directory, and write logs to 'parser.log' | |||
//service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w . -lfi parser.log"); | |||
service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w "+Globals.localPath+"lib/maltparser-1.9.1 -lfi parser.log"); | |||
firstParse(); | |||
System.out.println("ok!"); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
System.err.println("MaltParser exception: " + e.getMessage()); | |||
} | |||
} | |||
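// Warm-up parse on a fixed sentence so the model is fully initialized at | |||
// startup. Tokens use the first six CoNLL-X columns, tab-separated: | |||
// index, form, lemma, cpostag, postag, feats ('_' = unspecified). | |||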
private void firstParse() { | |||
String[] tokens = new String[12]; | |||
tokens[0] = "1\tIn\t_\tIN\tIN\t_"; | |||
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; | |||
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; | |||
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; | |||
tokens[4] = "5\tby\t_\tIN\tIN\t_"; | |||
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; | |||
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; | |||
tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; | |||
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; | |||
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; | |||
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; | |||
tokens[11] = "12\t?\t_\t.\t.\t_"; | |||
try { | |||
service.parse(tokens); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
public DependencyStructure getDependencyStructure (Sentence sentence) { | |||
try { | |||
return service.parse(getTaggedTokens(sentence)); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
} | |||
return null; | |||
} | |||
private String[] getTaggedTokens (Sentence sentence) { | |||
String[] ret = new String[sentence.words.length]; | |||
int count = 0; | |||
for (Word w : sentence.words) { | |||
ret[count] = w.position + "\t" + w.originalForm + "\t_\t" + w.posTag + "\t" + w.posTag + "\t_"; | |||
count ++; | |||
} | |||
return ret; | |||
} | |||
} |
@@ -0,0 +1,73 @@ | |||
package nlp.tool; | |||
import java.io.File; | |||
import java.net.URL; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Word; | |||
import org.maltparser.concurrent.ConcurrentMaltParserModel; | |||
import org.maltparser.concurrent.ConcurrentMaltParserService; | |||
import org.maltparser.concurrent.graph.ConcurrentDependencyGraph; | |||
import org.maltparser.core.exception.MaltChainedException; | |||
//import org.maltparser.core.syntaxgraph.DependencyStructure; | |||
public class MaltParserCon { | |||
private ConcurrentMaltParserModel model = null; | |||
public ConcurrentDependencyGraph outputGraph = null; | |||
public MaltParserCon(){ | |||
try{ | |||
System.out.println("Loading Maltparser...\n"); | |||
URL ModelURL = new File("output/engmalt.linear-1.7.mco").toURI().toURL(); | |||
model = ConcurrentMaltParserService.initializeParserModel(ModelURL); | |||
firstTest(); | |||
System.out.println("ok!\n"); | |||
}catch(Exception e){ | |||
e.printStackTrace(); | |||
System.err.println("MaltParser exception: " + e.getMessage()); | |||
} | |||
} | |||
private void firstTest(){ | |||
String[] tokens = new String[12]; | |||
tokens[0] = "1\tIn\t_\tIN\tIN\t_"; | |||
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; | |||
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; | |||
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; | |||
tokens[4] = "5\tby\t_\tIN\tIN\t_"; | |||
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; | |||
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; | |||
tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; | |||
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; | |||
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; | |||
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; | |||
tokens[11] = "12\t?\t_\t.\t.\t_"; | |||
try { | |||
outputGraph = model.parse(tokens); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
System.out.println(outputGraph); | |||
} | |||
public ConcurrentDependencyGraph getDependencyStructure (Sentence sentence) { | |||
try { | |||
return model.parse(getTaggedTokens(sentence)); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
} | |||
return null; | |||
} | |||
private String[] getTaggedTokens (Sentence sentence) { | |||
String[] ret = new String[sentence.words.length]; | |||
int count = 0; | |||
for (Word w : sentence.words) { | |||
ret[count] = w.position + "\t" + w.originalForm + "\t_\t" + w.posTag + "\t" + w.posTag + "\t_"; | |||
count ++; | |||
} | |||
return ret; | |||
} | |||
} |
@@ -0,0 +1,53 @@ | |||
package nlp.tool; | |||
import java.util.List; | |||
import qa.Globals; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Word; | |||
import edu.stanford.nlp.ie.AbstractSequenceClassifier; | |||
import edu.stanford.nlp.ie.crf.CRFClassifier; | |||
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation; | |||
import edu.stanford.nlp.ling.CoreLabel; | |||
public class NERecognizer { | |||
static String serializedClassifier; | |||
static AbstractSequenceClassifier<CoreLabel> classifier; | |||
//public static String localPath="E:\\Hanshuo\\gAnswer\\"; | |||
public NERecognizer() { | |||
serializedClassifier = Globals.localPath+"lib/stanford-ner-2012-11-11/classifiers/english.all.3class.distsim.crf.ser.gz"; | |||
classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier); | |||
} | |||
/*public NERecognizer(String basePath, boolean flag) { | |||
serializedClassifier = "WEB-INF\\lib\\stanford-ner-2012-11-11\\stanford-ner-2012-11-11\\classifiers\\english.all.3class.distsim.crf.ser.gz"; | |||
}*/ | |||
public void recognize(Sentence sentence) { | |||
List<CoreLabel> lcl = classifier.classify(sentence.plainText).get(0); | |||
for (CoreLabel cl : lcl) { | |||
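// PositionAnnotation is a 0-based token index, while Word.position starts | |||
// at 1, hence the +1 before the lookup. | |||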
int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1; | |||
Word w = sentence.getWordByIndex(position); | |||
String ner = cl.get(AnswerAnnotation.class); | |||
if (ner.equals("O")) w.ner = null; | |||
else w.ner = ner; | |||
} | |||
} | |||
public static void main(String[] args) { | |||
System.out.println("Test NER"); | |||
Globals.init(); | |||
Sentence s = new Sentence("I go to school at Stanford University, which is located in California.");//"Which states of Germany are governed by the Social Democratic Party?" | |||
Globals.nerRecognizer.recognize(s); | |||
for (Word word : s.words) { | |||
System.out.print(word + " "); | |||
System.out.println("ner=" + word.ner); | |||
} | |||
} | |||
} |
@@ -0,0 +1,51 @@ | |||
package nlp.tool; | |||
import java.io.StringReader; | |||
import java.util.List; | |||
import edu.stanford.nlp.ling.CoreLabel; | |||
import edu.stanford.nlp.objectbank.TokenizerFactory; | |||
import edu.stanford.nlp.parser.lexparser.LexicalizedParser; | |||
import edu.stanford.nlp.process.CoreLabelTokenFactory; | |||
import edu.stanford.nlp.process.PTBTokenizer; | |||
import edu.stanford.nlp.trees.GrammaticalStructure; | |||
import edu.stanford.nlp.trees.GrammaticalStructureFactory; | |||
import edu.stanford.nlp.trees.PennTreebankLanguagePack; | |||
import edu.stanford.nlp.trees.Tree; | |||
import edu.stanford.nlp.trees.TreebankLanguagePack; | |||
public class StanfordParser { | |||
private LexicalizedParser lp; | |||
private TokenizerFactory<CoreLabel> tokenizerFactory; | |||
private TreebankLanguagePack tlp; | |||
private GrammaticalStructureFactory gsf; | |||
public StanfordParser() { | |||
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); | |||
tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); | |||
tlp = new PennTreebankLanguagePack(); | |||
gsf = tlp.grammaticalStructureFactory(); | |||
} | |||
public GrammaticalStructure getGrammaticalStructure (String sentence) { | |||
List<CoreLabel> rawWords2 = | |||
tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize(); | |||
// Converts a Sentence/List/String into a Tree. | |||
// In all circumstances, the input will be treated as a single sentence to be parsed. | |||
Tree parse = lp.apply(rawWords2); | |||
return gsf.newGrammaticalStructure(parse); | |||
/*List<TypedDependency> tdl = gs.typedDependencies(false); | |||
for (TypedDependency td : tdl) { | |||
System.out.println(td.reln().getShortName()+"("+td.gov()+","+td.dep()+")"); | |||
System.out.println("gov="+td.gov() | |||
+"\tgov.index=" | |||
+td.gov().index() | |||
+"\tgov.value=" | |||
+td.gov().value() | |||
+"\tgov.pos=" | |||
+((TreeGraphNode)td.gov().parent()).value()); | |||
}*/ | |||
//System.out.println(tdl); | |||
} | |||
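/* Usage sketch (hypothetical caller; assumes Globals.stanfordParser has been | |||
 * initialized and TypedDependency is imported from edu.stanford.nlp.trees): | |||
 *   GrammaticalStructure gs = | |||
 *       Globals.stanfordParser.getGrammaticalStructure("Who created Wikipedia?"); | |||
 *   for (TypedDependency td : gs.typedDependencies(false)) | |||
 *       System.out.println(td.reln().getShortName() + "(" + td.gov() + "," + td.dep() + ")"); | |||
 */ | |||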
} |
@@ -0,0 +1,614 @@ | |||
package nlp.tool; | |||
import java.util.HashSet; | |||
import java.util.Arrays; | |||
public class StopWordsList { | |||
public static HashSet<String> sw_list = new HashSet<String>(); | |||
public StopWordsList() { | |||
initiate(); | |||
} | |||
public void initiate() { | |||
sw_list.addAll(Arrays.asList(sw_array)); | |||
// some commas | |||
/*sw_list.add("."); | |||
sw_list.add(","); | |||
sw_list.add(";"); | |||
sw_list.add("?"); | |||
sw_list.add("!"); | |||
sw_list.add(":"); | |||
sw_list.add("("); | |||
sw_list.add(")"); | |||
sw_list.add("-");*/ | |||
} | |||
/** | |||
* To judge whether a word is a stop-word | |||
* @param word_lowercase: the word, should be in lower-case | |||
* @return if the word is a stop-word, then true; otherwise, false. | |||
*/ | |||
public boolean isStopWord(String word_lowercase) { | |||
return sw_list.contains(word_lowercase); | |||
} | |||
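/* Usage sketch (assumes Globals.stopWordsList has been initialized, as in Globals.init()): | |||
 *   Globals.stopWordsList.isStopWord("the")  // true | |||
 *   Globals.stopWordsList.isStopWord("rome") // false, not in the list | |||
 * Callers are expected to lower-case the token first, as the javadoc above notes. | |||
 */ | |||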
// stop words, cf. http://www.textfixer.com/resources/common-english-words.txt | |||
private static final String sw_array[] = new String[]{ | |||
"a", | |||
"able", | |||
"about", | |||
"across", | |||
"after", | |||
"all", | |||
"almost", | |||
"also", | |||
"am", | |||
"among", | |||
"an", | |||
"and", | |||
"any", | |||
"are", | |||
"as", | |||
"at", | |||
//"be", | |||
"because", | |||
"been", | |||
"but", | |||
"by", | |||
"can", | |||
"cannot", | |||
"could", | |||
"dear", | |||
"did", | |||
"do", | |||
"does", | |||
"either", | |||
"else", | |||
"ever", | |||
"every", | |||
"for", | |||
"from", | |||
"get", | |||
"got", | |||
"had", | |||
"has", | |||
"have", | |||
"he", | |||
"her", | |||
"hers", | |||
"him", | |||
"his", | |||
"how", | |||
"however", | |||
"i", | |||
"if", | |||
"in", | |||
"into", | |||
"is", | |||
"it", | |||
"its", | |||
"just", | |||
"least", | |||
"let", | |||
"like", | |||
"likely", | |||
"may", | |||
"me", | |||
"might", | |||
"most", | |||
"must", | |||
"my", | |||
"neither", | |||
"no", | |||
"nor", | |||
"not", | |||
"of", | |||
"off", | |||
"often", | |||
"on", | |||
"only", | |||
"or", | |||
"other", | |||
"our", | |||
"own", | |||
"rather", | |||
"said", | |||
"say", | |||
"says", | |||
"she", | |||
"should", | |||
"since", | |||
"so", | |||
"some", | |||
"than", | |||
"that", | |||
"the", | |||
"their", | |||
"them", | |||
"then", | |||
"there", | |||
"these", | |||
"they", | |||
"this", | |||
"tis", | |||
"to", | |||
"too", | |||
"twas", | |||
"us", | |||
"wants", | |||
"was", | |||
"we", | |||
"were", | |||
"what", | |||
"when", | |||
"where", | |||
"which", | |||
"while", | |||
"who", | |||
"whom", | |||
"why", | |||
"will", | |||
"with", | |||
"would", | |||
"yet", | |||
"you", | |||
"your" | |||
}; | |||
} | |||
/*// stop word 308 | |||
// http://norm.al/2009/04/14/list-of-english-stop-words/ | |||
private static final String sw_array[] = new String[]{ | |||
"a", | |||
"about", | |||
"above", | |||
"across", | |||
"after", | |||
"afterwards", | |||
"again", | |||
"against", | |||
"all", | |||
"almost", | |||
"alone", | |||
"along", | |||
"already", | |||
"also", | |||
"although", | |||
"always", | |||
"am", | |||
"among", | |||
"amongst", | |||
"amoungst", | |||
"amount", | |||
"an", | |||
"and", | |||
"another", | |||
"any", | |||
"anyhow", | |||
"anyone", | |||
"anything", | |||
"anyway", | |||
"anywhere", | |||
"are", | |||
"around", | |||
"as", | |||
"at", | |||
"back", | |||
"be", | |||
"became", | |||
"because", | |||
"become", | |||
"becomes", | |||
"becoming", | |||
"been", | |||
"before", | |||
"beforehand", | |||
"behind", | |||
"being", | |||
"below", | |||
"beside", | |||
"besides", | |||
"between", | |||
"beyond", | |||
"bill", | |||
"both", | |||
"bottom", | |||
"but", | |||
"by", | |||
"call", | |||
"can", | |||
"cannot", | |||
"cant", | |||
"co", | |||
"computer", | |||
"con", | |||
"could", | |||
"couldnt", | |||
"cry", | |||
"de", | |||
"describe", | |||
"detail", | |||
"do", | |||
"did", | |||
"done", | |||
"down", | |||
"due", | |||
"during", | |||
"each", | |||
"eg", | |||
"eight", | |||
"either", | |||
"eleven", | |||
"else", | |||
"elsewhere", | |||
"empty", | |||
"enough", | |||
"etc", | |||
"even", | |||
"ever", | |||
"every", | |||
"everyone", | |||
"everything", | |||
"everywhere", | |||
"except", | |||
"few", | |||
"fifteen", | |||
"fify", | |||
"fill", | |||
"find", | |||
"fire", | |||
"first", | |||
"five", | |||
"for", | |||
"former", | |||
"formerly", | |||
"forty", | |||
"found", | |||
"four", | |||
"from", | |||
"front", | |||
"full", | |||
"further", | |||
"get", | |||
"give", | |||
"go", | |||
"had", | |||
"has", | |||
"hasnt", | |||
"have", | |||
"he", | |||
"hence", | |||
"her", | |||
"here", | |||
"here", | |||
"hereafter", | |||
"hereby", | |||
"herein", | |||
"hereupon", | |||
"hers", | |||
"herself", | |||
"him", | |||
"himself", | |||
"his", | |||
"how", | |||
"however", | |||
"hundred", | |||
"i", | |||
"ie", | |||
"if", | |||
"in", | |||
"inc", | |||
"indeed", | |||
"interest", | |||
"into", | |||
"is", | |||
"it", | |||
"its", | |||
"itself", | |||
"keep", | |||
"last", | |||
"latter", | |||
"latterly", | |||
"least", | |||
"less", | |||
"ltd", | |||
"made", | |||
"many", | |||
"may", | |||
"me", | |||
"meanwhile", | |||
"might", | |||
"mill", | |||
"mine", | |||
"more", | |||
"moreover", | |||
"most", | |||
"mostly", | |||
"move", | |||
"much", | |||
"must", | |||
"my", | |||
"myself", | |||
"name", | |||
"namely", | |||
"neither", | |||
"never", | |||
"nevertheless", | |||
"next", | |||
"nine", | |||
"no", | |||
"nobody", | |||
"none", | |||
"noone", | |||
"nor", | |||
"not", | |||
"nothing", | |||
"now", | |||
"nowhere", | |||
"of", | |||
"off", | |||
"often", | |||
"on", | |||
"once", | |||
"one", | |||
"only", | |||
"onto", | |||
"or", | |||
"other", | |||
"others", | |||
"otherwise", | |||
"our", | |||
"ours", | |||
"ourselves", | |||
"out", | |||
"over", | |||
"own", | |||
"part", | |||
"per", | |||
"perhaps", | |||
"please", | |||
"put", | |||
"rather", | |||
"re", | |||
"same", | |||
"see", | |||
"seem", | |||
"seemed", | |||
"seeming", | |||
"seems", | |||
"serious", | |||
"several", | |||
"she", | |||
"should", | |||
"show", | |||
"side", | |||
"since", | |||
"sincere", | |||
"six", | |||
"sixty", | |||
"so", | |||
"some", | |||
"somehow", | |||
"someone", | |||
"something", | |||
"sometime", | |||
"sometimes", | |||
"somewhere", | |||
"still", | |||
"such", | |||
"system", | |||
"take", | |||
"ten", | |||
"than", | |||
"that", | |||
"the", | |||
"their", | |||
"them", | |||
"themselves", | |||
"then", | |||
"thence", | |||
"there", | |||
"thereafter", | |||
"thereby", | |||
"therefore", | |||
"therein", | |||
"thereupon", | |||
"these", | |||
"they", | |||
"thick", | |||
"thin", | |||
"third", | |||
"this", | |||
"those", | |||
"though", | |||
"throughout", | |||
"thru", | |||
"thus", | |||
"to", | |||
"together", | |||
"too", | |||
"top", | |||
"toward", | |||
"towards", | |||
"twelve", | |||
"twenty", | |||
"two", | |||
"un", | |||
"under", | |||
"until", | |||
"up", | |||
"upon", | |||
"us", | |||
"very", | |||
"via", | |||
"was", | |||
"we", | |||
"we", | |||
"well", | |||
"were", | |||
"what", | |||
"whatever", | |||
"when", | |||
"whence", | |||
"whenever", | |||
"where", | |||
"whereafter", | |||
"whereas", | |||
"whereby", | |||
"wherein", | |||
"whereupon", | |||
"wherever", | |||
"whether", | |||
"which", | |||
"while", | |||
"whither", | |||
"who", | |||
"whoever", | |||
"whole", | |||
"whom", | |||
"whose", | |||
"why", | |||
"will", | |||
"with", | |||
"within", | |||
"without", | |||
"would", | |||
"yet", | |||
"you", | |||
"your", | |||
"yours", | |||
"yourself", | |||
"yourselves" | |||
}; | |||
*/ | |||
@@ -0,0 +1,441 @@ | |||
package paradict; | |||
import java.io.BufferedReader; | |||
import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import java.util.Iterator; | |||
import nlp.tool.CoreNLP; | |||
import qa.Globals; | |||
public class ParaphraseDictionary { | |||
public static String localDataPath; | |||
public static String dbpedia_relation_paraphrases_baseform_withScore; | |||
public static String dbpedia_relation_paraphrases_baseform_withScore_rerank; | |||
public static String dbpedia_relation_paraphrases_handwrite; | |||
public static String dbpedia_predicate_id; | |||
public static String dbpedia_dbo_predicate; | |||
public HashMap<String, Integer> predicate_2_id = null; | |||
public HashMap<Integer, String> id_2_predicate = null; | |||
public HashSet<Integer> dbo_predicate_id = null; | |||
public HashMap<String, ArrayList<PredicateIDAndSupport>> nlPattern_2_predicateList = null; | |||
public HashMap<String, ArrayList<String>> invertedIndex = null; | |||
public HashSet<String> relns_subject; | |||
public HashSet<String> relns_object; | |||
public HashSet<String> prepositions; | |||
public HashSet<String> bannedTypes; | |||
//public final int typePredicateID = 1541; //dbpedia2015 <type>=1541 | |||
public final int typePredicateID = 5157; //DBpedia 2016 <type> | |||
public int totalPredCount = 0; | |||
public int paraphrasedPredCount = 0; | |||
public int lineCount = 0; | |||
/** | |||
 * Constructor: loads the predicate-id mapping, the DBO predicate set and the | |||
 * paraphrase dictionary, then builds the inverted index. | |||
 */ | |||
public ParaphraseDictionary () { | |||
String fixedPath = Globals.localPath; | |||
System.out.println(System.getProperty("user.dir")); | |||
localDataPath = fixedPath + "data/DBpedia2016/parapharse/"; | |||
dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt"; | |||
dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt"; | |||
dbpedia_predicate_id = localDataPath + "16predicate_id.txt"; | |||
dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt"; | |||
bannedTypes = new HashSet<String>(); | |||
bannedTypes.add("Mayor"); | |||
relns_subject = new HashSet<String>(); | |||
relns_subject.add("subj"); | |||
relns_subject.add("csubjpass"); | |||
relns_subject.add("csubj"); | |||
relns_subject.add("xsubj"); | |||
relns_subject.add("nsubjpass"); | |||
relns_subject.add("nsubj"); | |||
relns_subject.add("poss"); // Obama's wife | |||
relns_subject.add("dobj"); | |||
relns_object = new HashSet<String>(); | |||
relns_object.add("dobj"); | |||
relns_object.add("iobj"); | |||
relns_object.add("obj"); | |||
relns_object.add("pobj"); | |||
prepositions = new HashSet<String>(); | |||
prepositions.add("in");//in at on with to from before after of for | |||
prepositions.add("at"); | |||
prepositions.add("on"); | |||
prepositions.add("with"); | |||
prepositions.add("to"); | |||
prepositions.add("from"); | |||
prepositions.add("before"); | |||
prepositions.add("after"); | |||
prepositions.add("of"); | |||
prepositions.add("for"); | |||
prepositions.add("as"); | |||
try { | |||
loadPredicateId(); | |||
loadDboPredicate(); | |||
loadParaDict(); | |||
buildInvertedIndex(); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
/** | |||
* Load the mapping between predicates and their IDs. | |||
* @throws IOException | |||
*/ | |||
public void loadPredicateId () throws IOException { | |||
predicate_2_id = new HashMap<String, Integer>(); | |||
id_2_predicate = new HashMap<Integer, String>(); | |||
String input_filename = dbpedia_predicate_id; | |||
File file = new File(input_filename); | |||
InputStreamReader in = null; | |||
BufferedReader br = null; | |||
try{ | |||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||
br = new BufferedReader(in); | |||
String line = null; | |||
while ((line = br.readLine())!= null) { | |||
String[] lines = line.split("\t"); | |||
predicate_2_id.put(lines[0], Integer.parseInt(lines[1])); | |||
id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]); | |||
} | |||
}catch(IOException e){ | |||
System.out.println("NLPatterns.loadPredicateId() : IOException!"); | |||
e.printStackTrace(); | |||
}finally{ | |||
if(br != null){ | |||
try{ | |||
br.close(); | |||
}catch(IOException e){ | |||
e.printStackTrace(); | |||
} | |||
} | |||
} | |||
System.out.println("NLPatterns.loadPredicateId() : ok!"); | |||
} | |||
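// Line format of 16predicate_id.txt, inferred from the parsing above: | |||
// "<predicate>\t<id>", e.g. "birthPlace\t42" (the id value here is illustrative). | |||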
public void loadDboPredicate() throws IOException | |||
{ | |||
dbo_predicate_id = new HashSet<Integer>(); | |||
int cnt = 0; | |||
String input_filename = dbpedia_dbo_predicate; | |||
InputStreamReader in = null; | |||
BufferedReader br = null; | |||
try{ | |||
File file = new File(input_filename); | |||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||
br = new BufferedReader(in); | |||
String line = null; | |||
while ((line = br.readLine())!= null) | |||
{ | |||
if (!predicate_2_id.containsKey(line)) | |||
{ | |||
cnt++; | |||
//System.out.println("error: not found "+line+" id."); | |||
continue; | |||
} | |||
dbo_predicate_id.add(predicate_2_id.get(line)); | |||
} | |||
}catch(IOException e){ | |||
System.out.println("NLPatterns.loadDboPredicate() : IOException!"); | |||
}finally{ | |||
if(br!=null){ | |||
try{ | |||
br.close(); | |||
}catch(IOException e){ | |||
e.printStackTrace(); | |||
} | |||
} | |||
} | |||
System.out.println("Warning: DBO not found id count: "+cnt); | |||
System.out.println("NLPatterns.loadDboPredicate() : ok!"); | |||
} | |||
/** | |||
* Get predicate by its id | |||
* @param predicateID | |||
* @return | |||
*/ | |||
public String getPredicateById (int predicateID) { | |||
return id_2_predicate.get(predicateID); | |||
} | |||
public void loadParaDict () throws Exception { | |||
nlPattern_2_predicateList = new HashMap<String, ArrayList<PredicateIDAndSupport>>(); | |||
HashSet<String> missInDBP2014 = new HashSet<String>(); | |||
InputStreamReader in = null; | |||
BufferedReader br = null; | |||
try{ | |||
String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank; | |||
File file = new File(inputFileName); | |||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||
br = new BufferedReader(in); | |||
String line = null; | |||
lineCount = 0; // the class field, so the count stays visible after loading | |||
//line = br.readLine();//read the first line which indicates the format | |||
while ((line = br.readLine()) != null) | |||
{ | |||
if (line.startsWith("#")) continue; | |||
lineCount ++; | |||
String[] content = line.split("\t"); | |||
if(!predicate_2_id.containsKey(content[0])) | |||
{ | |||
missInDBP2014.add(content[0]); | |||
continue; | |||
} | |||
int predicateID = predicate_2_id.get(content[0]); | |||
String nlPattern = content[1].toLowerCase(); | |||
int support = Integer.parseInt(content[2]); | |||
//double score = Double.parseDouble(content[3]); | |||
String []slctString = content[3].split(" "); | |||
double[] slct = new double[slctString.length]; | |||
for (int i=0; i < slct.length; i++) { | |||
slct[i] = Double.parseDouble(slctString[i]); | |||
} | |||
if (!nlPattern_2_predicateList.containsKey(nlPattern)) { | |||
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>()); | |||
} | |||
nlPattern_2_predicateList.get(nlPattern).add(new PredicateIDAndSupport(predicateID, support, slct)); | |||
} | |||
System.out.println("Number of NL-Patterns-to-predicate mappings = " + lineCount); | |||
System.out.println("NLPatterns.size = " + nlPattern_2_predicateList.size()); | |||
System.out.println("Predicate.size = " + predicate_2_id.size()); | |||
System.out.println("Warning: Predicates not in DBpedia 2014 count: "+missInDBP2014.size()); | |||
// Notice predicate itself and handwritten patterns have no wordSelectivity. | |||
addPredicateAsNLPattern(); // This is very important. | |||
addHandwriteAsNLPattern(); | |||
Iterator<String> it = nlPattern_2_predicateList.keySet().iterator(); | |||
while (it.hasNext()) { | |||
Collections.sort(nlPattern_2_predicateList.get(it.next())); | |||
} | |||
}catch(IOException e){ | |||
System.out.println("NLPatterns.Paradict() : IOException!"); | |||
}finally{ | |||
if(br!=null){ | |||
try{ | |||
br.close(); | |||
}catch(IOException e){ | |||
e.printStackTrace(); | |||
} | |||
} | |||
} | |||
System.out.println("NLPatterns.Paradict() : ok!"); | |||
} | |||
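// Line format of the rerank paraphrase file, inferred from the parsing above: | |||
// "<predicate>\t<nl pattern>\t<support>\t<slct_1 slct_2 ...>", | |||
// e.g. "spouse\tbe marry to\t1200\t0.9 0.4" (values illustrative); | |||
// lines starting with "#" are treated as comments and skipped. | |||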
/** | |||
* A set of very important NL patterns are the predicates themselves! | |||
*/ | |||
public void addPredicateAsNLPattern () { | |||
final int support = 200; | |||
int predicate_id; | |||
for (String p : predicate_2_id.keySet()) | |||
{ | |||
// TODO: omit some bad relations here; they should be discarded upstream in the future | |||
if(p.equals("state") || p.equals("states")) | |||
continue; | |||
predicate_id = predicate_2_id.get(p); | |||
StringBuilder pattern = new StringBuilder(""); | |||
// Some predicates have a prefix (DBpedia 2015), e.g. "Work/runtime" (11) and "SpaceStation/volume" (68); discard the prefix when generating the pattern | |||
if(p.contains("/")) | |||
{ | |||
if(p.charAt(0)>='A' && p.charAt(0)<='Z') | |||
p = p.substring(p.indexOf("/")+1); | |||
//gameW/l 1974 | |||
else | |||
p = p.replace("/", ""); | |||
} | |||
int last = 0, i = 0; | |||
for(i = 0; i < p.length(); i ++) { | |||
// split before every character that is not a lowercase letter. | |||
if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) { | |||
pattern.append(p.substring(last, i).toLowerCase()); | |||
pattern.append(" "); | |||
last = i; | |||
} | |||
} | |||
pattern.append(p.substring(last, i).toLowerCase()); | |||
for (i = 3; i < pattern.length(); i ++) { | |||
// the blank between two digits should be deleted. | |||
if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9' | |||
&& pattern.charAt(i-1)==' ' | |||
&& pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') { | |||
pattern.deleteCharAt(i-1); | |||
} | |||
// the blank between I and D should be deleted. | |||
else if (pattern.charAt(i)=='d' | |||
&& pattern.charAt(i-1)==' ' | |||
&& pattern.charAt(i-2)=='i' | |||
&& pattern.charAt(i-3)==' ') { | |||
pattern.deleteCharAt(i-1); | |||
} | |||
// the blank between D and B should be deleted. | |||
else if (pattern.charAt(i)=='b' | |||
&& pattern.charAt(i-1)==' ' | |||
&& pattern.charAt(i-2)=='d' | |||
&& pattern.charAt(i-3)==' ') { | |||
pattern.deleteCharAt(i-1); | |||
} | |||
} | |||
// pattern -> base form | |||
/*String[] ptns = pattern.toString().split(" "); | |||
pattern = new StringBuilder(""); | |||
for (String s : ptns) { | |||
pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s)); | |||
pattern.append(" "); | |||
} | |||
pattern.deleteCharAt(pattern.length()-1); | |||
String patternString = pattern.toString();*/ | |||
// Special cases cannot use the base form, e.g., foundingYear. //TODO: maybe Porter's algorithm | |||
String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString()); | |||
//System.out.println(p + "-->" + patternString); | |||
if (!nlPattern_2_predicateList.containsKey(patternString)) { | |||
nlPattern_2_predicateList.put(patternString, new ArrayList<PredicateIDAndSupport>()); | |||
} | |||
nlPattern_2_predicateList.get(patternString).add( | |||
new PredicateIDAndSupport(predicate_id, | |||
support, | |||
PredicateIDAndSupport.genSlct(patternString.split(" ").length))); | |||
} | |||
System.out.println("NLPatterns.addPredicateAsNLPattern(): ok!"); | |||
} | |||
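// Worked example (predicate names illustrative): "birthPlace" is split before | |||
// the upper-case letter into "birth place", lemmatized by getBaseFormOfPattern, | |||
// and registered as an NL pattern mapping back to birthPlace's id with the | |||
// fixed support of 200. The repairs above merge split digit runs back together | |||
// and collapse " i d" to " id", so "areaCodeID" would become "area code i d" | |||
// and then be repaired to "area code id". | |||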
public void addHandwriteAsNLPattern() throws IOException { | |||
String inputFileName = dbpedia_relation_paraphrases_handwrite; | |||
InputStreamReader in = null; | |||
BufferedReader br = null; | |||
try{ | |||
File file = new File(inputFileName); | |||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||
br = new BufferedReader(in); | |||
String line = null; | |||
//int lineCount = 0; | |||
//line = br.readLine();//read the first line which indicates the format | |||
while ((line = br.readLine()) != null) { | |||
if (line.startsWith("#") || line.isEmpty()) continue; | |||
//lineCount ++; | |||
String[] content = line.split("\t"); | |||
if(!predicate_2_id.containsKey(content[0])) | |||
continue; | |||
int predicateID = predicate_2_id.get(content[0]); | |||
String nlPattern = content[1].toLowerCase(); | |||
int support = Integer.parseInt(content[2]); | |||
if (!nlPattern_2_predicateList.containsKey(nlPattern)) { | |||
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>()); | |||
} | |||
nlPattern_2_predicateList.get(nlPattern).add( | |||
new PredicateIDAndSupport(predicateID, | |||
support, | |||
PredicateIDAndSupport.genSlct(nlPattern.split(" ").length))); | |||
} | |||
}catch(IOException e){ | |||
System.out.println("NLPatterns.addHandwriteAsNLPattern(): IOException!"); | |||
}finally{ | |||
if(br!=null){ | |||
try{ | |||
br.close(); | |||
}catch(IOException e){ | |||
e.printStackTrace(); | |||
} | |||
} | |||
} | |||
System.out.println("NLPatterns.addHandwriteAsNLPattern(): ok!"); | |||
} | |||
/** | |||
* Show the NLPatterns | |||
*/ | |||
public void showNLPatterns () { | |||
/*for (String s: syntacticMarker) { | |||
System.out.println(s); | |||
} | |||
GlobalTools.systemPause();*/ | |||
System.out.println("predicate-->id"); | |||
for (String s : predicate_2_id.keySet()) { | |||
System.out.println(s + "-->" + predicate_2_id.get(s)); | |||
} | |||
Globals.systemPause(); | |||
int count = 1; | |||
System.out.println("nlPattern-->predicate<support>"); | |||
for (String p : nlPattern_2_predicateList.keySet()) { | |||
System.out.print("" + (count++) + ".\t" + p + "\t[" + nlPattern_2_predicateList.get(p).size() + "]\t"); | |||
for (PredicateIDAndSupport i : nlPattern_2_predicateList.get(p)) { | |||
System.out.print(id_2_predicate.get(i.predicateID) + "<" + i.support + ">" + ", "); | |||
} | |||
System.out.println(); | |||
} | |||
} | |||
/** | |||
* Build the inverted index, where each word will be mapped to the patterns that it occurs | |||
*/ | |||
public void buildInvertedIndex () { | |||
invertedIndex = new HashMap<String, ArrayList<String>>(); | |||
// traversing all patterns | |||
for (String p : nlPattern_2_predicateList.keySet()) { | |||
String[] tokens = p.split(" "); | |||
for (String token : tokens) { | |||
if (token.length() < 1) continue; | |||
if (!invertedIndex.containsKey(token)) { | |||
invertedIndex.put(token, new ArrayList<String>()); | |||
} | |||
invertedIndex.get(token).add(p); | |||
} | |||
} | |||
System.out.println("NLPatterns.buildInvertedIndex(): ok!"); | |||
} | |||
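// Illustrative example: if the dictionary contains the patterns "birth place" | |||
// and "take place" (pattern strings assumed), then after this method | |||
// invertedIndex.get("place") holds both patterns, so a single question token | |||
// retrieves every candidate pattern containing it. | |||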
public static void main (String[] args) { | |||
Globals.coreNLP = new CoreNLP(); | |||
Globals.pd = new ParaphraseDictionary(); | |||
//Globals.pd.showNLPatterns(); | |||
} | |||
} |
@@ -0,0 +1,24 @@ | |||
package paradict; | |||
public class PredicateIDAndSupport implements Comparable<PredicateIDAndSupport> { | |||
public int predicateID; | |||
public int support; | |||
public double[] wordSelectivity = null; // wordSelectivity makes the ranking of PATTY patterns more accurate. | |||
public PredicateIDAndSupport(int _pid, int _support, double[] _slct) { | |||
predicateID = _pid; | |||
support = _support; | |||
wordSelectivity = _slct; | |||
} | |||
public int compareTo(PredicateIDAndSupport o) { | |||
return Integer.compare(o.support, this.support); // descending order by support (overflow-safe) | |||
} | |||
// Only used for the predicate itself and handwritten paraphrases. | |||
public static double[] genSlct(int size) { | |||
double[] ret = new double[size]; | |||
for (int i=0;i<size;i++) ret[i] = 1.0; | |||
return ret; | |||
} | |||
} |
@@ -0,0 +1,105 @@ | |||
package qa; | |||
import java.util.ArrayList; | |||
public class Answer implements Comparable<Answer>{ | |||
public String questionFocusKey=null; | |||
public String questionFocusValue=null; | |||
public ArrayList<String> otherInformationKey = null; | |||
public ArrayList<String> otherInformationValue = null; | |||
public Answer(String qf, String[] ans) { | |||
otherInformationKey = new ArrayList<String>(); | |||
otherInformationValue = new ArrayList<String>(); | |||
for (String line : ans) { | |||
System.out.println("line=" + line); | |||
if (line.startsWith(qf)) { | |||
questionFocusKey = qf; | |||
questionFocusValue = extractValue(line); | |||
} | |||
else { | |||
String key = line.substring(0, line.indexOf(':')); | |||
otherInformationKey.add(key); | |||
otherInformationValue.add(extractValue(line)); | |||
} | |||
} | |||
// Solve BUG: GStore returns garbled characters in questionFocusKey | |||
if (questionFocusKey == null || questionFocusValue == null) { | |||
questionFocusKey = qf; | |||
questionFocusValue = extractValue(ans[0]); | |||
otherInformationKey.clear(); | |||
otherInformationValue.clear(); | |||
} | |||
/*System.out.println("otherInformationKey.size=" + otherInformationKey.size()); | |||
for (String k : otherInformationKey) { | |||
System.out.println("otherInfoKey = " + k); | |||
}*/ | |||
} | |||
/** | |||
 * Extract the value part of an answer line: prefer the content between '<' and '>', | |||
 * then a quoted literal, otherwise everything after the first ':'. | |||
 */ | |||
private static String extractValue(String line) { | |||
int p1 = line.indexOf('<'), p2 = line.lastIndexOf('>'); | |||
if (p1 != -1 && p2 != -1) | |||
return line.substring(p1 + 1, p2); | |||
p1 = line.indexOf('\"'); | |||
p2 = line.lastIndexOf('\"'); | |||
if (p1 != -1 && p2 != -1 && p1 != p2) | |||
return line.substring(p1 + 1, p2); | |||
return line.substring(line.indexOf(':') + 1); | |||
} | |||
public int compareTo (Answer p) | |||
{ | |||
return questionFocusValue.compareTo(p.questionFocusValue); | |||
} | |||
} |
@@ -0,0 +1,376 @@ | |||
package qa; | |||
import java.io.*; | |||
import java.net.Socket; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashSet; | |||
import java.util.List; | |||
import jgsc.GstoreConnector; | |||
import log.QueryLogger; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Sentence.SentenceType; | |||
import qa.parsing.QuestionParsing; | |||
import qa.parsing.BuildQueryGraph; | |||
import rdf.Sparql; | |||
import utils.FileUtil; | |||
import addition.AddtionalFix; | |||
import qa.Globals; | |||
public class GAnswer { | |||
public static final int MAX_SPQ_NUM = 3; | |||
public static void init() { | |||
System.out.println("gAnswer2 init ..."); | |||
Globals.init(); | |||
System.out.println("gAnswer2 init ... ok!"); | |||
} | |||
public QueryLogger getSparqlList(String input) | |||
{ | |||
QueryLogger qlog = null; | |||
try | |||
{ | |||
if (input.length() <= 5) | |||
return null; | |||
System.out.println("[Input:] "+input); | |||
// step 0: Node (entity & type & literal) Recognition | |||
long t0 = System.currentTimeMillis(), t, NRtime; | |||
Query query = new Query(input); | |||
qlog = new QueryLogger(query); | |||
ArrayList<Sparql> rankedSparqls = new ArrayList<Sparql>(); | |||
NRtime = (int)(System.currentTimeMillis()-t0); | |||
System.out.println("step0 [Node Recognition] : "+ NRtime +"ms"); | |||
// Try to solve each NR plan, and combine the ranked SPARQLs. | |||
// We only keep the LOG of the BEST NR plan, for convenience. | |||
for(int i=query.sList.size()-1; i>=0; i--) | |||
{ | |||
Sentence possibleSentence = query.sList.get(i); | |||
qlog.reloadSentence(possibleSentence); | |||
// qlog.isMaltParserUsed = true; | |||
// LOG | |||
System.out.println("transQ: "+qlog.s.plainText); | |||
qlog.NRlog = query.preLog; | |||
qlog.SQGlog = "Id: "+query.queryId+"\nQuery: "+query.NLQuestion+"\n"; | |||
qlog.SQGlog += qlog.NRlog; | |||
qlog.timeTable.put("step0", (int)NRtime); | |||
// step 1: question parsing (dependency tree, sentence type) | |||
t = System.currentTimeMillis(); | |||
QuestionParsing step1 = new QuestionParsing(); | |||
step1.process(qlog); | |||
qlog.timeTable.put("step1", (int)(System.currentTimeMillis()-t)); | |||
// step 2: build query graph (structure construction, relation extraction, top-k join) | |||
t = System.currentTimeMillis(); | |||
BuildQueryGraph step2 = new BuildQueryGraph(); | |||
step2.process(qlog); | |||
// step2.processEXP(qlog); | |||
qlog.timeTable.put("step2", (int)(System.currentTimeMillis()-t)); | |||
// step 3: some fix (such as "one-node" or "ask-one-triple") and aggregation | |||
t = System.currentTimeMillis(); | |||
AddtionalFix step3 = new AddtionalFix(); | |||
step3.process(qlog); | |||
// Collect SPARQLs. | |||
rankedSparqls.addAll(qlog.rankedSparqls); | |||
qlog.timeTable.put("step3", (int)(System.currentTimeMillis()-t)); | |||
} | |||
// deduplicate in SPARQL | |||
for(Sparql spq: rankedSparqls) | |||
spq.deduplicate(); | |||
// Sort (descending order). | |||
Collections.sort(rankedSparqls); | |||
qlog.rankedSparqls = rankedSparqls; | |||
System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size()); | |||
// Detect question focus. | |||
for (int i=0; i<qlog.rankedSparqls.size(); i++) | |||
{ | |||
// First detect by SPARQLs. | |||
Sparql spq = qlog.rankedSparqls.get(i); | |||
String questionFocus = QuestionParsing.detectQuestionFocus(spq); | |||
// If failed, use TARGET directly. | |||
if(questionFocus == null) | |||
questionFocus = "?"+qlog.target.originalForm; | |||
spq.questionFocus = questionFocus; | |||
} | |||
return qlog; | |||
} | |||
catch (Exception e) { | |||
e.printStackTrace(); | |||
return qlog; | |||
} | |||
} | |||
public String getStdSparqlWoPrefix(QueryLogger qlog, Sparql curSpq) | |||
{ | |||
if(qlog == null || curSpq == null) | |||
return null; | |||
String res = ""; | |||
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) | |||
res += "ask where"; | |||
else | |||
{ | |||
if(!curSpq.countTarget) | |||
res += ("select DISTINCT " + curSpq.questionFocus + " where"); | |||
else | |||
res += ("select COUNT(DISTINCT " + curSpq.questionFocus + ") where"); | |||
} | |||
res += "\n"; | |||
res += curSpq.toStringForGStore(); | |||
if(curSpq.moreThanStr != null) | |||
{ | |||
res += curSpq.moreThanStr+"\n"; | |||
} | |||
if(curSpq.mostStr != null) | |||
{ | |||
res += curSpq.mostStr+"\n"; | |||
} | |||
return res; | |||
} | |||
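// Illustrative output for a SELECT question (entity/predicate names assumed; | |||
// the exact triple syntax depends on toStringForGStore()): | |||
//   select DISTINCT ?wife where | |||
//   { <Abraham_Lincoln> <spouse> ?wife. } | |||
// General (yes/no) questions instead start with "ask where". | |||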
// Notice, this will change the original SPARQL. | |||
public Sparql getUntypedSparql (Sparql spq) | |||
{ | |||
if(spq == null) | |||
return null; | |||
spq.removeAllTypeInfo(); | |||
if (spq.tripleList.size() == 0) return null; | |||
return spq; | |||
} | |||
/** | |||
* Get answers from Virtuoso + DBpedia, this function require OLD version Virtuoso + Virtuoso Handler. | |||
* Virtuoso can solve "Aggregation" | |||
**/ | |||
// public Matches getAnswerFromVirtuoso (QueryLogger qlog, Sparql spq) | |||
// { | |||
// Matches ret = new Matches(); | |||
// try | |||
// { | |||
// Socket socket = new Socket(Globals.QueryEngineIP, 1112); | |||
// DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream())); | |||
// | |||
// //formatting SPARQL & evaluate | |||
// String formatedSpq = spq.toStringForVirtuoso(); | |||
// dos.writeUTF(formatedSpq); | |||
// dos.flush(); | |||
// System.out.println("STD SPARQL:\n"+formatedSpq+"\n"); | |||
// | |||
// ArrayList<String> rawLines = new ArrayList<String>(); | |||
// DataInputStream dis = new DataInputStream(new BufferedInputStream(socket.getInputStream())); | |||
// while (true) | |||
// { | |||
// String line = dis.readUTF(); | |||
// if (line.equals("[[finish]]")) break; | |||
// rawLines.add(line); | |||
// } | |||
// | |||
// // ASK queries were once translated to SELECT queries, whose answers needed translating back. | |||
// // There is no need to translate any more; use "ASK WHERE" directly! 2018-12-11 | |||
// if(qlog.s.sentenceType == SentenceType.GeneralQuestion) | |||
// { | |||
// ret.answersNum = 1; | |||
// ret.answers = new String[1][1]; | |||
// if(rawLines.size() == 0) | |||
// { | |||
// ret.answers[0][0] = "general:false"; | |||
// } | |||
// else | |||
// { | |||
// ret.answers[0][0] = "general:true"; | |||
// } | |||
// System.out.println("general question answer:" + ret.answers[0][0]); | |||
// dos.close(); | |||
// dis.close(); | |||
// socket.close(); | |||
// return ret; | |||
// } | |||
// | |||
// //select but no results | |||
// if (rawLines.size() == 0) | |||
// { | |||
// ret.answersNum = 0; | |||
// dos.close(); | |||
// dis.close(); | |||
// socket.close(); | |||
// return ret; | |||
// } | |||
// | |||
// int ansNum = rawLines.size(); | |||
// int varNum = variables.size(); | |||
// ArrayList<String> valist = new ArrayList<String>(variables); | |||
// ret.answers = new String[ansNum][varNum]; | |||
// | |||
// System.out.println("ansNum=" + ansNum); | |||
// System.out.println("varNum=" + varNum); | |||
// for (int i=0;i<rawLines.size();i++) | |||
// { | |||
// String[] ansLineContents = rawLines.get(i).split("\t"); | |||
// for (int j=0;j<varNum;j++) | |||
// { | |||
// ret.answers[i][j] = valist.get(j) + ":" + ansLineContents[j]; | |||
// } | |||
// } | |||
// | |||
// dos.close(); | |||
// dis.close(); | |||
// socket.close(); | |||
// } | |||
// catch (Exception e) { | |||
// e.printStackTrace(); | |||
// } | |||
// | |||
// return ret; | |||
// } | |||
public Matches getAnswerFromGStore2 (Sparql spq) | |||
{ | |||
// modified by Lin Yinnian using ghttp - 2018-9-28 | |||
GstoreConnector gc = new GstoreConnector("172.31.222.90", 9001); | |||
String answer = gc.query("root", "123456", "dbpedia16", spq.toStringForGStore2()); | |||
System.out.println(answer); | |||
String[] rawLines = answer.split("\n"); | |||
Matches ret = new Matches(); | |||
if (rawLines.length == 0 || rawLines[0].equals("[empty result]")) | |||
{ | |||
ret.answersNum = 0; | |||
return ret; | |||
} | |||
int ansNum = rawLines.length-1; | |||
String[] varLineContents = rawLines[0].split("\t"); | |||
int varNum = varLineContents.length; | |||
ret.answers = new String[ansNum][varNum]; | |||
System.out.println("ansNum=" + ansNum); | |||
System.out.println("varNum=" + varNum); | |||
System.out.println("rawLines.length=" + rawLines.length); | |||
for (int i=1;i<rawLines.length;i++) | |||
{ | |||
// if an answer row contains '\n', the split above yields extra lines, so stop early to avoid errors. | |||
if(i-1 >= ansNum) | |||
break; | |||
String[] ansLineContents = rawLines[i].split("\t"); | |||
for (int j=0;j<varNum;j++) | |||
{ | |||
ret.answers[i-1][j] = varLineContents[j] + ":" + ansLineContents[j]; | |||
} | |||
} | |||
return ret; | |||
} | |||
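// Answer layout assumed by the parsing above: gStore returns one header line of | |||
// tab-separated variable names followed by one tab-separated row per binding, | |||
//   "?uri\n<http://dbpedia.org/resource/...>" | |||
// so ret.answers[i][j] holds "?uri:<http://dbpedia.org/resource/...>" | |||
// (the URI shown is illustrative). | |||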
public static void main (String[] args) | |||
{ | |||
Globals.init(); | |||
GAnswer ga = new GAnswer(); | |||
int i = 1; | |||
// file input/output | |||
List<String> inputList = FileUtil.readFile("E:/Linyinnian/qald6_special.txt"); | |||
for(String input: inputList) | |||
{ | |||
ArrayList<String> outputs = new ArrayList<String>(); | |||
ArrayList<String> spqs = new ArrayList<String>(); | |||
spqs.add("id:"+String.valueOf(i)); | |||
i++; | |||
long parsing_st_time = System.currentTimeMillis(); | |||
QueryLogger qlog = ga.getSparqlList(input); | |||
if(qlog == null || qlog.rankedSparqls == null) | |||
continue; | |||
long parsing_ed_time = System.currentTimeMillis(); | |||
System.out.println("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); | |||
System.out.println("TripleCheck time: "+ qlog.timeTable.get("TripleCheck") + "ms"); | |||
System.out.println("SparqlCheck time: "+ qlog.timeTable.get("SparqlCheck") + "ms"); | |||
System.out.println("Ranked Sparqls: " + qlog.rankedSparqls.size()); | |||
outputs.add(qlog.SQGlog); | |||
outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms"); | |||
outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms"); | |||
outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); | |||
long excuting_st_time = System.currentTimeMillis(); | |||
Matches m = null; | |||
System.out.println("[RESULT]"); | |||
ArrayList<String> lastSpqList = new ArrayList<String>(); | |||
int idx; | |||
// Consider top-5 SPARQLs | |||
for(idx=1; idx<=Math.min(qlog.rankedSparqls.size(), 5); idx++) | |||
{ | |||
Sparql curSpq = qlog.rankedSparqls.get(idx-1); | |||
String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, curSpq); | |||
lastSpqList.add(stdSPQwoPrefix); | |||
System.out.println("[" + idx + "]" + "score=" + curSpq.score); | |||
System.out.println(stdSPQwoPrefix); | |||
// Print top-3 SPARQLs to file. | |||
if(idx <= MAX_SPQ_NUM) | |||
// spqs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix); | |||
outputs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix); | |||
// Execute by GStore (or Virtuoso) when answers have not been found yet | |||
if(m == null || m.answers == null) | |||
{ | |||
if (curSpq.tripleList.size()>0 && curSpq.questionFocus!=null) | |||
{ | |||
// if(ga.isBGP(qlog, curSpq)) | |||
m = ga.getAnswerFromGStore2(curSpq); | |||
// else | |||
// m = ga.getAnswerFromVirtuoso(qlog, curSpq); | |||
} | |||
if (m != null && m.answers != null) | |||
{ | |||
// Found results using current SPQ, then we can break and print result. | |||
qlog.sparql = curSpq; | |||
qlog.match = m; | |||
qlog.reviseAnswers(); | |||
System.out.println("Query Executing time: "+ (int)(System.currentTimeMillis() - excuting_st_time)+ "ms"); | |||
} | |||
} | |||
} | |||
// Some TYPEs can be omitted (such as <type> <yago:Wife>) | |||
if(!qlog.rankedSparqls.isEmpty()) | |||
{ | |||
Sparql untypedSparql = ga.getUntypedSparql(qlog.rankedSparqls.get(0)); | |||
if(untypedSparql != null) | |||
{ | |||
String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, untypedSparql); | |||
if(!lastSpqList.contains(stdSPQwoPrefix)) | |||
// spqs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); | |||
outputs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); | |||
} | |||
} | |||
if(qlog.match != null) | |||
outputs.add(qlog.match.toString()); | |||
FileUtil.writeFile(outputs, "E:/Linyinnian/qald6_special_out.txt", true); | |||
} | |||
} | |||
} |
@@ -0,0 +1,118 @@ | |||
package qa; | |||
import java.io.BufferedReader; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import lcn.EntityFragmentFields; | |||
import fgmt.RelationFragment; | |||
import fgmt.TypeFragment; | |||
import paradict.ParaphraseDictionary; | |||
import qa.mapping.DBpediaLookup; | |||
import nlp.tool.NERecognizer; | |||
import nlp.tool.CoreNLP; | |||
import nlp.tool.MaltParser; | |||
import nlp.tool.StanfordParser; | |||
import nlp.tool.StopWordsList; | |||
public class Globals { | |||
// nlp tools | |||
public static CoreNLP coreNLP; | |||
public static StanfordParser stanfordParser; | |||
public static StopWordsList stopWordsList; | |||
public static MaltParser maltParser; | |||
public static NERecognizer nerRecognizer; | |||
// relation paraphrase dictionary | |||
public static ParaphraseDictionary pd; | |||
// entity linking system | |||
public static DBpediaLookup dblk; | |||
public static int MaxAnswerNum = 100; | |||
/* | |||
 * evaluationMethod: | |||
 * 1. baseline (SQG): allows neither CIRCLES nor WRONG edges; the structure may change with the chosen TARGET. | |||
 * 2. super SQG: allows CIRCLES and WRONG edges; the structure is decided by the DS tree and can be changed in the query evaluation (top-k match) stage. | |||
 */ | |||
public static int evaluationMethod = 2; | |||
public static boolean isRunAsWebServer = false; // Run Local: false; Run Server: true | |||
public static String runningBenchmark = "QALD"; // WQ:WebQuestions; WQSP:WebQuestionsSP; CQ:ComplexQuestions | |||
// different benchmarks use different methods and Freebase versions (in Virtuoso.java) | |||
public static boolean usingOperationCondition = false; // only for EXP: try state-transition operations only when the conditions are satisfied. | |||
public static String localPath = "/media/wip/husen/NBgAnswer/"; | |||
public static String QueryEngineIP = "127.0.0.1"; // Notice, PORT number is in the evaluation function. | |||
public static void init () | |||
{ | |||
System.out.println("====== gAnswer2.0 over DBpedia ======"); | |||
if(isRunAsWebServer == false) | |||
{ | |||
localPath = "D:/husen/gAnswer/"; | |||
QueryEngineIP = "172.31.222.72"; | |||
} | |||
long t1, t2, t3, t4, t5, t6, t7, t8, t9; | |||
t1 = System.currentTimeMillis(); | |||
coreNLP = new CoreNLP(); | |||
t2 = System.currentTimeMillis(); | |||
stanfordParser = new StanfordParser(); | |||
t3 = System.currentTimeMillis(); | |||
maltParser = new MaltParser(); | |||
t4 = System.currentTimeMillis(); | |||
nerRecognizer = new NERecognizer(); | |||
t5 = System.currentTimeMillis(); | |||
stopWordsList = new StopWordsList(); | |||
t6 = System.currentTimeMillis(); | |||
pd = new ParaphraseDictionary(); | |||
t7 = System.currentTimeMillis(); | |||
try | |||
{ | |||
EntityFragmentFields.load(); | |||
RelationFragment.load(); | |||
TypeFragment.load(); | |||
} | |||
catch (Exception e1) { | |||
System.out.println("EntityIDs and RelationFragment and TypeFragment loading error!"); | |||
e1.printStackTrace(); | |||
} | |||
t8 = System.currentTimeMillis(); | |||
dblk = new DBpediaLookup(); | |||
t9 = System.currentTimeMillis(); | |||
System.out.println("======Initialization======"); | |||
System.out.println("CoreNLP(Lemma): " + (t2-t1) + "ms."); | |||
System.out.println("StanfordParser: " + (t3-t2) + "ms."); | |||
System.out.println("MaltParser: " + (t4-t3) + "ms."); | |||
System.out.println("NERecognizer: " + (t5-t4) + "ms."); | |||
System.out.println("StopWordsList: " + (t6-t5) + "ms."); | |||
System.out.println("ParaphraseDict & posTagPattern: " + (t7-t6) + "ms."); | |||
System.out.println("GraphFragments: " + (t8-t7) + "ms."); | |||
System.out.println("DBpediaLookup: " + (t9-t8) + "ms."); | |||
System.out.println("* Total *: " + (t9-t1) + "ms."); | |||
System.out.println("=========================="); | |||
} | |||
/** | |||
* Use as system("pause") in C | |||
*/ | |||
public static void systemPause () { | |||
System.out.println("System pause ..."); | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
try { | |||
br.readLine(); | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,9 @@ | |||
package qa; | |||
public class Matches { | |||
public String[][] answers = null; | |||
public int answersNum = 0; | |||
public long time = 0; | |||
public static final int pageNum = 3000; | |||
} |
@@ -0,0 +1,128 @@ | |||
package qa; | |||
import java.util.ArrayList; | |||
import nlp.ds.Sentence; | |||
import qa.extract.EntityRecognition; | |||
import rdf.MergedWord; | |||
/** | |||
* 1. preprocessing of question | |||
* 2. Node Recognition | |||
* @author husen | |||
*/ | |||
public class Query | |||
{ | |||
public String NLQuestion = null; | |||
public String TransferedQuestion = null; | |||
public ArrayList<String> MergedQuestionList = null; | |||
public ArrayList<Sentence> sList = null; | |||
public String queryId = null; | |||
public String preLog = ""; | |||
public ArrayList<MergedWord> mWordList = null; | |||
public Query(){} | |||
public Query(String _question) | |||
{ | |||
NLQuestion = _question; | |||
NLQuestion = removeQueryId(NLQuestion); | |||
TransferedQuestion = getTransferedQuestion(NLQuestion); | |||
// step1. NODE Recognition | |||
MergedQuestionList = getMergedQuestionList(TransferedQuestion); | |||
// build Sentence | |||
sList = new ArrayList<Sentence>(); | |||
for(String mergedQuestion: MergedQuestionList) | |||
{ | |||
Sentence sentence = new Sentence(this, mergedQuestion); | |||
sList.add(sentence); | |||
} | |||
} | |||
public boolean isDigit(char ch) | |||
{ | |||
if(ch>='0' && ch<='9') | |||
return true; | |||
return false; | |||
} | |||
public boolean isUpperWord(char ch) | |||
{ | |||
if(ch>='A' && ch<='Z') | |||
return true; | |||
return false; | |||
} | |||
/** | |||
 * Map some words to equivalent words: | |||
 * 1. the Stanford parser often parses certain words incorrectly; | |||
 * 2. unify synonyms, e.g., movie -> film. | |||
 * @param question | |||
 * @return transferred question | |||
 */ | |||
public String getTransferedQuestion(String question) | |||
{ | |||
//rule1: discard ".", because "." and "_" will be disconnected by parser. Discard word tail's "'", which may pollutes NER | |||
question = question.replace("' ", " "); | |||
String [] words = question.split(" "); | |||
String ret = ""; | |||
for(String word: words) | |||
{ | |||
String retWord = word; | |||
//TODO: now just check NUM in head/tail | |||
if(word.length()>=2 && !isDigit(word.charAt(0)) && !isDigit(word.charAt(word.length()-1))) | |||
{ | |||
retWord = retWord.replace(".", ""); | |||
} | |||
ret += retWord + " "; | |||
} | |||
if(ret.length()>1) | |||
ret = ret.substring(0,ret.length()-1); | |||
ret = ret.replace("-", " "); | |||
ret = ret.replace("in america", "in United States"); | |||
//rule2: as well as -> and | |||
ret = ret.replace("as well as", "and"); | |||
//rule3: movie -> film | |||
ret = ret.replace(" movie", " film"); | |||
ret = ret.replace(" movies", " films"); | |||
return ret; | |||
} | |||
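// Illustrative example (question text assumed): | |||
//   "Which movies starring Tom Cruise came out after 2000?" | |||
// becomes | |||
//   "Which films starring Tom Cruise came out after 2000?" | |||
// via the movie->film rule; "as well as" would likewise be rewritten to "and". | |||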
/** | |||
* Recognize entity & type & literal in KB and replace " " in Phrases with "_" | |||
* @param question | |||
* @return merged question list | |||
*/ | |||
public ArrayList<String> getMergedQuestionList(String question) | |||
{ | |||
ArrayList<String> mergedQuestionList = null; | |||
//entity & type recognize | |||
EntityRecognition er = new EntityRecognition(); | |||
mergedQuestionList = er.process(question); | |||
preLog = er.preLog; | |||
mWordList = er.mWordList; | |||
return mergedQuestionList; | |||
} | |||
public String removeQueryId(String question) | |||
{ | |||
String ret = question; | |||
int st = question.indexOf("\t"); | |||
if(st!=-1 && question.length()>1 && question.charAt(0)>='0' && question.charAt(0)<='9') | |||
{ | |||
queryId = question.substring(0,st); | |||
ret = question.substring(st+1); | |||
System.out.println("Extract QueryId :"+queryId); | |||
} | |||
return ret; | |||
} | |||
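// Illustrative example: the input "42\tWho created Wikipedia?" sets | |||
// queryId = "42" and returns "Who created Wikipedia?"; questions without a | |||
// leading numeric id followed by a tab pass through unchanged. | |||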
} |
@@ -0,0 +1,153 @@ | |||
package qa.extract; | |||
import java.util.ArrayList; | |||
import java.util.HashSet; | |||
import qa.Globals; | |||
import log.QueryLogger; | |||
import nlp.ds.DependencyTree; | |||
import nlp.ds.DependencyTreeNode; | |||
import nlp.ds.Word; | |||
import rdf.SimpleRelation; | |||
public class CorefResolution { | |||
/** | |||
 * 1. A very simple coreference resolution. | |||
 * 2. Coreference resolution should be done after relation extraction and before item mapping. | |||
 */ | |||
public void process(ArrayList<SimpleRelation> simpleRelations, QueryLogger qlog) { | |||
if (qlog.s.words.length <= 4) return; // if the sentence is too short, skip the coref step. | |||
System.out.println("=====Co-reference resolution======="); | |||
ArrayList<SimpleRelation> deleteList = new ArrayList<SimpleRelation>(); | |||
for(SimpleRelation sr : simpleRelations) { | |||
Word w1=null, w2=null; | |||
if (sr.extractingMethod == 'S') { | |||
w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog); | |||
w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog); | |||
} | |||
else if (sr.extractingMethod == 'M') { | |||
w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog); | |||
w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog); | |||
} | |||
else { | |||
continue; | |||
} | |||
if (w1 != null) { | |||
sr.arg1Word_beforeCRR = sr.arg1Word; | |||
sr.arg1Word = w1; | |||
} | |||
if (w2 != null) { | |||
sr.arg2Word_beforeCRR = sr.arg2Word; | |||
sr.arg2Word = w2; | |||
} | |||
if (sr.arg1Word == sr.arg2Word) | |||
deleteList.add(sr); | |||
} | |||
simpleRelations.removeAll(deleteList); | |||
printCRR(qlog); | |||
System.out.println("==================================="); | |||
} | |||
// return the reference word of w | |||
public Word getRefWord (Word w, DependencyTree dt, QueryLogger qlog) { | |||
w = w.getNnHead(); | |||
if (w.crr != null) { | |||
return w.crr; | |||
} | |||
/* | |||
* method: (suitable for stanford parser (old version)) | |||
* (1) WDT --det--> [] eg: Which city is located in China? | |||
* (2) WDT -------> V/J --rcmod--> [] eg: Who is married to someone that was born in Rome? | |||
* "when is the sth" is conflict with this rule, so discarded. (3) W -------> be <------- [] eg: Who is the author of WikiLeaks? | |||
* (4) WDT -------> V --ccomp--> [] eg: The actor that married the child of a politician. | |||
* (5) DT(that, which) --dep--> V eg:The actors that married an athlete. // DS parser error. | |||
* (6) W(position=1) ------> NN eg:What are the language used in China? // DS parser error, should eliminate "WRB":When was Carlo Giuliani shot? | |||
* (7) where <--advmod-- V <--advcl-- V --prep/pobj--> [] eg: Who graduate from the school where Keqiang Li graduates? | |||
*/ | |||
DependencyTreeNode dtn = dt.getNodeByIndex(w.position); | |||
// no need for root | |||
if (dtn.father == null) return null; | |||
try { | |||
if(dtn.word.posTag.equals("WDT") && dtn.dep_father2child.equals("det")) { // (1) | |||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); | |||
w.crr = dtn.father.word.getNnHead(); | |||
} | |||
else if(dtn.word.posTag.startsWith("W") && !dtn.word.posTag.equals("WRB") && dtn.word.position == 1 && dtn.father.word.posTag.equals("NN")) { // (6) | |||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); | |||
w.crr = dtn.father.word.getNnHead(); | |||
} | |||
else if(dtn.word.posTag.equals("DT") | |||
&& dtn.dep_father2child.equals("dep") | |||
&& (dtn.word.baseForm.equals("that")||dtn.word.baseForm.equals("which"))) { // (5) | |||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead()); | |||
w.crr = dtn.father.word.getNnHead(); | |||
} | |||
// else if(dtn.word.posTag.startsWith("W") | |||
// && dtn.father.word.baseForm.equals("be")) { // (3) //&& dtn.dep_father2child.equals("attr") | |||
// DependencyTreeNode target = dtn.father.containDependencyWithChildren("nsubj"); | |||
// if (target != null) { | |||
// if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead()); | |||
// w.crr = target.word.getNnHead(); | |||
// } | |||
// } | |||
else if(dtn.word.posTag.equals("WDT") | |||
&& (dtn.father.word.posTag.startsWith("V") || dtn.father.word.posTag.startsWith("J")) | |||
&& dtn.father.dep_father2child.equals("rcmod")) { // (2) | |||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead()); | |||
w.crr = dtn.father.father.word.getNnHead(); | |||
} | |||
else if(dtn.word.posTag.equals("WDT") | |||
&& dtn.father.word.posTag.startsWith("V") | |||
&& dtn.father.dep_father2child.equals("ccomp")) { // (4) | |||
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead()); | |||
w.crr = dtn.father.father.word.getNnHead(); | |||
} | |||
else if (dtn.word.baseForm.equals("where") | |||
&& dtn.dep_father2child.equals("advmod") | |||
&& dtn.father.dep_father2child.equals("advcl")) { // (7) | |||
DependencyTreeNode target = dtn.father.father.containDependencyWithChildren("prep"); | |||
if (target != null) { | |||
target = target.containDependencyWithChildren("pobj"); | |||
} | |||
else { | |||
for (DependencyTreeNode n : dtn.father.father.childrenList) { | |||
if (Globals.pd.relns_object.contains(n.dep_father2child)) { | |||
target = n; | |||
} | |||
} | |||
} | |||
if (target != null) { | |||
if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead()); | |||
w.crr = target.word.getNnHead(); | |||
} | |||
} | |||
} catch (Exception e) {} | |||
return w.crr; | |||
} | |||
public void printCRR (QueryLogger qlog) { | |||
HashSet<Word> printed = new HashSet<Word>(); | |||
for (Word w : qlog.s.words) { | |||
w = w.getNnHead(); | |||
if (printed.contains(w)) | |||
continue; | |||
if (w.crr != null) | |||
System.out.println("\""+w.getFullEntityName() + "\" is resoluted to \"" + w.crr.getFullEntityName() + "\""); | |||
printed.add(w); | |||
} | |||
} | |||
} |
@@ -0,0 +1,918 @@ | |||
package qa.extract; | |||
import java.io.BufferedReader; | |||
//import java.io.File; | |||
//import java.io.FileInputStream; | |||
//import java.io.FileNotFoundException; | |||
//import java.io.FileOutputStream; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
//import java.io.OutputStreamWriter; | |||
//import java.io.UnsupportedEncodingException; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.Comparator; | |||
import java.util.HashMap; | |||
import java.util.List; | |||
import lcn.EntityFragmentFields; | |||
import fgmt.EntityFragment; | |||
import nlp.ds.Word; | |||
import qa.Globals; | |||
import rdf.EntityMapping; | |||
import rdf.NodeSelectedWithScore; | |||
import rdf.TypeMapping; | |||
import rdf.MergedWord; | |||
import utils.FileUtil; | |||
import addition.*; | |||
/** | |||
* Core class of Node Recognition | |||
* @author husen | |||
*/ | |||
public class EntityRecognition { | |||
public String preLog = ""; | |||
public String stopEntFilePath = Globals.localPath + "data/DBpedia2016/parapharse/stopEntDict.txt"; | |||
double EntAcceptedScore = 26; | |||
double TypeAcceptedScore = 0.5; | |||
double AcceptedDiffScore = 1; | |||
public HashMap<String, String> m2e = null; | |||
public ArrayList<MergedWord> mWordList = null; | |||
public ArrayList<String> stopEntList = null; | |||
public ArrayList<String> badTagListForEntAndType = null; | |||
ArrayList<ArrayList<Integer>> selectedList = null; | |||
TypeRecognition tr = null; | |||
AddtionalFix af = null; | |||
public EntityRecognition() | |||
{ | |||
// LOG | |||
preLog = ""; | |||
loadStopEntityDict(); | |||
		// POS tags that disqualify a span from being an entity/type candidate
badTagListForEntAndType = new ArrayList<String>(); | |||
badTagListForEntAndType.add("RBS"); | |||
badTagListForEntAndType.add("JJS"); | |||
badTagListForEntAndType.add("W"); | |||
badTagListForEntAndType.add("."); | |||
badTagListForEntAndType.add("VBD"); | |||
badTagListForEntAndType.add("VBN"); | |||
badTagListForEntAndType.add("VBZ"); | |||
badTagListForEntAndType.add("VBP"); | |||
badTagListForEntAndType.add("POS"); | |||
		// !Hand-written entity linking (keys in lower case)
m2e = new HashMap<String, String>(); | |||
m2e.put("bipolar_syndrome", "Bipolar_disorder"); | |||
m2e.put("battle_in_1836_in_san_antonio", "Battle_of_San_Jacinto"); | |||
m2e.put("federal_minister_of_finance_in_germany", "Federal_Ministry_of_Finance_(Germany)"); | |||
// Additional fix for CATEGORY (in DBpedia) | |||
af = new AddtionalFix(); | |||
tr = new TypeRecognition(); | |||
System.out.println("EntityRecognizer Initial : ok!"); | |||
} | |||
public void loadStopEntityDict() | |||
{ | |||
stopEntList = new ArrayList<String>(); | |||
try | |||
{ | |||
List<String> inputs = FileUtil.readFile(stopEntFilePath); | |||
for(String line: inputs) | |||
{ | |||
if(line.startsWith("#")) | |||
continue; | |||
stopEntList.add(line); | |||
} | |||
} | |||
catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
public ArrayList<String> process(String question) | |||
{ | |||
ArrayList<String> fixedQuestionList = new ArrayList<String>(); | |||
ArrayList<Integer> literalList = new ArrayList<Integer>(); | |||
HashMap<Integer, Double> entityScores = new HashMap<Integer, Double>(); | |||
HashMap<Integer, Integer> entityMappings = new HashMap<Integer, Integer>(); | |||
HashMap<Integer, Double> typeScores = new HashMap<Integer, Double>(); | |||
HashMap<Integer, String> typeMappings = new HashMap<Integer, String>(); | |||
HashMap<Integer, Double> mappingScores = new HashMap<Integer, Double>(); | |||
ArrayList<Integer> mustSelectedList = new ArrayList<Integer>(); | |||
System.out.println("--------- entity/type recognition start ---------"); | |||
Word[] words = Globals.coreNLP.getTaggedWords(question); | |||
mWordList = new ArrayList<MergedWord>(); | |||
long t1 = System.currentTimeMillis(); | |||
int checkEntCnt = 0, checkTypeCnt = 0, hitEntCnt = 0, hitTypeCnt = 0, allCnt = 0; | |||
boolean needRemoveCommas = false; | |||
// Check entity & type | |||
// Notice, ascending order by length | |||
StringBuilder tmpOW = new StringBuilder(); | |||
StringBuilder tmpBW = new StringBuilder(); | |||
for(int len=1; len<=words.length; len++) | |||
{ | |||
for(int st=0,ed=st+len; ed<=words.length; st++,ed++) | |||
{ | |||
String originalWord = "", baseWord = "", allUpperWord = ""; | |||
//String[] posTagArr = new String[len]; | |||
for(int j=st; j<ed; j++) | |||
{ | |||
//posTagArr[j-st] = words[j].posTag; | |||
//originalWord += words[j].originalForm; | |||
//baseWord += words[j].baseForm; | |||
tmpOW.append(words[j].originalForm); | |||
tmpBW.append(words[j].baseForm); | |||
String tmp = words[j].originalForm; | |||
if(tmp.length()>0 && tmp.charAt(0) >='a' && tmp.charAt(0)<='z') | |||
{ | |||
String pre = tmp.substring(0,1).toUpperCase(); | |||
tmp = pre + tmp.substring(1); | |||
} | |||
allUpperWord += tmp; | |||
if(j < ed-1) | |||
{ | |||
//originalWord += "_"; | |||
//baseWord += "_"; | |||
tmpOW.append("_"); | |||
tmpBW.append("_"); | |||
} | |||
} | |||
originalWord = tmpOW.toString(); | |||
baseWord=tmpBW.toString(); | |||
tmpOW.setLength(0); | |||
tmpBW.setLength(0); | |||
allCnt++; | |||
/* | |||
* Filters to save time and drop some bad cases. | |||
*/ | |||
boolean entOmit = false, typeOmit = false; | |||
int prep_cnt=0; | |||
// Upper words can pass filter. eg: "Melbourne , Florida" | |||
int UpperWordCnt = 0; | |||
for(int i=st;i<ed;i++) | |||
if((words[i].originalForm.charAt(0)>='A' && words[i].originalForm.charAt(0)<='Z') | |||
|| ((words[i].posTag.equals(",") || words[i].originalForm.equals("'")) && i>st && i<ed-1)) | |||
UpperWordCnt++; | |||
// Filters | |||
if(UpperWordCnt<len || st==0) | |||
{ | |||
if(st==0) | |||
{ | |||
if(!words[st].posTag.startsWith("DT") && !words[st].posTag.startsWith("N")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
else if(st>0) | |||
{ | |||
Word formerWord = words[st-1]; | |||
//as princess | |||
if(formerWord.baseForm.equals("as")) | |||
entOmit = true; | |||
//how many dogs? | |||
if(formerWord.baseForm.equals("many")) | |||
entOmit = true; | |||
//obama's daughter ; your height | len=1 to avoid: Asimov's Foundation series | |||
if(len == 1 && (formerWord.posTag.startsWith("POS") || formerWord.posTag.startsWith("PRP"))) | |||
entOmit = true; | |||
//the father of you | |||
if(ed<words.length) | |||
{ | |||
Word nextWord = words[ed]; | |||
if(formerWord.posTag.equals("DT") && nextWord.posTag.equals("IN")) | |||
entOmit = true; | |||
} | |||
//the area code of ; the official language of | |||
boolean flag1=false, flag2=false; | |||
for(int i=0;i<=st;i++) | |||
if(words[i].posTag.equals("DT")) | |||
flag1 = true; | |||
for(int i=ed-1;i<words.length;i++) | |||
if(words[i].posTag.equals("IN")) | |||
flag2 = true; | |||
if(flag1 && flag2) | |||
entOmit = true; | |||
} | |||
if(ed < words.length) | |||
{ | |||
Word nextWord = words[ed]; | |||
					// (lowercase span) followed by an (Uppercase word): likely part of a larger name, skip
if(nextWord.originalForm.charAt(0)>='A' && nextWord.originalForm.charAt(0)<='Z') | |||
entOmit = true; | |||
} | |||
for(int i=st;i<ed;i++) | |||
{ | |||
if(words[i].posTag.startsWith("I")) | |||
prep_cnt++; | |||
for(String badTag: badTagListForEntAndType) | |||
{ | |||
if(words[i].posTag.startsWith(badTag)) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
break; | |||
} | |||
} | |||
if(words[i].posTag.startsWith("P") && (i!=ed-1 || len==1)){ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
// First word | |||
if(i==st) | |||
{ | |||
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("EX") || words[i].posTag.startsWith("TO")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].posTag.startsWith("D") && len==2){ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].baseForm.startsWith("list") || words[i].baseForm.startsWith("many")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].baseForm.equals("and")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
// Last word. | |||
if(i==ed-1) | |||
{ | |||
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("D") || words[i].posTag.startsWith("TO")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].baseForm.equals("and")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
// Single word. | |||
if(len==1) | |||
{ | |||
//TODO: Omit general noun. eg: father, book ... | |||
if(!words[i].posTag.startsWith("N")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
} | |||
				// Too many prepositions.
if(prep_cnt >= 3) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
/* | |||
* Filter done. | |||
*/ | |||
// Search category | highest priority | |||
String category = null; | |||
if(af.pattern2category.containsKey(baseWord)) | |||
{ | |||
typeOmit = true; | |||
entOmit = true; | |||
category = af.pattern2category.get(baseWord); | |||
} | |||
// Search type | |||
int hitMethod = 0; // 1=dbo(baseWord), 2=dbo(originalWord), 3=yago|extend() | |||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||
if(!typeOmit) | |||
{ | |||
System.out.println("Type Check: "+originalWord); | |||
//checkTypeCnt++; | |||
//search standard type | |||
tmList = tr.getTypeIDsAndNamesByStr(baseWord); | |||
if(tmList == null || tmList.size() == 0) | |||
{ | |||
tmList = tr.getTypeIDsAndNamesByStr(originalWord); | |||
if(tmList != null && tmList.size()>0) | |||
hitMethod = 2; | |||
} | |||
else | |||
hitMethod = 1; | |||
				//Search extended type (YAGO type)
if(tmList == null || tmList.size() == 0) | |||
{ | |||
tmList = tr.getExtendTypeByStr(allUpperWord); | |||
if(tmList != null && tmList.size() > 0) | |||
{ | |||
preLog += "++++ Extend Type detect: "+baseWord+": "+" prefferd relaiton:"+tmList.get(0).prefferdRelation+"\n"; | |||
hitMethod = 3; | |||
} | |||
} | |||
} | |||
// Search entity | |||
ArrayList<EntityMapping> emList = new ArrayList<EntityMapping>(); | |||
if(!entOmit && !stopEntList.contains(baseWord)) | |||
{ | |||
System.out.println("Ent Check: "+originalWord); | |||
checkEntCnt++; | |||
				// Notice, the second parameter controls whether to use DBpedia Lookup.
emList = getEntityIDsAndNamesByStr(originalWord, (UpperWordCnt>=len-1 || len==1),len); | |||
if(emList == null || emList.size() == 0) | |||
{ | |||
emList = getEntityIDsAndNamesByStr(baseWord, (UpperWordCnt>=len-1 || len==1), len); | |||
} | |||
if(emList!=null && emList.size()>10) | |||
{ | |||
ArrayList<EntityMapping> tmpList = new ArrayList<EntityMapping>(); | |||
for(int i=0;i<10;i++) | |||
{ | |||
tmpList.add(emList.get(i)); | |||
} | |||
emList = tmpList; | |||
} | |||
} | |||
MergedWord mWord = new MergedWord(st,ed,originalWord); | |||
// Add category | |||
if(category != null) | |||
{ | |||
mWord.mayCategory = true; | |||
mWord.category = category; | |||
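				// A span [st,ed) is packed into a single int: key = st*(words.length+1) + ed,
				// so st = key/(words.length+1) and ed = key%(words.length+1).
				// e.g., with words.length = 5, the span [1,3) encodes as 1*6+3 = 9.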
int key = st*(words.length+1) + ed; | |||
mustSelectedList.add(key); | |||
} | |||
// Add literal | |||
if(len==1 && checkLiteralWord(words[st])) | |||
{ | |||
mWord.mayLiteral = true; | |||
int key = st*(words.length+1) + ed; | |||
literalList.add(key); | |||
} | |||
// Add type mappings | |||
if(tmList!=null && tmList.size()>0) | |||
{ | |||
// Drop by score threshold | |||
if(tmList.get(0).score < TypeAcceptedScore) | |||
typeOmit = true; | |||
// Only allow EXACT MATCH when method=1|2 | |||
// TODO: consider approximate match and taxonomy. eg, actor->person | |||
String likelyType = tmList.get(0).typeName.toLowerCase(); | |||
String candidateBase = baseWord.replace("_", ""), candidateOriginal = originalWord.replace("_", "").toLowerCase(); | |||
if(!candidateBase.equals(likelyType) && hitMethod == 1) | |||
typeOmit = true; | |||
if(!candidateOriginal.equals(likelyType) && hitMethod == 2) | |||
typeOmit = true; | |||
if(!typeOmit) | |||
{ | |||
mWord.mayType = true; | |||
mWord.tmList = tmList; | |||
int key = st*(words.length+1) + ed; | |||
typeMappings.put(key, tmList.get(0).typeName); | |||
typeScores.put(key, tmList.get(0).score); | |||
} | |||
} | |||
// Add entity mappings | |||
if(emList!=null && emList.size()>0) | |||
{ | |||
// Drop by score threshold | |||
if(emList.get(0).score < EntAcceptedScore) | |||
entOmit = true; | |||
// Drop: the [German Shepherd] dog | |||
else if(len > 2) | |||
{ | |||
for(int key: entityMappings.keySet()) | |||
{ | |||
//int te=key%(words.length+1); | |||
int ts=key/(words.length+1); | |||
if(ts == st+1 && ts <= ed) | |||
{ | |||
//DT in lowercase (allow uppercase, such as: [The Pillars of the Earth]) | |||
if(words[st].posTag.startsWith("DT") && !(words[st].originalForm.charAt(0)>='A'&&words[st].originalForm.charAt(0)<='Z')) | |||
{ | |||
entOmit = true; | |||
} | |||
} | |||
} | |||
} | |||
// Record info in merged word | |||
if(!entOmit) | |||
{ | |||
mWord.mayEnt = true; | |||
mWord.emList = emList; | |||
					// used to remove duplicates and to drive selection
int key = st*(words.length+1) + ed; | |||
entityMappings.put(key, emList.get(0).entityID); | |||
// fix entity score | conflict resolution | |||
double score = emList.get(0).score; | |||
String likelyEnt = emList.get(0).entityName.toLowerCase().replace(" ", "_"); | |||
String lowerOriginalWord = originalWord.toLowerCase(); | |||
// !Award: whole match | |||
if(likelyEnt.equals(lowerOriginalWord)) | |||
score *= len; | |||
// !Award: COVER (eg, Robert Kennedy: [Robert] [Kennedy] [Robert Kennedy]) | |||
					//e.g., for Social_Democratic_Party, any combination of the three words is an entity, which yields too many plans; compared with "which one to pick on conflict", "merge or not merge" matters more (and in practice most errors are merge/no-merge errors), so smaller covered entities are discarded directly here
					//e.g., for Abraham_Lincoln, the "no-merge" plan would recognize it as two nodes whose final score exceeds that of the correct answer; such words are therefore marked as must-select
if(len>1) | |||
{ | |||
boolean[] flag = new boolean[words.length+1]; | |||
ArrayList<Integer> needlessEntList = new ArrayList<Integer>(); | |||
double tmpScore=0; | |||
for(int preKey: entityMappings.keySet()) | |||
{ | |||
if(preKey == key) | |||
continue; | |||
int te=preKey%(words.length+1),ts=preKey/(words.length+1); | |||
for(int i=ts;i<te;i++) | |||
flag[i] = true; | |||
if(st<=ts && ed>= te) | |||
{ | |||
needlessEntList.add(preKey); | |||
tmpScore += entityScores.get(preKey); | |||
} | |||
} | |||
int hitCnt = 0; | |||
for(int i=st;i<ed;i++) | |||
if(flag[i]) | |||
hitCnt++; | |||
// WHOLE match || HIGH match & HIGH upper || WHOLE upper | |||
if(hitCnt == len || ((double)hitCnt/(double)len > 0.6 && (double)UpperWordCnt/(double)len > 0.6) || UpperWordCnt == len || len>=4) | |||
{ | |||
							//If the phrase contains a comma, require the words on both sides to appear in the mapped entity
							//e.g., Melbourne_,_Florida -> Melbourne, Florida is a must-select, while California_,_USA -> Malibu, California is not necessarily correct
boolean commaTotalRight = true; | |||
if(originalWord.contains(",")) | |||
{ | |||
String candidateCompactString = originalWord.replace(",","").replace("_", "").toLowerCase(); | |||
String likelyCompactEnt = likelyEnt.replace(",","").replace("_", ""); | |||
if(!candidateCompactString.equals(likelyCompactEnt)) | |||
commaTotalRight = false; | |||
else | |||
{ | |||
mWord.name = mWord.name.replace("_,_","_"); | |||
needRemoveCommas = true; | |||
} | |||
} | |||
if(commaTotalRight) | |||
{ | |||
mustSelectedList.add(key); | |||
if(tmpScore>score) | |||
score = tmpScore+1; | |||
for(int preKey: needlessEntList) | |||
{ | |||
entityMappings.remove(preKey); | |||
mustSelectedList.remove(Integer.valueOf(preKey)); | |||
} | |||
} | |||
} | |||
} | |||
					//NOTICE: the score in mWord is unchanged; we only update the score in entityScores.
entityScores.put(key,score); | |||
} | |||
} | |||
if(mWord.mayCategory || mWord.mayEnt || mWord.mayType || mWord.mayLiteral) | |||
mWordList.add(mWord); | |||
} | |||
} | |||
/* Print all candidates (use fixed score).*/ | |||
System.out.println("------- Result ------"); | |||
for(MergedWord mWord: mWordList) | |||
{ | |||
int key = mWord.st * (words.length+1) + mWord.ed; | |||
if(mWord.mayCategory) | |||
{ | |||
System.out.println("Detect category mapping: "+mWord.name+": "+ mWord.category +" score: 100.0"); | |||
preLog += "++++ Category detect: "+mWord.name+": "+mWord.category+" score: 100.0\n"; | |||
} | |||
if(mWord.mayEnt) | |||
{ | |||
System.out.println("Detect entity mapping: "+mWord.name+": ["); | |||
for(EntityMapping em: mWord.emList) | |||
System.out.print(em.entityName + ", "); | |||
System.out.println("]"); | |||
preLog += "++++ Entity detect: "+mWord.name+": "+mWord.emList.get(0).entityName+" score:"+entityScores.get(key)+"\n"; | |||
hitEntCnt++; | |||
} | |||
if(mWord.mayType) | |||
{ | |||
System.out.println("Detect type mapping: "+mWord.name+": ["); | |||
for(TypeMapping tm: mWord.tmList) | |||
System.out.print(tm.typeName + ", "); | |||
System.out.println("]"); | |||
preLog += "++++ Type detect: "+mWord.name+": "+mWord.tmList.get(0).typeName +" score:"+typeScores.get(key)+"\n"; | |||
hitTypeCnt++; | |||
} | |||
if(mWord.mayLiteral) | |||
{ | |||
System.out.println("Detect literal: "+mWord.name); | |||
preLog += "++++ Literal detect: "+mWord.name+"\n"; | |||
} | |||
} | |||
		/*
		 * Sort by score and remove duplicates.
		 * eg, <"video_game" "ent:Video game" "50.0"> <"a_video_game" "ent:Video game" "45.0">.
		 * Notice, all information is preserved in mWordList.
		 */
		// when one entity is mapped by several merged words in the query, keep the higher score.
ByValueComparator bvc = new ByValueComparator(entityScores,words.length+1); | |||
List<Integer> keys = new ArrayList<Integer>(entityMappings.keySet()); | |||
Collections.sort(keys, bvc); | |||
for(Integer key : keys) | |||
{ | |||
if(!mappingScores.containsKey(entityMappings.get(key))) | |||
mappingScores.put(entityMappings.get(key), entityScores.get(key)); | |||
else | |||
entityMappings.remove(key); | |||
} | |||
selectedList = new ArrayList<ArrayList<Integer>>(); | |||
ArrayList<Integer> selected = new ArrayList<Integer>(); | |||
// Some phrases must be selected. | |||
selected.addAll(mustSelectedList); | |||
for(Integer key: typeMappings.keySet()) | |||
{ | |||
			// !type(len>1) (Omit len=1 because, eg, [Brooklyn Bridge] is an entity.)
int ed = key%(words.length+1), st = key/(words.length+1); | |||
if(st+1 < ed) | |||
{ | |||
boolean beCovered = false; | |||
				//Entity covers type, eg: [prime_minister of Spain]
for(int preKey: entityMappings.keySet()) | |||
{ | |||
int te=preKey%(words.length+1),ts=preKey/(words.length+1); | |||
					//Entity should be longer than the type
if(ts <= st && te >= ed && ed-st < te-ts) | |||
{ | |||
beCovered = true; | |||
} | |||
} | |||
if(!beCovered) | |||
selected.add(key); | |||
} | |||
} | |||
// Conflict resolution | |||
ArrayList<Integer> noConflictSelected = new ArrayList<Integer>(); | |||
		//select the longer one on conflict
boolean[] flag = new boolean[words.length]; | |||
ByLenComparator blc = new ByLenComparator(words.length+1); | |||
Collections.sort(selected,blc); | |||
for(Integer key : selected) | |||
{ | |||
int ed = key%(words.length+1), st = (key-ed)/(words.length+1); | |||
boolean omit = false; | |||
for(int i=st;i<ed;i++) | |||
{ | |||
if(flag[i]) | |||
{ | |||
omit = true; | |||
break; | |||
} | |||
} | |||
if(omit) | |||
continue; | |||
for(int i=st;i<ed;i++) | |||
flag[i]=true; | |||
noConflictSelected.add(key); | |||
} | |||
// Scoring and ranking --> top-k decision | |||
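		// dfs enumerates every conflict-free (non-overlapping) subset of the entity keys,
		// starting from the must-selected spans in noConflictSelected; every enumerated
		// selection is collected into selectedList for scoring below.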
dfs(keys,0,noConflictSelected,words.length+1); | |||
ArrayList<NodeSelectedWithScore> nodeSelectedWithScoreList = new ArrayList<NodeSelectedWithScore>(); | |||
for(ArrayList<Integer> select: selectedList) | |||
{ | |||
double score = 0; | |||
for(Integer key: select) | |||
{ | |||
if(entityScores.containsKey(key)) | |||
score += entityScores.get(key); | |||
if(typeScores.containsKey(key)) | |||
score += typeScores.get(key); | |||
} | |||
NodeSelectedWithScore tmp = new NodeSelectedWithScore(select, score); | |||
nodeSelectedWithScoreList.add(tmp); | |||
} | |||
Collections.sort(nodeSelectedWithScoreList); | |||
// Replace | |||
int cnt = 0; | |||
		for(int k=0; k<nodeSelectedWithScoreList.size(); k++)
		{
selected = nodeSelectedWithScoreList.get(k).selected; | |||
Collections.sort(selected); | |||
int j = 0; | |||
String res = question; | |||
if(selected.size()>0) | |||
{ | |||
res = words[0].originalForm; | |||
int tmp = selected.get(j++), st = tmp/(words.length+1), ed = tmp%(words.length+1); | |||
for(int i=1;i<words.length;i++) | |||
{ | |||
if(i>st && i<ed) | |||
{ | |||
res = res+"_"+words[i].originalForm; | |||
} | |||
else | |||
{ | |||
res = res+" "+words[i].originalForm; | |||
} | |||
if(i >= ed && j<selected.size()) | |||
{ | |||
tmp = selected.get(j++); | |||
st = tmp/(words.length+1); | |||
ed = tmp%(words.length+1); | |||
} | |||
} | |||
} | |||
else | |||
{ | |||
res = words[0].originalForm; | |||
for(int i=1;i<words.length;i++) | |||
{ | |||
res = res+" "+words[i].originalForm; | |||
} | |||
} | |||
boolean ok = true; | |||
for(String str: fixedQuestionList) | |||
if(str.equals(res)) | |||
ok = false; | |||
if(!ok) | |||
continue; | |||
if(needRemoveCommas) | |||
res = res.replace("_,_","_"); | |||
System.out.println("Merged: "+res); | |||
preLog += "plan "+cnt+": "+res+"\n"; | |||
fixedQuestionList.add(res); | |||
cnt++; | |||
if(cnt >= 3) // top-3 | |||
break; | |||
} | |||
long t2 = System.currentTimeMillis(); | |||
// preLog += "Total hit/check/all ent num: "+hitEntCnt+" / "+checkEntCnt+" / "+allCnt+"\n"; | |||
// preLog += "Total hit/check/all type num: "+hitTypeCnt+" / "+checkTypeCnt+" / "+allCnt+"\n"; | |||
preLog += "Node Recognition time: "+ (t2-t1) + "ms\n"; | |||
System.out.println("Total check time: "+ (t2-t1) + "ms"); | |||
System.out.println("--------- pre entity/type recognition end ---------"); | |||
return fixedQuestionList; | |||
} | |||
public void dfs(List<Integer> keys,int dep,ArrayList<Integer> selected,int size) | |||
{ | |||
if(dep == keys.size()) | |||
{ | |||
ArrayList<Integer> tmpList = (ArrayList<Integer>) selected.clone(); | |||
selectedList.add(tmpList); | |||
} | |||
else | |||
{ | |||
			//case 1 (off): skip the dep-th key
			dfs(keys,dep+1,selected,size);
			//case 2 (on): take the dep-th key if it does not conflict with the current selection
boolean conflict = false; | |||
for(int preKey: selected) | |||
{ | |||
int curKey = keys.get(dep); | |||
int preEd = preKey%size, preSt = (preKey-preEd)/size; | |||
int curEd = curKey%size, curSt = (curKey-curEd)/size; | |||
if(!(preSt<preEd && preEd<=curSt && curSt<curEd) && !(curSt<curEd && curEd<=preSt && preSt<preEd)) | |||
conflict = true; | |||
} | |||
if(!conflict) | |||
{ | |||
selected.add(keys.get(dep)); | |||
dfs(keys,dep+1,selected,size); | |||
selected.remove(keys.get(dep)); | |||
} | |||
} | |||
} | |||
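	// e.g., for two overlapping keys A and B, dfs yields the selections {}, {A} and {B}
	// (each unioned with the initial selection), but never {A, B}.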
public ArrayList<EntityMapping> getEntityIDsAndNamesByStr(String entity, boolean useDblk, int len) | |||
{ | |||
String n = entity; | |||
ArrayList<EntityMapping> ret= new ArrayList<EntityMapping>(); | |||
		//1. Hand-written mappings (highest priority)
if(m2e.containsKey(entity)) | |||
{ | |||
String eName = m2e.get(entity); | |||
EntityMapping em = new EntityMapping(EntityFragmentFields.entityName2Id.get(eName), eName, 1000); | |||
ret.add(em); | |||
return ret; //handwriting is always correct | |||
} | |||
//2. Lucene index | |||
ret.addAll(EntityFragment.getEntityMappingList(n)); | |||
//3. DBpedia Lookup (some cases) | |||
if (useDblk) | |||
{ | |||
ret.addAll(Globals.dblk.getEntityMappings(n, null)); | |||
} | |||
Collections.sort(ret); | |||
if (ret.size() > 0) return ret; | |||
else return null; | |||
} | |||
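	// Heuristic: returns 1 (prefer DBpedia Lookup) for names that contain uppercase
	// letters or dots and fewer than 3 spaces, e.g., "U.S." or "IBM"; otherwise
	// returns 0 (prefer the local Lucene index).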
public int preferDBpediaLookupOrLucene(String entityName) | |||
{ | |||
int cntUpperCase = 0; | |||
int cntSpace = 0; | |||
int cntPoint = 0; | |||
int length = entityName.length(); | |||
for (int i=0; i<length; i++) | |||
{ | |||
char c = entityName.charAt(i); | |||
if (c==' ') | |||
cntSpace++; | |||
else if (c=='.') | |||
cntPoint++; | |||
else if (c>='A' && c<='Z') | |||
cntUpperCase++; | |||
} | |||
if ((cntUpperCase>0 || cntPoint>0) && cntSpace<3) | |||
return 1; | |||
if (cntUpperCase == length) | |||
return 1; | |||
return 0; | |||
} | |||
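	// Orders span keys by descending score; ties are broken by preferring the longer span.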
static class ByValueComparator implements Comparator<Integer> { | |||
HashMap<Integer, Double> base_map; | |||
int base_size; | |||
double eps = 1e-8; | |||
int dblcmp(double a,double b) | |||
{ | |||
if(a+eps < b) | |||
return -1; | |||
return b+eps<a ? 1:0; | |||
} | |||
public ByValueComparator(HashMap<Integer, Double> base_map, Integer size) { | |||
this.base_map = base_map; | |||
this.base_size = size; | |||
} | |||
public int compare(Integer arg0, Integer arg1) { | |||
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) { | |||
return 0; | |||
} | |||
if (dblcmp(base_map.get(arg0),base_map.get(arg1))<0) { | |||
return 1; | |||
} | |||
else if (dblcmp(base_map.get(arg0),base_map.get(arg1))==0) | |||
{ | |||
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; | |||
if (len0 < len1) { | |||
return 1; | |||
} else if (len0 == len1) { | |||
return 0; | |||
} else { | |||
return -1; | |||
} | |||
} | |||
else { | |||
return -1; | |||
} | |||
} | |||
} | |||
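	// Orders span keys by descending span length.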
static class ByLenComparator implements Comparator<Integer> { | |||
int base_size; | |||
public ByLenComparator(int size) { | |||
this.base_size = size; | |||
} | |||
public int compare(Integer arg0, Integer arg1) { | |||
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; | |||
if (len0 < len1) { | |||
return 1; | |||
} else if (len0 == len1) { | |||
return 0; | |||
} else { | |||
return -1; | |||
} | |||
} | |||
} | |||
	public boolean isDigit(char ch)
	{
		return ch >= '0' && ch <= '9';
	}
//TODO: other literal words. | |||
	public boolean checkLiteralWord(Word word)
	{
		// Currently only cardinal numbers (POS tag "CD") count as literals.
		return word.posTag.equals("CD");
	}
public static void main (String[] args) | |||
{ | |||
Globals.init(); | |||
EntityRecognition er = new EntityRecognition(); | |||
try | |||
{ | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
while (true) | |||
{ | |||
System.out.println("Please input the question: "); | |||
String question = br.readLine(); | |||
er.process(question); | |||
} | |||
// File inputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_in.txt"); | |||
// File outputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_out.txt"); | |||
// BufferedReader fr = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile),"utf-8")); | |||
// OutputStreamWriter fw = new OutputStreamWriter(new FileOutputStream(outputFile,true),"utf-8"); | |||
// | |||
// String input; | |||
// while((input=fr.readLine())!=null) | |||
// { | |||
// String[] strArray = input.split("\t"); | |||
// String id = ""; | |||
// String question = strArray[0]; | |||
// if(strArray.length>1) | |||
// { | |||
// question = strArray[1]; | |||
// id = strArray[0]; | |||
// } | |||
// //Notice "?" may leads lucene/dbpedia lookup error | |||
// if(question.length()>1 && question.charAt(question.length()-1)=='.' || question.charAt(question.length()-1)=='?') | |||
// question = question.substring(0,question.length()-1); | |||
// if(question.isEmpty()) | |||
// continue; | |||
// er.process(question); | |||
// fw.write("Id: "+id+"\nQuery: "+question+"\n"); | |||
// fw.write(er.preLog+"\n"); | |||
// } | |||
// | |||
// fr.close(); | |||
// fw.close(); | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,467 @@ | |||
package qa.extract; | |||
import java.io.BufferedReader; | |||
//import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.Comparator; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import java.util.List; | |||
import paradict.ParaphraseDictionary; | |||
import qa.Globals; | |||
import rdf.Sparql; | |||
import rdf.Triple; | |||
import rdf.ImplicitRelation; | |||
import lcn.EntityFragmentFields; | |||
import log.QueryLogger; | |||
import fgmt.EntityFragment; | |||
import fgmt.TypeFragment; | |||
import nlp.ds.Word; | |||
import nlp.tool.CoreNLP; | |||
public class ExtractImplicitRelation { | |||
static final int SamplingNumber = 100; // the maximum sampling number in calculation | |||
static final int k = 3; // select top-k when many suitable relations; select top-k entities for a word | |||
public HashMap<String, Integer> implicitEntRel = new HashMap<String, Integer>(); | |||
/* | |||
* Implicit Relations: | |||
* eg, Which is the film directed by Obama and starred by a Chinese ?x | |||
* 1. [What] is in a [chocolate_chip_cookie] ?var + ent | |||
* 2. What [country] is [Sitecore] from ?type + ent = [?var p ent + ?var<-type] | |||
* 3. Czech movies | Chinese actor ent + ?type | |||
* 4. President Obama type + ent | |||
* 5. Andy Liu's Hero(film) ent + ent | |||
* */ | |||
public ExtractImplicitRelation() | |||
{ | |||
		//original word in lower case
implicitEntRel.put("american", Globals.pd.predicate_2_id.get("country")); | |||
implicitEntRel.put("united_states", Globals.pd.predicate_2_id.get("country")); | |||
} | |||
	// Notice, this is usually UNNECESSARY for two constants, so the function is left unimplemented.
// eg, "president Obama", "Andy Liu's Hero(film)". | |||
public ArrayList<Integer> getPrefferdPidListBetweenTwoConstant(Word w1, Word w2) | |||
{ | |||
ArrayList<Integer> res = new ArrayList<Integer>(); | |||
int w1Role = 0, w2Role = 0; // 0:var 1:ent 2:type | |||
if(w1.mayEnt && w1.emList.size()>0) | |||
w1Role = 1; | |||
if(w1.mayType && w1.tmList.size()>0) | |||
w1Role = 2; | |||
if(w2.mayEnt && w2.emList.size()>0) | |||
w2Role = 1; | |||
if(w2.mayType && w2.tmList.size()>0) | |||
w2Role = 2; | |||
//Reject variables | two types | |||
if(w1Role == 0 || w2Role == 0 || (w1Role == 2 && w2Role == 2)) | |||
return null; | |||
//ent1 & ent2 | |||
//if(w1Role == 1 && w2Role == 1) | |||
//{ | |||
//EntityFragment ef = null; | |||
// TODO: implement. | |||
//} | |||
return res; | |||
} | |||
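	// Builds extra triples from modifier words (e.g., "Czech movies": the entity Czech
	// modifies the type variable movies), then attaches them to the ranked SPARQLs
	// (or creates a new SPARQL if none exist yet).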
public ArrayList<Triple> supplementTriplesByModifyWord(QueryLogger qlog) | |||
{ | |||
ArrayList<Triple> res = new ArrayList<Triple>(); | |||
ArrayList<Word> typeVariableList = new ArrayList<Word>(); | |||
// Modifier | |||
for(Word word: qlog.s.words) | |||
{ | |||
if(word.modifiedWord != null && word.modifiedWord != word) | |||
{ | |||
ArrayList<ImplicitRelation> irList = null; | |||
// ent -> typeVariable | eg, Chinese actor, Czech movies | TODO: consider more types of modifier | |||
if(word.mayEnt && word.modifiedWord.mayType) | |||
{ | |||
typeVariableList.add(word.modifiedWord); | |||
int tId = word.modifiedWord.tmList.get(0).typeID; // select the top-1 type | |||
String tName = word.modifiedWord.originalForm; | |||
for(int i=0; i<k&&i<word.emList.size(); i++) // select the top-k entities | |||
{ | |||
int eId = word.emList.get(i).entityID; | |||
String eName = word.emList.get(i).entityName; | |||
irList = getPrefferdPidListBetween_Entity_TypeVariable(eId, tId); | |||
					// !Hand-written implicit relations
if(irList != null && implicitEntRel.containsKey(word.originalForm.toLowerCase())) | |||
{ | |||
int pId = implicitEntRel.get(word.originalForm.toLowerCase()); | |||
ImplicitRelation ir = new ImplicitRelation(tId, eId, pId, 1000); | |||
irList.add(0, ir); | |||
} | |||
if(irList!=null && irList.size()>0) | |||
{ | |||
ImplicitRelation ir = irList.get(0); | |||
String subjName = null, objName = null; | |||
Word subjWord = null, objWord = null; | |||
if(ir.subjId == eId) | |||
{ | |||
subjName = eName; | |||
objName = "?"+tName; | |||
subjWord = word; | |||
objWord = word.modifiedWord; | |||
} | |||
else | |||
{ | |||
subjName = "?"+tName; | |||
objName = eName; | |||
subjWord = word.modifiedWord; | |||
objWord = word; | |||
} | |||
Triple triple = new Triple(ir.subjId, subjName, ir.pId, ir.objId, objName, null, ir.score, subjWord, objWord); | |||
res.add(triple); | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
if(qlog.rankedSparqls == null || qlog.rankedSparqls.size() == 0) | |||
{ | |||
if(res != null && res.size() > 0) | |||
{ | |||
Sparql spq = new Sparql(); | |||
for(Triple t: res) | |||
spq.addTriple(t); | |||
// Add type info | |||
for(Word typeVar: typeVariableList) | |||
{ | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+typeVar.originalForm, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeVar.tmList.get(0).typeName, null, 100); | |||
spq.addTriple(triple); | |||
} | |||
qlog.rankedSparqls.add(spq); | |||
} | |||
} | |||
else | |||
{ | |||
			// Supplement every SPARQL with the modifier-derived implicit relations.
for(Sparql spq: qlog.rankedSparqls) | |||
{ | |||
for(Triple t: res) | |||
spq.addTriple(t); | |||
} | |||
} | |||
return res; | |||
} | |||
/* | |||
	 * eg: Czech|ent movies|?type, Chinese|ent actor|?type
	 * type variable + entity -> entities belonging to the type + entity
* */ | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_TypeVariable(Integer entId, Integer typeId) | |||
{ | |||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||
TypeFragment tf = TypeFragment.typeFragments.get(typeId); | |||
EntityFragment ef2 = EntityFragment.getEntityFragmentByEntityId(entId); | |||
if(tf == null || ef2 == null) | |||
{ | |||
System.out.println("Error in getPrefferdPidListBetween_TypeVariable_Entity :Type(" + | |||
TypeFragment.typeId2ShortName.get(typeId) + ") or Entity(" + EntityFragmentFields.entityId2Name.get(entId) + ") no fragments."); | |||
return null; | |||
} | |||
		// sample entities belonging to the type and count their relations | TODO: random selection
int samplingCnt = 0; | |||
HashMap<ImplicitRelation, Integer> irCount = new HashMap<ImplicitRelation, Integer>(); | |||
for(int candidateEid: tf.entSet) | |||
{ | |||
EntityFragment ef1 = EntityFragment.getEntityFragmentByEntityId(candidateEid); | |||
if(ef1 == null) | |||
continue; | |||
ArrayList<ImplicitRelation> tmp = getPrefferdPidListBetween_TwoEntities(ef1, ef2); | |||
if(tmp == null || tmp.size() == 0) | |||
continue; | |||
if(samplingCnt++ > SamplingNumber) | |||
break; | |||
for(ImplicitRelation ir: tmp) | |||
{ | |||
if(ir.subjId == candidateEid) | |||
ir.setSubjectId(Triple.VAR_ROLE_ID); | |||
else if(ir.objId == candidateEid) | |||
ir.setObjectId(Triple.VAR_ROLE_ID); | |||
if(irCount.containsKey(ir)) | |||
irCount.put(ir, irCount.get(ir)+1); | |||
else | |||
irCount.put(ir, 1); | |||
} | |||
} | |||
//sort, get top-k | |||
ByValueComparator bvc = new ByValueComparator(irCount); | |||
List<ImplicitRelation> keys = new ArrayList<ImplicitRelation>(irCount.keySet()); | |||
Collections.sort(keys, bvc); | |||
for(ImplicitRelation ir: keys) | |||
{ | |||
res.add(ir); | |||
if(res.size() >= k) | |||
break; | |||
} | |||
return res; | |||
} | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_TypeVariable(String entName, String typeName) | |||
{ | |||
if(!TypeFragment.typeShortName2IdList.containsKey(typeName) || !EntityFragmentFields.entityName2Id.containsKey(entName)) | |||
return null; | |||
return getPrefferdPidListBetween_Entity_TypeVariable(EntityFragmentFields.entityName2Id.get(entName), TypeFragment.typeShortName2IdList.get(typeName).get(0)); | |||
} | |||
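	// Orders implicit relations by descending occurrence count (most frequent first).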
static class ByValueComparator implements Comparator<ImplicitRelation> { | |||
HashMap<ImplicitRelation, Integer> base_map; | |||
public ByValueComparator(HashMap<ImplicitRelation, Integer> base_map) { | |||
this.base_map = base_map; | |||
} | |||
public int compare(ImplicitRelation arg0, ImplicitRelation arg1) { | |||
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) | |||
return 0; | |||
			// Unbox to int first: comparing boxed Integers with == checks reference equality.
			int c0 = base_map.get(arg0), c1 = base_map.get(arg1);
			if (c0 < c1)
				return 1;
			else if (c0 == c1)
				return 0;
			else
				return -1;
} | |||
} | |||
/* | |||
	 * Notice, this function is in fact never used.
* eg:[What] is in a [chocolate_chip_cookie] | |||
* Just guess by single entity: select the most frequent edge. | |||
* */ | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(Integer entId, String var) | |||
{ | |||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||
EntityFragment ef = null; | |||
ef = EntityFragment.getEntityFragmentByEntityId(entId); | |||
if(ef == null) | |||
{ | |||
System.out.println("Error in getPrefferdPidListBetween_Entity_Variable: Entity No Fragments!"); | |||
return null; | |||
} | |||
// find most frequent inEdge | |||
int pid = findMostFrequentEdge(ef.inEntMap, ef.inEdges); | |||
if(pid != -1) | |||
res.add(new ImplicitRelation(Triple.VAR_ROLE_ID, entId, pid, 100)); | |||
// find most frequent outEdge | |||
pid = findMostFrequentEdge(ef.outEntMap, ef.outEdges); | |||
if(pid != -1) | |||
res.add(new ImplicitRelation(entId, Triple.VAR_ROLE_ID, pid, 100)); | |||
return res; | |||
} | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(String entName, String var) | |||
{ | |||
return getPrefferdPidListBetween_Entity_Variable(EntityFragmentFields.entityName2Id.get(entName), var); | |||
} | |||
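	// Returns the predicate id occurring most often in the entity->predicateList map,
	// or -1 if the map is empty. Note: the edges parameter is currently unused.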
public int findMostFrequentEdge(HashMap<Integer, ArrayList<Integer>> entMap, HashSet<Integer> edges) | |||
{ | |||
int mfPredicateId = -1, maxCount = 0; | |||
HashMap<Integer, Integer> edgeCount = new HashMap<Integer, Integer>(); | |||
for(int key: entMap.keySet()) | |||
{ | |||
for(int edge: entMap.get(key)) | |||
{ | |||
if(!edgeCount.containsKey(edge)) | |||
edgeCount.put(edge, 1); | |||
else | |||
edgeCount.put(edge, edgeCount.get(edge)+1); | |||
if(maxCount < edgeCount.get(edge)) | |||
{ | |||
maxCount = edgeCount.get(edge); | |||
mfPredicateId = edge; | |||
} | |||
} | |||
} | |||
return mfPredicateId; | |||
} | |||
// Unnecessary. | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TypeConstant_Entity(Integer typeId, Integer entId) | |||
{ | |||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||
TypeFragment tf = TypeFragment.typeFragments.get(typeId); | |||
if(tf == null) | |||
{ | |||
System.out.println("Error in getPrefferdPidListBetween_TypeConstant_Entity: Type No Fragments!"); | |||
return null; | |||
} | |||
// subj : ent1 | |||
if(tf.entSet.contains(entId)) | |||
{ | |||
ImplicitRelation ir = new ImplicitRelation(entId, typeId, Globals.pd.typePredicateID, 100); | |||
res.add(ir); | |||
} | |||
return res; | |||
} | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(String eName1, String eName2) | |||
{ | |||
return getPrefferdPidListBetween_TwoEntities(EntityFragmentFields.entityName2Id.get(eName1), EntityFragmentFields.entityName2Id.get(eName2)); | |||
} | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(Integer eId1, Integer eId2) | |||
{ | |||
EntityFragment ef1 = null, ef2 = null; | |||
ef1 = EntityFragment.getEntityFragmentByEntityId(eId1); | |||
ef2 = EntityFragment.getEntityFragmentByEntityId(eId2); | |||
if(ef1 == null || ef2 == null) | |||
{ | |||
System.out.println("Error in GetPrefferdPidListBetweenTwoEntities: Entity No Fragments!"); | |||
return null; | |||
} | |||
return getPrefferdPidListBetween_TwoEntities(ef1,ef2); | |||
} | |||
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(EntityFragment ef1, EntityFragment ef2) | |||
{ | |||
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>(); | |||
if(ef1 == null || ef2 == null) | |||
return null; | |||
int eId1 = ef1.eId; | |||
int eId2 = ef2.eId; | |||
// subj : ent1 | |||
if(ef1.outEntMap.containsKey(eId2)) | |||
{ | |||
ArrayList<Integer> pidList = ef1.outEntMap.get(eId2); | |||
for(int pid: pidList) | |||
{ | |||
// TODO: other score strategy | |||
ImplicitRelation ir = new ImplicitRelation(eId1, eId2, pid, 100); | |||
res.add(ir); | |||
} | |||
} | |||
// subj : ent2 | |||
else if(ef2.outEntMap.containsKey(eId1)) | |||
{ | |||
ArrayList<Integer> pidList = ef2.outEntMap.get(eId1); | |||
for(int pid: pidList) | |||
{ | |||
ImplicitRelation ir = new ImplicitRelation(eId2, eId1, pid, 100); | |||
res.add(ir); | |||
} | |||
} | |||
return res; | |||
} | |||
public static void main(String[] args) throws Exception { | |||
Globals.coreNLP = new CoreNLP(); | |||
Globals.pd = new ParaphraseDictionary(); | |||
try | |||
{ | |||
EntityFragmentFields.load(); | |||
TypeFragment.load(); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
ExtractImplicitRelation eir = new ExtractImplicitRelation(); | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
String name1,name2; | |||
while(true) | |||
{ | |||
System.out.println("Input two node to extract their implicit relations:"); | |||
name1 = br.readLine(); | |||
name2 = br.readLine(); | |||
ArrayList<ImplicitRelation> irList = null; | |||
irList = eir.getPrefferdPidListBetween_TwoEntities(name1, name2); | |||
if(irList == null || irList.size()==0) | |||
System.out.println("Can't find!"); | |||
else | |||
{ | |||
for(ImplicitRelation ir: irList) | |||
{ | |||
int pId = ir.pId; | |||
String p = Globals.pd.getPredicateById(pId); | |||
System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); | |||
System.out.println(ir.subj+"\t"+p+"\t"+ir.obj); | |||
} | |||
} | |||
// irList = eir.getPrefferdPidListBetween_TypeConstant_Entity(name1, name2); | |||
// if(irList == null || irList.size()==0) | |||
// System.out.println("Can't find!"); | |||
// else | |||
// { | |||
// for(ImplicitRelation ir: irList) | |||
// { | |||
// int pId = ir.pId; | |||
// String p = Globals.pd.getPredicateById(pId); | |||
// System.out.println(ir.subj+"\t"+p+"\t"+ir.obj); | |||
// } | |||
// } | |||
// irList = eir.getPrefferdPidListBetween_Entity_Variable(name1, name2); | |||
// if(irList == null || irList.size()==0) | |||
// System.out.println("Can't find!"); | |||
// else | |||
// { | |||
// for(ImplicitRelation ir: irList) | |||
// { | |||
// int pId = ir.pId; | |||
// String p = Globals.pd.getPredicateById(pId); | |||
// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); | |||
// } | |||
// } | |||
// irList = eir.getPrefferdPidListBetween_Entity_TypeVariable(name1, name2); | |||
// if(irList == null || irList.size()==0) | |||
// System.out.println("Can't find!"); | |||
// else | |||
// { | |||
// for(ImplicitRelation ir: irList) | |||
// { | |||
// int pId = ir.pId; | |||
// String p = Globals.pd.getPredicateById(pId); | |||
// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId); | |||
// } | |||
// } | |||
} | |||
} | |||
} |
@@ -0,0 +1,472 @@ | |||
package qa.extract; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import java.util.LinkedList; | |||
import java.util.Queue; | |||
import log.QueryLogger; | |||
import nlp.ds.DependencyTree; | |||
import nlp.ds.DependencyTreeNode; | |||
//import nlp.ds.Word; | |||
import paradict.ParaphraseDictionary; | |||
import qa.Globals; | |||
import rdf.SimpleRelation; | |||
import rdf.PredicateMapping; | |||
import rdf.SemanticRelation; | |||
import rdf.SemanticUnit; | |||
public class ExtractRelation { | |||
public static final int notMatchedCountThreshold = 1; // the bigger, the looser (more relations can be extracted) | |||
public static final int notCoverageCountThreshold = 2; | |||
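	// e.g., with notMatchedCountThreshold = 1, a candidate pattern may have at most one
	// non-stop content word missing from the question's bag of words and still be accepted.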
/* | |||
* Find relations by dependency tree & paraphrases. | |||
* */ | |||
public ArrayList<SimpleRelation> findRelationsBetweenTwoUnit(SemanticUnit su1, SemanticUnit su2, QueryLogger qlog) | |||
{ | |||
DependencyTree T = qlog.s.dependencyTreeStanford; | |||
if(qlog.isMaltParserUsed) | |||
T = qlog.s.dependencyTreeMalt; | |||
DependencyTreeNode n1 = T.getNodeByIndex(su1.centerWord.position), n2 = T.getNodeByIndex(su2.centerWord.position); | |||
ArrayList<DependencyTreeNode> shortestPath = T.getShortestNodePathBetween(n1,n2); | |||
ArrayList<SimpleRelation> ret = new ArrayList<SimpleRelation>(); | |||
HashSet<String> BoW_T = new HashSet<String>(); | |||
HashSet<String> SubBoW_T = new HashSet<String>(); | |||
		// (Fix shortest path) Some cases need words that are NOT on the shortest path | eg: What [be] [ent] (famous) for?
// what-be-[ent], the word [be] is useless but we need (famous) | |||
if(shortestPath.size() == 3 && shortestPath.get(1).word.baseForm.equals("be") && T.nodesList.size() > shortestPath.get(2).word.position) | |||
{ | |||
shortestPath.remove(1); | |||
shortestPath.add(1, T.getNodeByIndex(shortestPath.get(1).word.position + 1)); | |||
} | |||
// Shortest path -> SubBag of Words | |||
for(DependencyTreeNode curNode: shortestPath) | |||
{ | |||
String text = curNode.word.baseForm; | |||
if(!curNode.word.isIgnored && !Globals.stopWordsList.isStopWord(text)) | |||
{ | |||
				//!split merged words | eg, soccer club -> soccer_club (after node recognition) -> soccer club (used when matching paraphrases)
if(curNode.word.mayEnt || curNode.word.mayType) | |||
{ | |||
String [] strArray = curNode.word.baseForm.split("_"); | |||
for(String str: strArray) | |||
SubBoW_T.add(str); | |||
} | |||
else | |||
{ | |||
SubBoW_T.add(text); | |||
} | |||
} | |||
} | |||
		// Dependency tree -> Bag of Words
for (DependencyTreeNode curNode : T.getNodesList()) | |||
{ | |||
if (!curNode.word.isIgnored) | |||
{ | |||
String text = curNode.word.baseForm; | |||
if(curNode.word.mayEnt || curNode.word.mayType) | |||
{ | |||
String [] strArray = curNode.word.baseForm.split("_"); | |||
for(String str: strArray) | |||
BoW_T.add(str); | |||
} | |||
else | |||
{ | |||
BoW_T.add(text); | |||
} | |||
} | |||
} | |||
		// Find candidate patterns by SubBoW_T & invertedIndex
HashSet<String> candidatePatterns = new HashSet<String>(); | |||
for (String curWord : SubBoW_T) | |||
{ | |||
ArrayList<String> postingList = Globals.pd.invertedIndex.get(curWord); | |||
if (postingList != null) | |||
{ | |||
candidatePatterns.addAll(postingList); | |||
} | |||
} | |||
// Check patterns by BoW_P & subtree matching | |||
int notMatchedCount = 0; | |||
HashSet<String> validCandidatePatterns = new HashSet<String>(); | |||
for (String p : candidatePatterns) | |||
{ | |||
String[] BoW_P = p.split(" "); | |||
			notMatchedCount = 0; // number of pattern words unmatched in the question
for (String s : BoW_P) | |||
{ | |||
if (s.length() < 2) | |||
continue; | |||
if (s.startsWith("[")) | |||
continue; | |||
if (Globals.stopWordsList.isStopWord(s)) | |||
continue; | |||
if (!BoW_T.contains(s)) | |||
{ | |||
notMatchedCount ++; | |||
if (notMatchedCount > notMatchedCountThreshold) | |||
break; | |||
} | |||
} | |||
if (notMatchedCount <= notMatchedCountThreshold) | |||
{ | |||
validCandidatePatterns.add(p); | |||
//TODO: to support matching like [soccer_club] | |||
subTreeMatching(p, BoW_P, shortestPath, T, qlog, ret, 'S'); | |||
} | |||
} | |||
// Another chance for [soccer_club] (the relation embedded in nodes) | |||
if(validCandidatePatterns.size() > 0) | |||
{ | |||
if(n1.word.originalForm.contains("_") || n2.word.originalForm.contains("_")) | |||
{ | |||
for (String p : validCandidatePatterns) | |||
{ | |||
String[] BoW_P = p.split(" "); | |||
notMatchedCount = 0; | |||
int mappedCharacterCount = 0; | |||
int matchedWordInArg = 0; | |||
boolean[] matchedFlag = new boolean[BoW_P.length]; | |||
for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;} | |||
int idx = 0; | |||
for (String s : BoW_P) | |||
{ | |||
if(n1.word.baseForm.contains(s) || n2.word.baseForm.contains(s)) // Hit nodes | |||
matchedWordInArg++; | |||
if(BoW_T.contains(s)) | |||
{ | |||
mappedCharacterCount += s.length(); | |||
matchedFlag[idx] = true; | |||
} | |||
idx++; | |||
if (s.length() < 2) | |||
continue; | |||
if (s.startsWith("[")) | |||
continue; | |||
if (Globals.stopWordsList.isStopWord(s)) | |||
continue; | |||
if (!BoW_T.contains(s)) | |||
notMatchedCount ++; | |||
} | |||
					// Succeed if at least 2 pattern words hit the target nodes
if(matchedWordInArg >= 2) | |||
{ | |||
double matched_score = ((double)(BoW_P.length-notMatchedCount))/((double)(BoW_P.length)); | |||
if (matched_score > 0.95) | |||
matched_score *= 10; // award for WHOLE match | |||
						// TODO: this makes a LONGER pattern get a LARGER score, which is sometimes unsuitable | eg, be bear die in
matched_score = matched_score * Math.sqrt(mappedCharacterCount); | |||
SimpleRelation sr = new SimpleRelation(); | |||
sr.arg1Word = n1.word; | |||
sr.arg2Word = n2.word; | |||
sr.relationParaphrase = p; | |||
sr.matchingScore = matched_score; | |||
sr.extractingMethod = 'X'; | |||
if (n1.dep_father2child.endsWith("subj")) | |||
sr.preferredSubj = sr.arg1Word; | |||
sr.arg1Word.setIsCovered(); | |||
sr.arg2Word.setIsCovered(); | |||
sr.setPasList(p, matched_score, matchedFlag); | |||
sr.setPreferedSubjObjOrder(T); | |||
ret.add(sr); | |||
} | |||
} | |||
} | |||
} | |||
return ret; | |||
} | |||
// Core function of paraphrase matching | |||
private void subTreeMatching (String pattern, String[] BoW_P, | |||
ArrayList<DependencyTreeNode> shortestPath, | |||
DependencyTree T, QueryLogger qlog, | |||
ArrayList<SimpleRelation> ret, char extractingMethod) | |||
{ | |||
DependencyTreeNode n1 = shortestPath.get(0); | |||
DependencyTreeNode n2 = shortestPath.get(shortestPath.size()-1); | |||
ParaphraseDictionary pd = Globals.pd; | |||
Queue<DependencyTreeNode> queue = new LinkedList<DependencyTreeNode>(); | |||
queue.add(T.getRoot()); | |||
for(DependencyTreeNode curOuterNode: shortestPath) | |||
{ | |||
outer: | |||
for(String s: BoW_P) | |||
{ | |||
if(s.equals(curOuterNode.word.baseForm)) | |||
{ | |||
// try to match all nodes | |||
ArrayList<DependencyTreeNode> subTreeNodes = new ArrayList<DependencyTreeNode>(); | |||
Queue<DependencyTreeNode> queue2 = new LinkedList<DependencyTreeNode>(); | |||
queue2.add(curOuterNode); | |||
int unMappedLeft = BoW_P.length; | |||
int mappedCharacterCount = 0; | |||
int hitPathCnt = 0; // words in pattern hit the shortest path | |||
					int hitPathBetweenTwoArgCnt = 0; //words in pattern that hit the shortest path, excluding the two target nodes
					double mappedCharacterCountPunishment = 0; // penalty when the pattern contains [[...]] (function-word slots)
DependencyTreeNode curNode; | |||
boolean[] matchedFlag = new boolean[BoW_P.length]; | |||
for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;} | |||
while (unMappedLeft > 0 && (curNode=queue2.poll())!=null) | |||
{ | |||
if (curNode.word.isIgnored) continue; | |||
int idx = 0; | |||
for (String ss : BoW_P) | |||
{ | |||
							// each word in the pattern can be matched only once
if (!matchedFlag[idx]) | |||
{ | |||
// check word | |||
if (ss.equals(curNode.word.baseForm)) | |||
{ | |||
unMappedLeft --; | |||
subTreeNodes.add(curNode); | |||
queue2.addAll(curNode.childrenList); | |||
matchedFlag[idx] = true; | |||
mappedCharacterCount += ss.length(); | |||
if(shortestPath.contains(curNode)) | |||
{ | |||
hitPathCnt++; | |||
if(curNode!=n1 && curNode!=n2) | |||
hitPathBetweenTwoArgCnt++; | |||
} | |||
break; | |||
} | |||
// check POS tag | |||
else if (ss.startsWith("[") && posSame(curNode.word.posTag, ss)) | |||
{ | |||
unMappedLeft --; | |||
subTreeNodes.add(curNode); | |||
queue2.addAll(curNode.childrenList); | |||
matchedFlag[idx] = true; | |||
mappedCharacterCount += curNode.word.baseForm.length(); | |||
mappedCharacterCountPunishment += 0.01; | |||
break; | |||
} | |||
} | |||
idx ++; | |||
} | |||
} | |||
int unMatchedNoneStopWordCount = 0; | |||
int matchedNoneStopWordCount = 0; | |||
for (int idx = 0; idx < BoW_P.length; idx ++) { | |||
if (BoW_P[idx].startsWith("[")) continue; | |||
if (!matchedFlag[idx]) { | |||
if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // unmatched | |||
unMatchedNoneStopWordCount ++; | |||
} | |||
else { | |||
if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // matched | |||
matchedNoneStopWordCount ++; | |||
} | |||
} | |||
if (unMatchedNoneStopWordCount > notMatchedCountThreshold) { | |||
if(qlog.MODE_debug) System.out.println("----But the pattern\"" + pattern + "\" is not a subtree."); | |||
break outer; | |||
} | |||
					// MUST match at least one content word (non-stop-word count > 0)
if (matchedNoneStopWordCount == 0){ | |||
if(qlog.MODE_debug) System.out.println("----But the matching for pattern \"" + pattern + "\" does not have content words."); | |||
break outer; | |||
} | |||
					// If only partially matched and the matched part is itself another pattern, give up the current pattern
if (unMappedLeft > 0) { | |||
StringBuilder subpattern = new StringBuilder(); | |||
for (int idx = 0; idx < BoW_P.length; idx ++) { | |||
if (matchedFlag[idx]) { | |||
subpattern.append(BoW_P[idx]); | |||
subpattern.append(' '); | |||
} | |||
} | |||
subpattern.deleteCharAt(subpattern.length()-1); | |||
						if (pd.nlPattern_2_predicateList.containsKey(subpattern.toString())) { // the map is keyed by String, so convert the StringBuilder first
if(qlog.MODE_debug) System.out.println("----But the partially matched pattern \"" + pattern + "\" is another pattern."); | |||
break outer; | |||
} | |||
} | |||
					// !Preposition | assume there is only one preposition
					// TODO: consider more prepositions | the first preposition may be wrong
DependencyTreeNode prep = null; | |||
for (DependencyTreeNode dtn : subTreeNodes) { | |||
outer2: | |||
for (DependencyTreeNode dtn_child : dtn.childrenList) { | |||
if(pd.prepositions.contains(dtn_child.word.baseForm)) { | |||
prep = dtn_child; | |||
break outer2; | |||
} | |||
} | |||
} | |||
boolean isContained = false; | |||
for(DependencyTreeNode dtn_contain : subTreeNodes) { | |||
if(dtn_contain == prep) isContained = true; | |||
} | |||
if(!isContained && prep != null) { | |||
subTreeNodes.add(prep); | |||
} | |||
// Relation extracted, set COVER flags | |||
for (DependencyTreeNode dtn : subTreeNodes) | |||
{ | |||
dtn.word.isCovered = true; | |||
} | |||
int cnt = 0; | |||
double matched_score = ((double)(BoW_P.length-unMappedLeft))/((double)(BoW_P.length)); | |||
if (matched_score > 0.95) | |||
matched_score *= 10; // Award for WHOLE match | |||
					// The larger the match ratio between pattern and path, the higher the score, especially when the pattern does not cover the two target nodes
if(hitPathCnt != 0) | |||
{ | |||
double hitScore = 1 + (double)hitPathCnt/(double)BoW_P.length; | |||
if(hitPathBetweenTwoArgCnt == hitPathCnt) | |||
hitScore += 1; | |||
						else if(shortestPath.size() >= 4) // If the path is long enough but the pattern still covers the target nodes, penalize
{ | |||
//hitScore = 0.5; | |||
							if(hitPathBetweenTwoArgCnt == 0) // If the path is long enough and the pattern covers only the target nodes, penalize heavily
hitScore = 0.25; | |||
} | |||
matched_score *= hitScore; | |||
} | |||
matched_score = matched_score * Math.sqrt(mappedCharacterCount) - mappedCharacterCountPunishment; // the longer, the better (unsuitable in some cases) | |||
if (qlog.MODE_debug) System.out.println("☆" + pattern + ", score=" + matched_score); | |||
DependencyTreeNode subject = n1; | |||
DependencyTreeNode object = n2; | |||
if (subject != object) | |||
{ | |||
SimpleRelation sr = new SimpleRelation(); | |||
sr.arg1Word = subject.word; | |||
sr.arg2Word = object.word; | |||
sr.relationParaphrase = pattern; | |||
sr.matchingScore = matched_score; | |||
sr.extractingMethod = extractingMethod; | |||
if (subject.dep_father2child.endsWith("subj")) | |||
sr.preferredSubj = sr.arg1Word; | |||
sr.arg1Word.setIsCovered(); | |||
sr.arg2Word.setIsCovered(); | |||
sr.setPasList(pattern, matched_score, matchedFlag); | |||
sr.setPreferedSubjObjOrder(T); | |||
ret.add(sr); | |||
cnt ++; | |||
//String binaryRelation = "<" + subjectString + "> <" + pattern + "> <" + objectString + ">"; | |||
} | |||
if (cnt == 0) break outer; | |||
} | |||
} | |||
} | |||
} | |||
// [[det]], [[num]], [[adj]], [[pro]], [[prp]], [[con]], [[mod]] | |||
public boolean posSame(String tag, String posWithBracket) { | |||
if ( (posWithBracket.charAt(2) == 'd' && tag.equals("DT")) | |||
|| (posWithBracket.charAt(2) == 'n' && tag.equals("CD")) | |||
|| (posWithBracket.charAt(2) == 'a' && (tag.startsWith("JJ") || tag.startsWith("RB"))) | |||
|| (posWithBracket.charAt(2) == 'c' && tag.startsWith("CC"))//TODO: how about "IN: subordinating conjunction"? | |||
|| (posWithBracket.charAt(2) == 'm' && tag.equals("MD"))) { | |||
return true; | |||
} | |||
else if (posWithBracket.charAt(2) == 'p') { | |||
if ( (posWithBracket.charAt(4) == 'o' && tag.startsWith("PR")) | |||
|| (posWithBracket.charAt(4) == 'p' && (tag.equals("IN") || tag.equals("TO")))) { | |||
return true; | |||
} | |||
} | |||
return false; | |||
} | |||
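	// e.g., posSame("DT", "[[det]]") and posSame("TO", "[[prp]]") return true,
	// while posSame("NN", "[[adj]]") returns false.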
public HashMap<Integer, SemanticRelation> groupSimpleRelationsByArgsAndMapPredicate (ArrayList<SimpleRelation> simpleRelations) { | |||
System.out.println("==========Group Simple Relations========="); | |||
HashMap<Integer, SemanticRelation> ret = new HashMap<Integer, SemanticRelation>(); | |||
HashMap<Integer, HashMap<Integer, StringAndDouble>> key2pasMap = new HashMap<Integer, HashMap<Integer, StringAndDouble>>(); | |||
for(SimpleRelation simr : simpleRelations) | |||
{ | |||
int key = simr.getHashCode(); | |||
if (!ret.keySet().contains(key)) | |||
{ | |||
ret.put(key, new SemanticRelation(simr)); | |||
key2pasMap.put(key, new HashMap<Integer, StringAndDouble>()); | |||
} | |||
SemanticRelation semr = ret.get(key); | |||
HashMap<Integer, StringAndDouble> pasMap = key2pasMap.get(key); | |||
			// Used only for display.
if (simr.matchingScore > semr.LongestMatchingScore) | |||
{ | |||
semr.LongestMatchingScore = simr.matchingScore; | |||
semr.relationParaphrase = simr.relationParaphrase; | |||
} | |||
			// for each pid, regardless of which pattern it came from, we only record the highest score and the corresponding pattern.
for (int pid : simr.pasList.keySet()) { | |||
double score = simr.pasList.get(pid); | |||
if (!pasMap.containsKey(pid)) { | |||
pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score)); | |||
} | |||
else if (score > pasMap.get(pid).score) { | |||
pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score)); | |||
} | |||
} | |||
} | |||
for (Integer key : key2pasMap.keySet()) { | |||
SemanticRelation semr = ret.get(key); | |||
HashMap<Integer, StringAndDouble> pasMap = key2pasMap.get(key); | |||
semr.predicateMappings = new ArrayList<PredicateMapping>(); | |||
//System.out.print("<"+semr.arg1Word.getFullEntityName() + "," + semr.arg2Word.getFullEntityName() + ">:"); | |||
for (Integer pid : pasMap.keySet()) | |||
{ | |||
semr.predicateMappings.add(new PredicateMapping(pid, pasMap.get(pid).score, pasMap.get(pid).str)); | |||
//System.out.print("[" + Globals.pd.getPredicateById(pid) + "," + pasMap.get(pid).str + "," + pasMap.get(pid).score + "]"); | |||
} | |||
Collections.sort(semr.predicateMappings); | |||
} | |||
System.out.println("========================================="); | |||
return ret; | |||
} | |||
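	// Illustrative sketch (hypothetical data, not part of the original code):
	// two SimpleRelations over the same argument pair, e.g. ("Obama","US") via
	// pattern "president of" with pasList {12: 0.9} and via pattern "lead" with
	// pasList {12: 0.6, 30: 0.7}, collapse into ONE SemanticRelation whose
	// predicateMappings keep the per-predicate maxima:
	//   [(12, 0.9, "president of"), (30, 0.7, "lead")], sorted by score.
	// The predicate ids 12/30 are made up for the example.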
} | |||
class StringAndDouble { | |||
public String str; | |||
public double score; | |||
public StringAndDouble (String str, double score) { | |||
this.str = str; | |||
this.score = score; | |||
} | |||
} |
@@ -0,0 +1,358 @@ | |||
package qa.extract; | |||
import java.io.BufferedReader; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import nlp.ds.Word; | |||
import nlp.tool.StopWordsList; | |||
//import fgmt.RelationFragment; | |||
import fgmt.TypeFragment; | |||
import lcn.SearchInTypeShortName; | |||
import log.QueryLogger; | |||
import qa.Globals; | |||
import rdf.PredicateMapping; | |||
import rdf.SemanticRelation; | |||
import rdf.Triple; | |||
import rdf.TypeMapping; | |||
/* | |||
* 2016-6-17 | |||
 * 1. Recognize types (including YAGO types)
 * 2. Add some type mappings manually, eg, "US State" -> "yago:StatesOfTheUnitedStates"
 * 3. Add some extended variables (generalizing [variable with inherited type] to [variable with inherited triples]), eg, ?canadian <birthPlace> <Canada>
* */ | |||
public class TypeRecognition { | |||
// dbpedia 2014 | |||
//public static final int[] type_Person = {180,279}; | |||
//public static final int[] type_Place = {49,228}; | |||
//public static final int[] type_Organisation = {419,53}; | |||
//dbpedia 2016 | |||
public static final int[] type_Person = {5828,15985}; | |||
public static final int[] type_Place = {11197,2188}; | |||
public static final int[] type_Organisation = {1335,4716}; | |||
public static HashMap<String, String> extendTypeMap = null; | |||
public static HashMap<String, Triple> extendVariableMap = null; | |||
SearchInTypeShortName st = new SearchInTypeShortName(); | |||
static | |||
{ | |||
extendTypeMap = new HashMap<String, String>(); | |||
extendVariableMap = new HashMap<String, Triple>(); | |||
Triple triple = null; | |||
		//!Hand-written for convenience | TODO: approximate/semantic matching of types
extendTypeMap.put("NonprofitOrganizations", "dbo:Non-ProfitOrganisation"); | |||
extendTypeMap.put("GivenNames", "dbo:GivenName"); | |||
extendTypeMap.put("JamesBondMovies","yago:JamesBondFilms"); | |||
extendTypeMap.put("TVShows", "dbo:TelevisionShow"); | |||
extendTypeMap.put("USState", "yago:StatesOfTheUnitedStates"); | |||
extendTypeMap.put("USStates", "yago:StatesOfTheUnitedStates"); | |||
extendTypeMap.put("Europe", "yago:EuropeanCountries"); | |||
extendTypeMap.put("Africa", "yago:AfricanCountries"); | |||
//!The following IDs are based on DBpedia 2014. | |||
		//!extended variables (embedded triples) | eg, [?E|surfers] -> ?uri dbo:occupation res:Surfing | canadians -> <?canadian> <birthPlace> <Canada>
//1) <?canadians> <birthPlace> <Canada> | [country people] <birthPlace|1639> [country] | |||
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 2112902, "Canada", null, 100); | |||
extendVariableMap.put("canadian", triple); | |||
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 883747, "Germany", null, 100); | |||
extendVariableMap.put("german", triple); | |||
//2) ?bandleader <occupation|6690> <Bandleader> | |||
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436853, "Bandleader", null, 100); | |||
extendVariableMap.put("bandleader", triple); | |||
		triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436854, "Surfing", null, 100);
extendVariableMap.put("surfer", triple); | |||
} | |||
public static void recognizeExtendVariable(Word w) | |||
{ | |||
String key = w.baseForm; | |||
if(extendVariableMap.containsKey(key)) | |||
{ | |||
w.mayExtendVariable = true; | |||
Triple triple = extendVariableMap.get(key).copy(); | |||
if(triple.subjId == Triple.VAR_ROLE_ID && triple.subject.equals(Triple.VAR_NAME)) | |||
triple.subject = "?" + w.originalForm; | |||
if(triple.objId == Triple.VAR_ROLE_ID && triple.object.equals(Triple.VAR_NAME)) | |||
triple.object = "?" + w.originalForm; | |||
w.embbededTriple = triple; | |||
} | |||
} | |||
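	// Illustrative sketch (assumed usage, not part of the original code): for a
	// word whose baseForm is "canadian" and originalForm is "Canadians", the
	// copied template <?VAR_NAME> <birthPlace|1639> <Canada> has its variable
	// placeholder rewritten to <?Canadians>, and the result is stored in
	// w.embbededTriple.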
public ArrayList<TypeMapping> getExtendTypeByStr(String allUpperFormWord) | |||
{ | |||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||
		//Skip SINGLE-word types, i.e., strings where only the first letter is uppercase (most are useless) | eg, Battle, War, Daughter
if(allUpperFormWord.length() > 1 && allUpperFormWord.substring(1).equals(allUpperFormWord.substring(1).toLowerCase())) | |||
return null; | |||
//search in YAGO type | |||
if(TypeFragment.yagoTypeList.contains(allUpperFormWord)) | |||
{ | |||
//YAGO prefix | |||
String typeName = "yago:"+allUpperFormWord; | |||
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); | |||
tmList.add(tm); | |||
} | |||
else if(extendTypeMap.containsKey(allUpperFormWord)) | |||
{ | |||
String typeName = extendTypeMap.get(allUpperFormWord); | |||
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); | |||
tmList.add(tm); | |||
} | |||
if(tmList.size()>0) | |||
return tmList; | |||
else | |||
return null; | |||
} | |||
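	// Illustrative sketch (hypothetical, not part of the original code):
	// getExtendTypeByStr("USState") misses the YAGO list (assumed) but hits
	// extendTypeMap, yielding one TypeMapping with typeName
	// "yago:StatesOfTheUnitedStates"; a single capitalized word such as
	// "Battle" is rejected by the single-word filter above and returns null.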
public ArrayList<TypeMapping> getTypeIDsAndNamesByStr (String baseform) | |||
{ | |||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||
try | |||
{ | |||
tmList = st.searchTypeScore(baseform, 0.4, 0.8, 10); | |||
Collections.sort(tmList); | |||
if (tmList.size()>0) | |||
return tmList; | |||
else | |||
return null; | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
return null; | |||
} | |||
} | |||
public ArrayList<Integer> recognize (String baseform) { | |||
char c = baseform.charAt(baseform.length()-1); | |||
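		// Assumption (added note): a trailing digit is a sense index such as "bank 2",
		// so we strip the final two characters, the digit and the separator before it.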
if (c >= '0' && c <= '9') { | |||
baseform = baseform.substring(0, baseform.length()-2); | |||
} | |||
try { | |||
ArrayList<String> ret = st.searchType(baseform, 0.4, 0.8, 10); | |||
ArrayList<Integer> ret_in = new ArrayList<Integer>(); | |||
for (String s : ret) { | |||
System.out.println("["+s+"]"); | |||
ret_in.addAll(TypeFragment.typeShortName2IdList.get(s)); | |||
} | |||
if (ret_in.size()>0) return ret_in; | |||
else return null; | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
return null; | |||
} | |||
} | |||
public static void AddTypesOfWhwords (HashMap<Integer, SemanticRelation> semanticRelations) { | |||
ArrayList<TypeMapping> ret = null; | |||
for (Integer it : semanticRelations.keySet()) | |||
{ | |||
SemanticRelation sr = semanticRelations.get(it); | |||
if(!sr.arg1Word.mayType) | |||
{ | |||
ret = recognizeSpecial(sr.arg1Word.baseForm); | |||
if (ret != null) | |||
{ | |||
sr.arg1Word.tmList = ret; | |||
} | |||
} | |||
if(!sr.arg2Word.mayType) | |||
{ | |||
ret = recognizeSpecial(sr.arg2Word.baseForm); | |||
if (ret != null) | |||
{ | |||
sr.arg2Word.tmList = ret; | |||
} | |||
} | |||
} | |||
} | |||
public static ArrayList<TypeMapping> recognizeSpecial (String wordSpecial) | |||
{ | |||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||
if (wordSpecial.toLowerCase().equals("who")) | |||
{ | |||
for (Integer i : type_Person) | |||
{ | |||
tmList.add(new TypeMapping(i,"Person",1)); | |||
} | |||
//"who" can also means organization | |||
for (Integer i : type_Organisation) | |||
{ | |||
tmList.add(new TypeMapping(i,"Organization",1)); | |||
} | |||
return tmList; | |||
} | |||
else if (wordSpecial.toLowerCase().equals("where")) | |||
{ | |||
for (Integer i : type_Place) | |||
{ | |||
tmList.add(new TypeMapping(i,"Place",1)); | |||
} | |||
for (Integer i : type_Organisation) | |||
{ | |||
tmList.add(new TypeMapping(i,"Organization",1)); | |||
} | |||
return tmList; | |||
} | |||
//TODO: When ... | |||
return null; | |||
} | |||
/* | |||
	 * 1. Priority: mayEnt (uppercase) > mayType > mayEnt
	 * 2. mayEnt=1: constant
	 * 3. mayType=1:
	 * (1) Variable: a triple will be added during evaluation. | eg, Which [books] by Kerouac were published by Viking Press?
	 * (2) Constant: it modifies other words. | eg, Are tree frogs a type of [amphibian]?
	 * 4. Extended variable (a variable with embedded triples)
* */ | |||
public static void constantVariableRecognition(HashMap<Integer, SemanticRelation> semanticRelations, QueryLogger qlog) | |||
{ | |||
Word[] words = qlog.s.words; | |||
//NOTICE: modifiers(implicit relation) have not been considered. | |||
for (Integer it : semanticRelations.keySet()) | |||
{ | |||
SemanticRelation sr = semanticRelations.get(it); | |||
int arg1WordPos = sr.arg1Word.position - 1; | |||
int arg2WordPos = sr.arg2Word.position - 1; | |||
// extend variable recognition | |||
recognizeExtendVariable(sr.arg1Word); | |||
recognizeExtendVariable(sr.arg2Word); | |||
// constant or variable | |||
if(sr.arg1Word.mayExtendVariable) | |||
{ | |||
//eg, ?canadian <birthPlace> <Canada> (both extendVariable & type) | |||
if(sr.arg1Word.mayType) | |||
sr.arg1Word.mayType = false; | |||
if(sr.arg1Word.mayEnt) | |||
{ | |||
				//rule: [extendVariable & ent] + noun -> ent | eg, Canadian movies -> ent:Canada
if(arg1WordPos+1 < words.length && words[arg1WordPos+1].posTag.startsWith("N")) | |||
{ | |||
sr.arg1Word.mayExtendVariable = false; | |||
sr.isArg1Constant = true; | |||
} | |||
else | |||
{ | |||
sr.arg1Word.mayEnt = false; | |||
} | |||
} | |||
} | |||
// type | |||
else if(sr.arg1Word.mayType) | |||
{ | |||
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries | |||
if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of")) | |||
&& !words[arg1WordPos-2].posTag.startsWith("V")) | |||
{ | |||
sr.isArg1Constant = true; | |||
double largerScore = 1000; | |||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
largerScore = sr.predicateMappings.get(0).score * 2; | |||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
sr.predicateMappings.add(0,nPredicate); | |||
//constant type should be object | |||
sr.preferredSubj = sr.arg2Word; | |||
} | |||
} | |||
//ent: constant | |||
else if(sr.arg1Word.mayEnt) | |||
{ | |||
sr.isArg1Constant = true; | |||
} | |||
// constant or variable | |||
if(sr.arg2Word.mayExtendVariable) | |||
{ | |||
if(sr.arg2Word.mayType) | |||
sr.arg2Word.mayType = false; | |||
if(sr.arg2Word.mayEnt) | |||
{ | |||
if(arg2WordPos+1 < words.length && words[arg2WordPos+1].posTag.startsWith("N")) | |||
{ | |||
sr.arg2Word.mayExtendVariable = false; | |||
sr.isArg2Constant = true; | |||
} | |||
else | |||
{ | |||
sr.arg2Word.mayEnt = false; | |||
} | |||
} | |||
} | |||
// type | |||
else if(sr.arg2Word.mayType) | |||
{ | |||
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries | |||
if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of")) | |||
&& !words[arg2WordPos-2].posTag.startsWith("V") ) | |||
{ | |||
sr.isArg2Constant = true; | |||
double largerScore = 1000; | |||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
largerScore = sr.predicateMappings.get(0).score * 2; | |||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
sr.predicateMappings.add(0,nPredicate); | |||
sr.preferredSubj = sr.arg1Word; | |||
} | |||
//rule: Be ... a type? | |||
if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a")) | |||
{ | |||
sr.isArg2Constant = true; | |||
double largerScore = 1000; | |||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
largerScore = sr.predicateMappings.get(0).score * 2; | |||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
sr.predicateMappings.add(0,nPredicate); | |||
sr.preferredSubj = sr.arg1Word; | |||
} | |||
} | |||
else if(sr.arg2Word.mayEnt) | |||
{ | |||
sr.isArg2Constant = true; | |||
} | |||
if(sr.arg1Word != sr.preferredSubj) | |||
sr.swapArg1Arg2(); | |||
} | |||
} | |||
public static void main (String[] args) | |||
{ | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
String type = "space mission"; | |||
try | |||
{ | |||
TypeFragment.load(); | |||
Globals.stopWordsList = new StopWordsList(); | |||
TypeRecognition tr = new TypeRecognition(); | |||
while(true) | |||
{ | |||
System.out.print("Input query type: "); | |||
type = br.readLine(); | |||
tr.recognize(type); | |||
} | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,690 @@ | |||
package qa.mapping; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import java.util.Iterator; | |||
import qa.Globals; | |||
import rdf.Sparql; | |||
import rdf.Triple; | |||
import fgmt.EntityFragment; | |||
import fgmt.RelationFragment; | |||
import fgmt.TypeFragment; | |||
import fgmt.VariableFragment; | |||
/** | |||
 * Notice: one CompatibilityChecker can only be used once, to check a single SPARQL query.
* @author husen | |||
*/ | |||
public class CompatibilityChecker { | |||
static int EnumerateThreshold = 1000; | |||
public EntityFragmentDict efd = null; | |||
public HashMap<String, VariableFragment> variable_fragment = null; | |||
public CompatibilityChecker(EntityFragmentDict efd) { | |||
this.efd = efd; | |||
variable_fragment = new HashMap<String, VariableFragment>(); | |||
} | |||
	// Run this check function after the "single triple check" passes. (re-coded)
	// Re-coded: variables are bound to suitable entities based on the in-memory index. Note the case where a variable is a literal.
public boolean isSparqlCompatible3 (Sparql spq) | |||
{ | |||
		boolean[] isFixed = new boolean[spq.tripleList.size()]; // records whether each triple's compatibility still needs to be checked
for (int i = 0; i < spq.tripleList.size(); i ++) { | |||
isFixed[i] = false; | |||
} | |||
//System.out.println("tripleList size="+spq.tripleList.size()); | |||
Iterator<Triple> it; | |||
boolean shouldContinue = true; | |||
		// shouldContinue when a triple with variables updates a variable fragment; the updated fragment is then used to re-check the previous triples
while (shouldContinue) | |||
{ | |||
shouldContinue = false; | |||
it = spq.tripleList.iterator(); | |||
int t_cnt = 0; | |||
while (it.hasNext()) { | |||
Triple t = it.next(); | |||
switch (getTripleType(t)) { | |||
case 1: // (1) E1, P, E2 | |||
if (!isFixed[t_cnt]) | |||
{ | |||
int ret = hs_check1_E1PE2(t); | |||
if (ret == 0) | |||
isFixed[t_cnt] = true; | |||
else if (ret == 5) | |||
return false; | |||
} | |||
break; | |||
case 2: // (2) E, P, V | |||
if(!isFixed[t_cnt]) | |||
{ | |||
int ret = hs_check2_EPV(t); | |||
if (ret == 5) | |||
return false; | |||
else | |||
{ | |||
							isFixed[t_cnt] = true; // Now V has its candidate entities set (or is a literal); note E/P->V may not be unique, eg, a film's starring
if (ret == 1) | |||
shouldContinue = true; | |||
} | |||
} | |||
break; | |||
case 3: // (3) E, <type1>, T | |||
if (!isFixed[t_cnt]) | |||
{ | |||
int ret = check3_Etype1T(t); | |||
if (ret == -2) return false; | |||
if (ret == 0) isFixed[t_cnt] = true; | |||
} | |||
break; | |||
case 4: // (4) V, P, E | |||
if(!isFixed[t_cnt]) | |||
{ | |||
int ret = hs_check4_VPE(t); | |||
if (ret == 5) | |||
return false; | |||
else | |||
{ | |||
							isFixed[t_cnt] = true; // Now V has its candidate entities set (or is a literal); note E/P->V may not be unique, eg, a film's starring
if (ret == 1) | |||
shouldContinue = true; | |||
} | |||
} | |||
break; | |||
case 5: // (5) V1, P, V2 (The most important and time consuming) | |||
if(!isFixed[t_cnt]) | |||
{ | |||
int ret = hs_check5_V1PV2(t); | |||
if (ret == 5) | |||
return false; | |||
else | |||
{ | |||
							isFixed[t_cnt] = true; // set just once, never re-checked
if (ret == 1) | |||
shouldContinue = true; | |||
} | |||
} | |||
break; | |||
case 6: // (6) V, <type1>, T | |||
if (!isFixed[t_cnt]) | |||
{ | |||
int ret = hs_check6_Vtype1T(t); | |||
if (ret == -2) return false; | |||
else | |||
{ | |||
isFixed[t_cnt] = true; | |||
if (ret == 1) | |||
shouldContinue = true; | |||
} | |||
} | |||
break; | |||
case 7: | |||
// do nothing | |||
break; | |||
case 8: | |||
default: | |||
return false; | |||
} | |||
t_cnt ++; | |||
} | |||
} | |||
return true; | |||
} | |||
/** | |||
* Get Triple's category | |||
* (1) E1, P, E2 | |||
* (2) E, P, V | |||
* (3) E, <type>, T | |||
* (4) V, P, E | |||
* (5) V1, P, V2 | |||
* (6) V, <type>, T | |||
* (7) E, <type>, V | |||
* (8) error | |||
* | |||
* E: Entity | |||
* P: Predicate (exclude <type>) | |||
* V: Variable | |||
* T: Type | |||
* | |||
* @param t | |||
* @return | |||
*/ | |||
public int getTripleType (Triple t) { | |||
if (t.predicateID == Globals.pd.typePredicateID) { | |||
boolean s = t.subject.startsWith("?"); | |||
boolean o = t.object.startsWith("?"); | |||
if (s && !o) return 6; | |||
else if (o && !s) return 7; | |||
else if (!s && !o) return 3; | |||
else return 8; | |||
} | |||
else if (t.subject.startsWith("?")) { | |||
if (t.object.startsWith("?")) return 5; | |||
else return 4; | |||
} | |||
else { | |||
if (t.object.startsWith("?")) return 2; | |||
else return 1; | |||
} | |||
} | |||
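	// Illustrative sketch of the classification above (hypothetical triples):
	//   <Berlin> <country> ?c    -> 2 (E, P, V)
	//   ?c <country> <Germany>   -> 4 (V, P, E)
	//   ?x <type1> <City>        -> 6 (V, <type>, T)
	//   ?x <starring> ?y         -> 5 (V1, P, V2)
	// where <type1> stands for the predicate with id typePredicateID.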
public int hs_check1_E1PE2(Triple t) | |||
{ | |||
int pid = t.predicateID; | |||
EntityFragment E1 = efd.getEntityFragmentByEid(t.subjId); | |||
EntityFragment E2 = efd.getEntityFragmentByEid(t.objId); | |||
// E2 is E1's one depth neighbor, connected with predicate "p" | |||
if(E1.outEntMap.containsKey(E2.eId)) | |||
{ | |||
ArrayList<Integer> pList = E1.outEntMap.get(E2.eId); | |||
if(pList.contains(pid)) | |||
return 0; | |||
} | |||
return 5; | |||
} | |||
public int hs_check2_EPV(Triple t) | |||
{ | |||
int pid = t.predicateID; | |||
EntityFragment E = efd.getEntityFragmentByEid(t.subjId); | |||
VariableFragment V = variable_fragment.get(t.object); | |||
// P ∈ E.outEdges | |||
if (!E.outEdges.contains(pid)) { | |||
return 5; | |||
} | |||
		// Set V; note it may be a literal
if(V == null) | |||
{ | |||
variable_fragment.put(t.object, new VariableFragment()); | |||
V = variable_fragment.get(t.object); | |||
for(int vid: E.outEntMap.keySet()) | |||
{ | |||
if(E.outEntMap.get(vid).contains(pid)) | |||
{ | |||
V.candEntities.add(vid); | |||
} | |||
} | |||
			// E's outEdges contain p, but no neighbor entity is reachable via p, so V may be a literal
if(V.candEntities.size() == 0) | |||
{ | |||
V.mayLiteral = true; | |||
return 0; | |||
} | |||
} | |||
else | |||
{ | |||
			// just accept if V is a literal, because fragments do not store literal information
if(V.mayLiteral) | |||
return 0; | |||
// Update V's binding by current neighbor of E | |||
HashSet<Integer> newCandEntities = new HashSet<Integer>(); | |||
if(V.candEntities.size() > 0 && V.candEntities.size() < E.outEntMap.size()) | |||
{ | |||
for(int vid: V.candEntities) | |||
{ | |||
if(E.outEntMap.containsKey(vid) && E.outEntMap.get(vid).contains(pid)) | |||
{ | |||
newCandEntities.add(vid); | |||
} | |||
} | |||
} | |||
else | |||
{ | |||
for(int vid: E.outEntMap.keySet()) | |||
{ | |||
if(E.outEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid))) | |||
{ | |||
newCandEntities.add(vid); | |||
} | |||
} | |||
} | |||
V.candEntities = newCandEntities; | |||
} | |||
if(V.candEntities.size() > 0) | |||
return 0; | |||
else | |||
return 5; | |||
} | |||
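	// Illustrative sketch (hypothetical data, not part of the original code):
	// for <Germany> <capital> ?v the first visit binds ?v's candEntities to
	// every <capital>-neighbor of <Germany>; if another triple later constrains
	// ?v through this method again, the candidate set can only shrink
	// (intersection), and an empty result rejects the whole SPARQL (return 5).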
public int check3_Etype1T(Triple t) { | |||
		String[] T = t.object.split("\\|"); // note: "|" must be escaped in the regex
EntityFragment E = efd.getEntityFragmentByEid(t.subjId); | |||
String newTypeString = ""; | |||
boolean contained = false; | |||
		// check whether each type in T is proper for E
if (T.length == 0) return -2; | |||
for (String s : T) { | |||
contained = false; | |||
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) { | |||
if (E.types.contains(i)) { | |||
if (!contained) { | |||
contained = true; | |||
newTypeString += s; | |||
newTypeString += "|"; | |||
} | |||
} | |||
} | |||
} | |||
if (newTypeString.length() > 1) { | |||
t.object = newTypeString.substring(0, newTypeString.length()-1); | |||
return 0; | |||
} | |||
else return -2; | |||
} | |||
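	// Illustrative sketch (hypothetical, not part of the original code): if
	// t.object is "Person|Artist" and E carries only the type ids mapped from
	// "Person", the object string is rewritten to "Person" and the check
	// returns 0; if no listed type matches E.types, the method returns -2 and
	// the SPARQL is rejected.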
public int hs_check4_VPE(Triple t) | |||
{ | |||
int pid = t.predicateID; | |||
EntityFragment E = efd.getEntityFragmentByEid(t.objId); | |||
VariableFragment V = variable_fragment.get(t.subject); | |||
TypeFragment subjTf = SemanticItemMapping.getTypeFragmentByWord(t.getSubjectWord()); | |||
// P ∈ E.inEdges | |||
if (!E.inEdges.contains(pid)) { | |||
return 5; | |||
} | |||
// Set V, notice V cannot be literal, because now V is subject | |||
if(V == null) | |||
{ | |||
variable_fragment.put(t.subject, new VariableFragment()); | |||
V = variable_fragment.get(t.subject); | |||
for(int vid: E.inEntMap.keySet()) | |||
{ | |||
if(E.inEntMap.get(vid).contains(pid) && (subjTf == null || subjTf.entSet.contains(vid))) | |||
{ | |||
V.candEntities.add(vid); | |||
} | |||
} | |||
			// E's inEdges contain p, but no neighbor entity is reachable via p; V is the subject and cannot be a literal, so the match fails
if(V.candEntities.size() == 0) | |||
{ | |||
return 5; | |||
} | |||
} | |||
else | |||
{ | |||
// if V is literal, fail because subject cannot be literal | |||
if(V.mayLiteral) | |||
return 5; | |||
// update V's binding by current E's neighbors | |||
HashSet<Integer> newCandEntities = new HashSet<Integer>(); | |||
if(V.candEntities.size() > 0 && V.candEntities.size() < E.inEntMap.size()) | |||
{ | |||
for(int vid: V.candEntities) | |||
{ | |||
if(E.inEntMap.containsKey(vid) && E.inEntMap.get(vid).contains(pid)) | |||
{ | |||
newCandEntities.add(vid); | |||
} | |||
} | |||
} | |||
else | |||
{ | |||
for(int vid: E.inEntMap.keySet()) | |||
{ | |||
if(E.inEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid))) | |||
{ | |||
newCandEntities.add(vid); | |||
} | |||
} | |||
} | |||
V.candEntities = newCandEntities; | |||
} | |||
if(V.candEntities.size() > 0) | |||
return 0; | |||
else | |||
return 5; | |||
} | |||
public int check5_V1PV2(Triple t) { | |||
ArrayList<Integer> pidList = new ArrayList<Integer>(); | |||
pidList.add(t.predicateID); | |||
VariableFragment V1 = variable_fragment.get(t.subject); | |||
VariableFragment V2 = variable_fragment.get(t.object); | |||
		// V1 & V2's types must match the in/out types of some fragment of P
Iterator<Integer> it_int = pidList.iterator(); | |||
ArrayList<HashSet<Integer>> newCandTypes1 = new ArrayList<HashSet<Integer>>(); | |||
ArrayList<HashSet<Integer>> newCandTypes2 = new ArrayList<HashSet<Integer>>(); | |||
while (it_int.hasNext()) { | |||
Integer i = it_int.next(); | |||
ArrayList<RelationFragment> flist = RelationFragment.relFragments.get(i); | |||
Iterator<RelationFragment> it_rln = flist.iterator(); | |||
while (it_rln.hasNext()) { | |||
RelationFragment rf = it_rln.next(); | |||
if (V1 == null && V2 == null) { | |||
newCandTypes1.add(rf.inTypes); | |||
newCandTypes2.add(rf.outTypes); | |||
} | |||
else if (V1 == null && V2 != null) { | |||
if (V2.containsAll(rf.outTypes)) { | |||
newCandTypes1.add(rf.inTypes); | |||
newCandTypes2.add(rf.outTypes); | |||
} | |||
} | |||
else if (V2 == null && V1 != null) { | |||
if (V1.containsAll(rf.inTypes)) { | |||
newCandTypes1.add(rf.inTypes); | |||
newCandTypes2.add(rf.outTypes); | |||
} | |||
} | |||
else { | |||
if (V1.containsAll(rf.inTypes) && V2.containsAll(rf.outTypes)) | |||
{ | |||
newCandTypes1.add(rf.inTypes); | |||
newCandTypes2.add(rf.outTypes); | |||
} | |||
} | |||
} | |||
} | |||
if (newCandTypes1.size() > 0 && newCandTypes2.size() > 0) { | |||
if (V1 == null && V2 == null) { | |||
variable_fragment.put(t.subject, new VariableFragment()); | |||
variable_fragment.get(t.subject).candTypes = newCandTypes1; | |||
variable_fragment.put(t.object, new VariableFragment()); | |||
variable_fragment.get(t.object).candTypes = newCandTypes2; | |||
return 1; | |||
} | |||
else if (V1 == null && V2 != null) { | |||
variable_fragment.put(t.subject, new VariableFragment()); | |||
variable_fragment.get(t.subject).candTypes = newCandTypes1; | |||
if (V2.candTypes.size() > newCandTypes2.size()) { | |||
V2.candTypes = newCandTypes2; | |||
return 1; | |||
} | |||
else return 0; | |||
} | |||
else if (V2 == null && V1 != null) { | |||
variable_fragment.put(t.object, new VariableFragment()); | |||
variable_fragment.get(t.object).candTypes = newCandTypes2; | |||
if (V1.candTypes.size() > newCandTypes1.size()) { | |||
V1.candTypes = newCandTypes1; | |||
return 1; | |||
} | |||
else return 0; | |||
} | |||
else { | |||
if (V1.candTypes.size() > newCandTypes1.size() || V2.candTypes.size() > newCandTypes2.size()) { | |||
V1.candTypes = newCandTypes1; | |||
V2.candTypes = newCandTypes2; | |||
return 1; | |||
} | |||
else return 0; | |||
} | |||
} | |||
else return 5; | |||
} | |||
public int hs_check5_V1PV2(Triple t) | |||
{ | |||
int pid = t.predicateID; | |||
VariableFragment V1 = variable_fragment.get(t.subject); | |||
VariableFragment V2 = variable_fragment.get(t.object); | |||
		if(V1 == null && V2 == null) // The WORST case: the relation fragment has no records of the two target entities and we cannot check without types, so this triple should be put at the end
{ | |||
			return 0; // should in fact return 1; we expect the still-unchecked triples to provide candidates for V1 and V2 so this triple can be checked in the next round
} | |||
else if(V2 == null) | |||
{ | |||
if(V1.mayLiteral) | |||
return 5; | |||
variable_fragment.put(t.object, new VariableFragment()); | |||
V2 = variable_fragment.get(t.object); | |||
HashSet<Integer> newV1cands = new HashSet<Integer>(); | |||
int cnt = 0; | |||
for(int v1id: V1.candEntities) | |||
{ | |||
cnt++; | |||
if(cnt > EnumerateThreshold) | |||
break; | |||
EntityFragment E = efd.getEntityFragmentByEid(v1id); | |||
if(E != null && E.outEdges.contains(pid)) | |||
{ | |||
newV1cands.add(v1id); | |||
for(int v2id: E.outEntMap.keySet()) | |||
{ | |||
if(E.outEntMap.get(v2id).contains(pid)) | |||
V2.candEntities.add(v2id); | |||
} | |||
} | |||
} | |||
V1.candEntities = newV1cands; | |||
} | |||
else if(V1 == null) | |||
{ | |||
if(V2.mayLiteral) | |||
return 0; | |||
variable_fragment.put(t.subject, new VariableFragment()); | |||
V1 = variable_fragment.get(t.subject); | |||
HashSet<Integer> newV2cands = new HashSet<Integer>(); | |||
int cnt = 0; | |||
for(int v2id: V2.candEntities) | |||
{ | |||
cnt++; | |||
if(cnt > EnumerateThreshold) | |||
break; | |||
EntityFragment E = efd.getEntityFragmentByEid(v2id); | |||
if(E != null && E.inEdges.contains(pid)) | |||
{ | |||
newV2cands.add(v2id); | |||
for(int v1id: E.inEntMap.keySet()) | |||
{ | |||
if(E.inEntMap.get(v1id).contains(pid)) | |||
V1.candEntities.add(v1id); | |||
} | |||
} | |||
} | |||
V2.candEntities = newV2cands; | |||
} | |||
else | |||
{ | |||
if(V1.mayLiteral) | |||
return 5; | |||
if(V2.mayLiteral) | |||
return 0; | |||
HashSet<Integer> newV1cands = new HashSet<Integer>(); | |||
HashSet<Integer> newV2cands = new HashSet<Integer>(); | |||
for(int v1id: V1.candEntities) | |||
{ | |||
EntityFragment E1 = efd.getEntityFragmentByEid(v1id); | |||
if(E1 != null && E1.outEdges.contains(pid)) | |||
newV1cands.add(v1id); | |||
} | |||
V1.candEntities = newV1cands; | |||
for(int v2id: V2.candEntities) | |||
{ | |||
EntityFragment E2 = efd.getEntityFragmentByEid(v2id); | |||
if(E2 != null && E2.inEdges.contains(pid)) | |||
newV2cands.add(v2id); | |||
} | |||
V2.candEntities = newV2cands; | |||
newV1cands = new HashSet<Integer>(); | |||
newV2cands = new HashSet<Integer>(); | |||
for(int v1id: V1.candEntities) | |||
{ | |||
EntityFragment E1 = efd.getEntityFragmentByEid(v1id); | |||
for(int v2id: V2.candEntities) | |||
{ | |||
if(E1.outEntMap.containsKey(v2id) && E1.outEntMap.get(v2id).contains(pid)) | |||
{ | |||
newV1cands.add(v1id); | |||
newV2cands.add(v2id); | |||
} | |||
} | |||
} | |||
V1.candEntities = newV1cands; | |||
V2.candEntities = newV2cands; | |||
} | |||
if(V1.candEntities.size() == 0 || (V2.candEntities.size() == 0 && !RelationFragment.isLiteral(pid))) | |||
return 5; | |||
else | |||
return 0; | |||
} | |||
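	// Illustrative note (added, not in the original code): when both variables
	// are already bound, candidates are first filtered by edge existence and
	// then by pairwise adjacency, an O(|V1| * |V2|) loop; the cases with one
	// unbound variable cap enumeration at EnumerateThreshold (1000) entities.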
public int check6_Vtype1T(Triple t) { | |||
		String[] T = t.object.split("\\|"); // notice "|" needs escaping as "\\|"
VariableFragment V = variable_fragment.get(t.subject); | |||
String newTypeString = ""; | |||
boolean contained = false; | |||
// check whether each type in T is proper for V | |||
if (T.length == 0) return -2; | |||
ArrayList<HashSet<Integer>> newCandTypes = new ArrayList<HashSet<Integer>>(); | |||
for (String s : T) | |||
{ | |||
contained = false; | |||
			//YAGO types (not encoded), just return because we have no index to check them
if(!TypeFragment.typeShortName2IdList.containsKey(s)) | |||
return 0; | |||
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) | |||
{ | |||
if (V == null) { | |||
				// constrain V by the user-given types; flag it because the type data may be incomplete
HashSet<Integer> set = new HashSet<Integer>(); | |||
set.add(i); | |||
set.add(VariableFragment.magic_number); | |||
newCandTypes.add(set); | |||
if (!contained) { | |||
contained = true; | |||
newTypeString += s; | |||
newTypeString += "|"; | |||
} | |||
} | |||
else if (V.contains(i)) { | |||
if (!contained) { | |||
contained = true; | |||
newTypeString += s; | |||
newTypeString += "|"; | |||
} | |||
} | |||
} | |||
} | |||
// check whether each fragment in V is proper for T | |||
// if not, delete the fragment (that means we can narrow the scope) | |||
ArrayList<HashSet<Integer>> deleteCandTypes = new ArrayList<HashSet<Integer>>(); | |||
if (V != null) | |||
{ | |||
Iterator<HashSet<Integer>> it = V.candTypes.iterator(); | |||
while(it.hasNext()) { | |||
HashSet<Integer> set = it.next(); | |||
boolean isCandTypeOkay = false; | |||
				//V got [constraint types] through other triples; at least one type must survive, otherwise delete the [constraint types]
for (String s : T) | |||
{ | |||
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) { | |||
if (set.contains(i)) { | |||
isCandTypeOkay = true; | |||
break; | |||
} | |||
} | |||
} | |||
if (!isCandTypeOkay) { | |||
deleteCandTypes.add(set); | |||
} | |||
} | |||
V.candTypes.removeAll(deleteCandTypes); | |||
} | |||
if (V == null) { | |||
variable_fragment.put(t.subject, new VariableFragment()); | |||
variable_fragment.get(t.subject).candTypes = newCandTypes; | |||
} | |||
if (newTypeString.length() > 1) { | |||
t.object = newTypeString.substring(0, newTypeString.length()-1); | |||
if (deleteCandTypes.size() > 0) { | |||
return 1; | |||
} | |||
else { | |||
return 0; | |||
} | |||
} | |||
else return -2; | |||
} | |||
public int hs_check6_Vtype1T(Triple t) | |||
{ | |||
		String[] tList = t.object.split("\\|"); // note: "|" must be escaped in the regex
VariableFragment V = variable_fragment.get(t.subject); | |||
if (tList.length == 0) return -2; | |||
// Simplify, only consider the first one | |||
if(!TypeFragment.typeShortName2IdList.containsKey(tList[0])) | |||
return 0; | |||
int tid = TypeFragment.typeShortName2IdList.get(tList[0]).get(0); | |||
TypeFragment T = TypeFragment.typeFragments.get(tid); | |||
if(V == null) | |||
{ | |||
variable_fragment.put(t.subject, new VariableFragment()); | |||
V = variable_fragment.get(t.subject); | |||
V.candEntities = T.entSet; | |||
} | |||
else | |||
{ | |||
if(V.mayLiteral) //literal cannot be subject | |||
return -2; | |||
HashSet<Integer> newVcands = new HashSet<Integer>(); | |||
for(int vid: V.candEntities) | |||
{ | |||
EntityFragment E = efd.getEntityFragmentByEid(vid); | |||
if(E.types.contains(tid)) | |||
newVcands.add(vid); | |||
} | |||
V.candEntities = newVcands; | |||
} | |||
if(V.candEntities.size() == 0) | |||
return -2; | |||
else | |||
return 0; | |||
} | |||
public void swapTriple (Triple t) { | |||
String temp = t.subject; | |||
t.subject = t.object; | |||
t.object = temp; | |||
} | |||
}
@@ -0,0 +1,164 @@ | |||
package qa.mapping; | |||
import java.io.BufferedReader; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import lcn.EntityFragmentFields; | |||
import log.QueryLogger; | |||
import org.apache.commons.httpclient.HttpClient; | |||
import org.apache.commons.httpclient.HttpException; | |||
import org.apache.commons.httpclient.methods.GetMethod; | |||
import fgmt.EntityFragment; | |||
import rdf.EntityMapping; | |||
public class DBpediaLookup { | |||
	//There are two public endpoints for the DBpedia Lookup online service.
//public static final String baseURL = "http://en.wikipedia.org/w/api.php?action=opensearch&format=xml&limit=10&search="; | |||
//public static final String baseURL = "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString="; | |||
public static final String baseURL = "http://172.31.222.72:1234/api/search/KeywordSearch?MaxHits=5&QueryString="; | |||
public HttpClient ctripHttpClient = null; | |||
//public static final String begin = "<Text xml:space=\"preserve\">"; | |||
//public static final String begin = "<Result>\n <Label>"; | |||
public static final String begin = "<Result>\n <Label>"; | |||
public static final int begin_length = begin.length(); | |||
//public static final String end = "</Text>"; | |||
public static final String end = "</Label>"; | |||
public static final int end_length = end.length(); | |||
	public static HashMap<String, String> entMentionDict = null; // TODO: build the mention2ent dictionary from redirect data & Wikipedia click data; for now it is built manually
public DBpediaLookup() | |||
{ | |||
ctripHttpClient = new HttpClient(); | |||
ctripHttpClient.setTimeout(3000); | |||
entMentionDict = new HashMap<String, String>(); | |||
entMentionDict.put("Prince_Charles", "Charles,_Prince_of_Wales"); | |||
} | |||
public ArrayList<EntityMapping> getEntityMappings(String searchString, QueryLogger qlog) | |||
{ | |||
ArrayList<String> slist = new ArrayList<String>(); | |||
if(entMentionDict.containsKey(searchString)) | |||
slist.add(entMentionDict.get(searchString)); | |||
else | |||
slist = lookForEntityNames(searchString, qlog); | |||
		if (slist.size() == 0 && searchString.contains(". "))
			slist.addAll(lookForEntityNames(searchString.replace(". ", "."), qlog)); // use literal replace: "." is a regex metacharacter in replaceAll
ArrayList<EntityMapping> emlist = new ArrayList<EntityMapping>(); | |||
		// The search string uses "_" as its delimiter (original form)
String[] sa = searchString.split("_"); | |||
int UpperCnt = 0; | |||
for(String str: sa) | |||
{ | |||
if( (str.charAt(0)>='A'&&str.charAt(0)<='Z') || (str.charAt(0)>='0'&&str.charAt(0)<='9') ) | |||
UpperCnt ++; | |||
} | |||
System.out.print("DBpediaLookup find: " + slist + ", "); | |||
int count = 40; | |||
for (String s : slist) | |||
{ | |||
			//treat as an abbreviation only when all words are capitalized; drop candidates whose edit distance is too large
if(UpperCnt < sa.length && EntityFragment.calEditDistance(s, searchString.replace("_", ""))>searchString.length()/2) | |||
continue; | |||
int eid = -1; | |||
s = s.replace(" ", "_"); | |||
if(EntityFragmentFields.entityName2Id.containsKey(s)) | |||
{ | |||
eid = EntityFragmentFields.entityName2Id.get(s); | |||
emlist.add(new EntityMapping(eid, s, count)); | |||
				count -= 2;
} | |||
else | |||
{ | |||
System.out.print("Drop "+s+" because it not in Entity Dictionary. "); | |||
} | |||
} | |||
System.out.println("DBpediaLookup select: " + emlist); | |||
return emlist; | |||
} | |||
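	// Illustrative sketch of the filters above (hypothetical data): for
	// searchString "Niagara_Falls", sa = {"Niagara","Falls"} and UpperCnt == 2,
	// so the edit-distance filter is skipped (all-capitalized strings are kept
	// as possible abbreviations); a candidate absent from
	// EntityFragmentFields.entityName2Id is dropped with a log message.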
public ArrayList<String> lookForEntityNames (String searchString, QueryLogger qlog) { | |||
		// URL encoding: " " -> %20
GetMethod getMethod = new GetMethod((baseURL+searchString).replaceAll(" ", "%20")); | |||
ArrayList<String> ret = new ArrayList<String>(); | |||
int statusCode; | |||
try { | |||
statusCode = ctripHttpClient.executeMethod(getMethod); | |||
} catch (HttpException e) { | |||
e.printStackTrace(); | |||
return ret; | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
return ret; | |||
} | |||
		if (statusCode != 200) return ret; // return the empty list (not null) so callers can safely call size()
String response = getMethod.getResponseBodyAsString(); | |||
if (qlog != null && qlog.MODE_debug) { | |||
System.out.println("searchString=" + searchString); | |||
System.out.println("statusCode=" + statusCode); | |||
System.out.println("response=" + getMethod.getResponseBodyAsString()); | |||
} | |||
getMethod.releaseConnection(); | |||
//System.out.println(response); | |||
if (response == null || response.isEmpty()) | |||
return ret; | |||
int idx1 = response.indexOf(begin); | |||
while (idx1 != -1) { | |||
int idx2 = response.indexOf(end, idx1+begin_length); | |||
String ss = response.substring(idx1+begin_length, idx2); | |||
ret.add(ss); | |||
//System.out.println(ss); | |||
idx1 = response.indexOf(begin, idx2 + end_length); | |||
} | |||
return ret; | |||
} | |||
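	// Illustrative sketch of the marker scan above (assumed response shape):
	// a body containing ...<Result>\n <Label>Berlin</Label>...<Result>\n
	// <Label>Berlin_Wall</Label>... yields ["Berlin", "Berlin_Wall"]: every
	// substring between the `begin` and `end` markers is collected in order.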
public static void main(String argv[]){ | |||
DBpediaLookup dbplook = new DBpediaLookup(); | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
try { | |||
while (true) { | |||
System.out.println("Test DBpediaLookup."); | |||
System.out.print("Please input the search string: "); | |||
String searchString = br.readLine(); | |||
try { | |||
long t1 = System.currentTimeMillis(); | |||
ArrayList<String> res = dbplook.lookForEntityNames(searchString, null); | |||
long t2 = System.currentTimeMillis(); | |||
System.out.println(res); | |||
System.out.println("time=" + (t2-t1) + "ms"); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
} | |||
return; | |||
} | |||
} |
@@ -0,0 +1,44 @@ | |||
package qa.mapping; | |||
import java.util.HashMap; | |||
//import lcn.EntityFragmentFields; | |||
//import qa.Globals; | |||
import fgmt.EntityFragment; | |||
public class EntityFragmentDict { | |||
//public HashMap<String, EntityFragment> entityFragmentDictionary = new HashMap<String, EntityFragment>(); | |||
public HashMap<Integer, EntityFragment> entityFragmentDictionary = new HashMap<Integer, EntityFragment>(); | |||
public EntityFragment getEntityFragmentByEid (Integer eid) | |||
{ | |||
if (!entityFragmentDictionary.containsKey(eid)) | |||
{ | |||
entityFragmentDictionary.put(eid, EntityFragment.getEntityFragmentByEntityId(eid)); | |||
} | |||
return entityFragmentDictionary.get(eid); | |||
} | |||
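	// Illustrative usage sketch (added note): repeated lookups hit the
	// in-memory cache, e.g.
	//   EntityFragment ef = efd.getEntityFragmentByEid(42); // loads once
	//   efd.getEntityFragmentByEid(42);                     // returns the cached copy
	// Note the loader's result is cached even when it is null.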
/* | |||
* Old version, search by name | |||
* */ | |||
// public EntityFragment getEntityFragmentByName (String name) { | |||
// if (name.startsWith("?")) { | |||
// return null; | |||
// } | |||
// if (!entityFragmentDictionary.containsKey(name)) { | |||
// String fgmt = EntityFragment.getEntityFgmtStringByName(name); | |||
// if (fgmt != null) | |||
// { | |||
// int eid = EntityFragmentFields.entityName2Id.get(name); | |||
// entityFragmentDictionary.put(name, new EntityFragment(eid, fgmt)); | |||
// } | |||
// else { | |||
// entityFragmentDictionary.put(name, null); | |||
// } | |||
// } | |||
// return entityFragmentDictionary.get(name); | |||
// | |||
// } | |||
} |
@@ -0,0 +1,811 @@ | |||
package qa.mapping; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import java.util.Iterator; | |||
import java.util.Map; | |||
import nlp.ds.Word; | |||
import nlp.ds.Sentence.SentenceType; | |||
import fgmt.EntityFragment; | |||
import fgmt.RelationFragment; | |||
import fgmt.TypeFragment; | |||
import log.QueryLogger; | |||
import qa.Globals; | |||
import rdf.EntityMapping; | |||
import rdf.PredicateMapping; | |||
import rdf.SemanticRelation; | |||
import rdf.Sparql; | |||
import rdf.Triple; | |||
import rdf.TypeMapping; | |||
public class SemanticItemMapping { | |||
public HashMap<Word, ArrayList<EntityMapping>> entityDictionary = new HashMap<Word, ArrayList<EntityMapping>>(); | |||
public static int k = 10; // useless now | |||
public static int t = 10; // Depth of enumerating candidates of each node/edge. O(t^n). | |||
ArrayList<Sparql> rankedSparqls = new ArrayList<Sparql>(); | |||
HashSet<String> checkedSparqlStrs = new HashSet<String>(); | |||
public ArrayList<ArrayList<EntityMapping>> entityPhrasesList = new ArrayList<ArrayList<EntityMapping>>(); | |||
public ArrayList<Word> entityWordList = new ArrayList<Word>(); | |||
public HashMap<Integer, EntityMapping> currentEntityMappings = new HashMap<Integer, EntityMapping>(); | |||
public ArrayList<ArrayList<PredicateMapping>> predicatePhraseList = new ArrayList<ArrayList<PredicateMapping>>(); | |||
public ArrayList<SemanticRelation> predicateSrList = new ArrayList<SemanticRelation>(); | |||
public HashMap<Integer, PredicateMapping> currentPredicateMappings = new HashMap<Integer, PredicateMapping>(); | |||
public HashMap<Integer, SemanticRelation> semanticRelations = null; | |||
public QueryLogger qlog = null; | |||
public EntityFragmentDict efd = new EntityFragmentDict(); | |||
public boolean isAnswerFound = false; | |||
public int tripleCheckCallCnt = 0; | |||
public int sparqlCheckCallCnt = 0; | |||
public int sparqlCheckId = 0; | |||
SemanticRelation firstFalseSr = null; | |||
long tripleCheckTime = 0; | |||
long sparqlCheckTime = 0; | |||
/* | |||
	 * A best-first, top-down method: enumerate all possible query graphs and sort them.
	 * Notice, we use fragment checking to simulate graph matching and generate the TOP-k SPARQL queries, which can then be executed via GStore or Virtuoso.
* */ | |||
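	// Illustrative cost sketch (added note): with 2 entity words and 2
	// relations, the nested DFS below enumerates up to t^2 entity assignments
	// times t^2 predicate assignments (t = 10 by default), i.e. O(t^n)
	// combinations before pruning.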
public void process(QueryLogger qlog, HashMap<Integer, SemanticRelation> semRltn) | |||
{ | |||
semanticRelations = semRltn; | |||
this.qlog = qlog; | |||
long t1; | |||
t = 10; // Notice, t is adjustable. | |||
entityPhrasesList.clear(); | |||
entityWordList.clear(); | |||
currentEntityMappings.clear(); | |||
predicatePhraseList.clear(); | |||
predicateSrList.clear(); | |||
currentPredicateMappings.clear(); | |||
// 1. collect info of constant nodes(entities) | |||
Iterator<Map.Entry<Integer, SemanticRelation>> it = semanticRelations.entrySet().iterator(); | |||
while(it.hasNext()) | |||
{ | |||
Map.Entry<Integer, SemanticRelation> entry = it.next(); | |||
SemanticRelation sr = entry.getValue(); | |||
//We now only tackle Constant of Entity & Type. TODO: consider Literal. | |||
if(sr.isArg1Constant && !sr.arg1Word.mayType && !sr.arg1Word.mayEnt || sr.isArg2Constant && !sr.arg2Word.mayType && !sr.arg2Word.mayEnt) | |||
{ | |||
it.remove(); | |||
continue; | |||
} | |||
//Type constant will be solved in ScoreAndRanking function. | |||
if(sr.isArg1Constant && sr.arg1Word.mayEnt) | |||
{ | |||
if(!entityDictionary.containsKey(sr.arg1Word)) | |||
entityDictionary.put(sr.arg1Word, sr.arg1Word.emList); | |||
entityPhrasesList.add(sr.arg1Word.emList); | |||
entityWordList.add(sr.arg1Word); | |||
} | |||
if(sr.isArg2Constant && !sr.arg2Word.mayType) | |||
{ | |||
if (!entityDictionary.containsKey(sr.arg2Word)) | |||
entityDictionary.put(sr.arg2Word, sr.arg2Word.emList); | |||
entityPhrasesList.add(sr.arg2Word.emList); | |||
entityWordList.add(sr.arg2Word); | |||
} | |||
} | |||
// 2. collect info of edges(relations). | |||
for (Integer key : semanticRelations.keySet()) | |||
{ | |||
SemanticRelation sr = semanticRelations.get(key); | |||
predicatePhraseList.add(sr.predicateMappings); | |||
predicateSrList.add(sr); | |||
			// Reduce t when structure enumeration is needed.
if(Globals.evaluationMethod > 1 && !sr.isSteadyEdge) | |||
t = 5; | |||
} | |||
// 3. top-k join | |||
t1 = System.currentTimeMillis(); | |||
if(semanticRelations.size()>0) | |||
topkJoin(semanticRelations); | |||
else | |||
System.out.println("No Valid SemanticRelations."); | |||
qlog.timeTable.put("TopkJoin", (int)(System.currentTimeMillis()-t1)); | |||
qlog.timeTable.put("TripleCheck", (int)tripleCheckTime); | |||
qlog.timeTable.put("SparqlCheck", (int)sparqlCheckTime); | |||
Collections.sort(rankedSparqls); | |||
// Notice, use addAll because we may have more than one node recognition decision. | |||
qlog.rankedSparqls.addAll(rankedSparqls); | |||
qlog.entityDictionary = entityDictionary; | |||
System.out.println("Check query graph count: " + tripleCheckCallCnt + "\nPass single check: " + sparqlCheckCallCnt + "\nPass final check: " + rankedSparqls.size()); | |||
System.out.println("TopkJoin time=" + qlog.timeTable.get("TopkJoin")); | |||
} | |||
public void topkJoin (HashMap<Integer, SemanticRelation> semanticRelations) | |||
{ | |||
dfs_entityName(0); | |||
} | |||
	// Each recursion level fixes the mapping of one entity
public void dfs_entityName (int level_i) | |||
{ | |||
// All entities ready. | |||
if (level_i == entityPhrasesList.size()) | |||
{ | |||
dfs_predicate(0); | |||
return; | |||
} | |||
ArrayList<EntityMapping> list = entityPhrasesList.get(level_i); | |||
Word w = entityWordList.get(level_i); | |||
int tcount = 0; | |||
for(EntityMapping em : list) | |||
{ | |||
if (tcount == t || isAnswerFound) break; | |||
currentEntityMappings.put(w.hashCode(), em); | |||
dfs_entityName(level_i+1); | |||
currentEntityMappings.remove(w.hashCode()); | |||
tcount ++; | |||
} | |||
} | |||
public void dfs_predicate(int level_i) | |||
{ | |||
// All entities & predicates ready, start generate SPARQL. | |||
if (level_i == predicatePhraseList.size()) | |||
{ | |||
scoringAndRanking(); | |||
return; | |||
} | |||
ArrayList<PredicateMapping> list = predicatePhraseList.get(level_i); | |||
SemanticRelation sr = predicateSrList.get(level_i); | |||
if (sr.dependOnSemanticRelation != null) | |||
{ | |||
dfs_predicate(level_i+1); | |||
} | |||
else | |||
{ | |||
int tcount=0; | |||
for (PredicateMapping pm : list) | |||
{ | |||
if (tcount==t || isAnswerFound) break; | |||
currentPredicateMappings.put(sr.hashCode(), pm); | |||
dfs_predicate(level_i+1); | |||
currentPredicateMappings.remove(sr.hashCode()); | |||
tcount++; | |||
				// Pruning (if we do not change the predicate of firstFalseSr, it will still fail, so just return)
if(firstFalseSr != null) | |||
{ | |||
if(firstFalseSr != sr) return; | |||
else firstFalseSr = null; | |||
} | |||
} | |||
// "null" means we drop this edge, this is how we enumerate structure. | |||
if(Globals.evaluationMethod == 2 && sr.isSteadyEdge == false) | |||
{ | |||
currentPredicateMappings.put(sr.hashCode(), null); | |||
dfs_predicate(level_i+1); | |||
currentPredicateMappings.remove(sr.hashCode()); | |||
tcount++; | |||
} | |||
} | |||
} | |||
/* | |||
	 * Run this function when all nodes/edges have been assigned values (through currentEntityMappings and currentPredicateMappings).
	 * Generate a SPARQL query from the current ENTs and RELATIONs, then run fragment checking.
	 * Notice: add embedded type information:
	 * eg, ?who <height> ?how --add--> ?who <type1> <Person> | ?book <author> <Tom> --add--> ?book <type1> <Book>
	 * Notice: add constant type information:
	 * eg, ask: <YaoMing> <type1> <BasketballPlayer>
	 * Notice: add embedded triple information:
	 * eg, ?Canadians <residence> <United_States> --add--> ?Canadians <birthPlace> <Canada>
* */ | |||
public void scoringAndRanking() | |||
{ | |||
firstFalseSr = null; | |||
Sparql sparql = new Sparql(semanticRelations); | |||
		// A simple way to judge connectivity (may be incorrect when the number of nodes >= 6)
//TODO: a standard method to judge CONNECTIVITY | |||
HashMap<Integer, Integer> count = new HashMap<Integer, Integer>(); | |||
int edgeCnt = 0; | |||
for (Integer key : semanticRelations.keySet()) | |||
{ | |||
SemanticRelation sr = semanticRelations.get(key); | |||
if(currentPredicateMappings.get(sr.hashCode()) == null) | |||
continue; | |||
edgeCnt++; | |||
int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode(); | |||
if(!count.containsKey(v1)) | |||
count.put(v1, 1); | |||
else | |||
count.put(v1, count.get(v1)+1); | |||
if(!count.containsKey(v2)) | |||
count.put(v2, 1); | |||
else | |||
count.put(v2, count.get(v2)+1); | |||
} | |||
if(count.size() < qlog.semanticUnitList.size()) | |||
return; | |||
if(edgeCnt == 0) | |||
return; | |||
if(edgeCnt > 1) | |||
{ | |||
for (Integer key : semanticRelations.keySet()) | |||
{ | |||
SemanticRelation sr = semanticRelations.get(key); | |||
if(currentPredicateMappings.get(sr.hashCode()) == null) | |||
continue; | |||
int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode(); | |||
if(count.get(v1) == 1 && count.get(v2) == 1) | |||
return; | |||
} | |||
} | |||
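		// Illustrative sketch of the degree heuristic above (added note): for
		// edges A-B and C-D (two disconnected pairs), every edge has both
		// endpoints with degree 1, so the query graph is rejected; for a path
		// A-B, B-C, node B has degree 2 and the check passes.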
// Now the graph is connected, start to generate SPARQL. | |||
HashSet<String> typeSetFlag = new HashSet<String>(); | |||
for (Integer key : semanticRelations.keySet()) | |||
{ | |||
SemanticRelation sr = semanticRelations.get(key); | |||
String sub, obj; | |||
int subjId = -1, objId = -1; | |||
int pid; | |||
double score = 1; | |||
boolean isSubjObjOrderSameWithSemRltn = true; | |||
// argument1 | |||
if(sr.isArg1Constant && (sr.arg1Word.mayEnt || sr.arg1Word.mayType) ) // Constant | |||
{ | |||
// For subject, entity has higher priority. | |||
if(sr.arg1Word.mayEnt) | |||
{ | |||
EntityMapping em = currentEntityMappings.get(sr.arg1Word.hashCode()); | |||
subjId = em.entityID; | |||
sub = em.entityName; | |||
score *= em.score; | |||
} | |||
else | |||
{ | |||
TypeMapping tm = sr.arg1Word.tmList.get(0); | |||
subjId = Triple.TYPE_ROLE_ID; | |||
sub = tm.typeName; | |||
					score *= (tm.score*100); // Rescaling: type scores lie in [0,1], entity scores in [0,100].
} | |||
} | |||
else // Variable | |||
{ | |||
subjId = Triple.VAR_ROLE_ID; | |||
sub = "?" + sr.arg1Word.originalForm; | |||
} | |||
// Embedded Type info of argument1(variable type) | eg, ?book <type> <Book> | |||
			// Notice, mayType & mayExtendVariable are mutually exclusive. (see constantVariableRecognition)
// Notice, we do NOT consider types of [?who,?where...] now. | |||
Triple subt = null; | |||
if (!sr.isArg1Constant && sr.arg1Word.mayType && sr.arg1Word.tmList != null && sr.arg1Word.tmList.size() > 0 && !typeSetFlag.contains(sub)) | |||
{ | |||
StringBuilder type = new StringBuilder(""); | |||
for (TypeMapping tm: sr.arg1Word.tmList) | |||
{ | |||
Integer tt = tm.typeID; | |||
if(tt != -1) | |||
type.append(TypeFragment.typeId2ShortName.get(tt)); | |||
else | |||
type.append(tm.typeName); | |||
type.append('|'); | |||
} | |||
String ttt = type.substring(0, type.length()-1); | |||
subt = new Triple(subjId, sub, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10); | |||
subt.typeSubjectWord = sr.arg1Word; | |||
if(sr.arg1Word.tmList.get(0).prefferdRelation == -1) | |||
subt = null; | |||
} | |||
// predicate | |||
SemanticRelation dep = sr.dependOnSemanticRelation; | |||
PredicateMapping pm = null; | |||
if (dep == null) | |||
pm = currentPredicateMappings.get(sr.hashCode()); | |||
else | |||
pm = currentPredicateMappings.get(dep.hashCode()); | |||
if(pm == null) | |||
continue; | |||
pid = pm.pid; | |||
score *= pm.score; | |||
// argument2 | |||
if(sr.isArg2Constant && (sr.arg2Word.mayEnt || sr.arg2Word.mayType) ) | |||
{ | |||
if(!sr.arg2Word.mayType) | |||
{ | |||
EntityMapping em = currentEntityMappings.get(sr.arg2Word.hashCode()); | |||
objId = em.entityID; | |||
obj = em.entityName; | |||
score *= em.score; | |||
} | |||
else | |||
{ | |||
TypeMapping tm = sr.arg2Word.tmList.get(0); | |||
objId = Triple.TYPE_ROLE_ID; | |||
obj = tm.typeName; | |||
score *= (tm.score*100); | |||
} | |||
} | |||
else | |||
{ | |||
objId = Triple.VAR_ROLE_ID; | |||
obj = "?" + sr.arg2Word.getFullEntityName(); | |||
} | |||
// Type info of argument2 | |||
Triple objt = null; | |||
if (sr.arg2Word.tmList != null && sr.arg2Word.tmList.size() > 0 && !typeSetFlag.contains(obj) && !sr.isArg2Constant) | |||
{ | |||
StringBuilder type = new StringBuilder(""); | |||
for (TypeMapping tm : sr.arg2Word.tmList) | |||
{ | |||
Integer tt = tm.typeID; | |||
if(tt != -1) | |||
type.append(TypeFragment.typeId2ShortName.get(tt)); | |||
else | |||
type.append(tm.typeName); | |||
type.append('|'); | |||
} | |||
String ttt = type.substring(0, type.length()-1); | |||
objt = new Triple(objId, obj, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10); | |||
objt.typeSubjectWord = sr.arg2Word; | |||
if(sr.arg2Word.tmList.get(0).prefferdRelation == -1) | |||
objt = null; | |||
} | |||
// Prune. | |||
if(objId == Triple.TYPE_ROLE_ID && pid != Globals.pd.typePredicateID) | |||
return; | |||
			// Adjust subj/obj order for LITERAL relations | requires that at least one argument has TYPE info
if (RelationFragment.isLiteral(pid) && (subt != null || objt != null)) | |||
{ | |||
if (sub.startsWith("?") && obj.startsWith("?")) // two variables | |||
{ | |||
					// either variable could be the literal object
if (subt != null) { | |||
subt.object += ("|" + "literal_HRZ"); | |||
} | |||
if (objt != null) { | |||
objt.object += ("|" + "literal_HRZ"); | |||
} | |||
if (subt==null && objt!=null) | |||
{ | |||
						// if the object has a type but the subject does not, swapping sub/obj is more likely correct, because literals generally have no type [though they may have a yago:type]
String temp = sub; | |||
int tmpId = subjId; | |||
sub = obj; | |||
subjId = objId; | |||
obj = temp; | |||
objId = tmpId; | |||
isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn; | |||
} | |||
} | |||
else if (sub.startsWith("?") && !obj.startsWith("?")) { | |||
					// need to change subj/obj order
if (subt != null) { | |||
subt.object += ("|" + "literal_HRZ"); | |||
} | |||
String temp = sub; | |||
int tmpId = subjId; | |||
sub = obj; | |||
subjId = objId; | |||
obj = temp; | |||
objId = tmpId; | |||
isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn; | |||
//System.out.println("here: "+sub+obj); | |||
} | |||
else if (obj.startsWith("?") && !sub.startsWith("?")) { | |||
if (objt != null) { | |||
objt.object += ("|" + "literal_HRZ"); | |||
} | |||
} | |||
} | |||
Triple t = new Triple(subjId, sub, pid, objId, obj, sr, score,isSubjObjOrderSameWithSemRltn); | |||
//System.out.println("triple: "+t+" "+isTripleCompatibleCanSwap(t)); | |||
sparql.addTriple(t); | |||
			// the score of the subject/object's type should correlate with the score of the triple itself
if (subt != null) | |||
{ | |||
subt.score += t.score*0.2; | |||
sparql.addTriple(subt); | |||
				typeSetFlag.add(subt.subject); // be careful NOT to use sub here; the subj/obj order may have changed
} | |||
if (objt != null) | |||
{ | |||
objt.score += t.score*0.2; | |||
sparql.addTriple(objt); | |||
typeSetFlag.add(objt.subject); | |||
} | |||
// add argument' embedded triple, eg, ?canadian <birthPlace> <Canada> | |||
if(!sr.isArg1Constant && sr.arg1Word.mayExtendVariable && sr.arg1Word.embbededTriple != null) | |||
{ | |||
sparql.addTriple(sr.arg1Word.embbededTriple); | |||
} | |||
if(!sr.isArg2Constant && sr.arg2Word.mayExtendVariable && sr.arg2Word.embbededTriple != null) | |||
{ | |||
sparql.addTriple(sr.arg2Word.embbededTriple); | |||
} | |||
sparql.adjustTriplesOrder(); | |||
} | |||
// deduplicate | |||
sparql.deduplicate(); | |||
if(checkedSparqlStrs.contains(sparql.toStringForGStore2())) | |||
return; | |||
checkedSparqlStrs.add(sparql.toStringForGStore2()); | |||
if (!qlog.MODE_fragment) { | |||
// Method 1: do NOT check compatibility | |||
rankedSparqls.add(sparql); | |||
isAnswerFound = true; | |||
} | |||
else { | |||
// Method 2: check compatibility by FRAGMENT (offline index) | |||
			//1. single-triple check (a quick prune); subject and object may be swapped. Try to adjust to the best order.
tripleCheckCallCnt++; | |||
long t1 = System.currentTimeMillis(); | |||
for (Triple t : sparql.tripleList) | |||
if(t.predicateID!=Globals.pd.typePredicateID && !isTripleCompatibleCanSwap(t)) | |||
{ | |||
firstFalseSr = t.semRltn; | |||
return; | |||
} | |||
tripleCheckTime += (System.currentTimeMillis()-t1); | |||
			//2. SPARQL check (considers the interaction between all triples); subject and object may be swapped.
t1 = System.currentTimeMillis(); | |||
sparqlCheckCallCnt++; | |||
enumerateSubjObjOrders(sparql, new Sparql(sparql.semanticRelations), 0); | |||
sparqlCheckTime += (System.currentTimeMillis()-t1); | |||
} | |||
} | |||
/* | |||
* Notice: | |||
	 * typeId = -1 means there is no data fragment
* */ | |||
public static TypeFragment getTypeFragmentByWord(Word word) | |||
{ | |||
TypeFragment tf = null; | |||
if(word!=null && word.tmList!=null && word.tmList.size()>0) | |||
{ | |||
int typeId = word.tmList.get(0).typeID; | |||
if(typeId != -1) | |||
tf = TypeFragment.typeFragments.get(typeId); | |||
} | |||
return tf; | |||
} | |||
/* | |||
	 * (Just a PRE-check [single triple check] in this function; the final check happens in enumerateSubjObjOrders, which utilizes more of the index)
	 * Notice: triples with predicate = type never enter this function
* */ | |||
public boolean isTripleCompatibleCanSwap (Triple t) { | |||
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) | |||
{ | |||
if (fragmentCompatible2(t.subjId, t.predicateID, t.objId) > | |||
fragmentCompatible2(t.objId, t.predicateID, t.subjId)) | |||
t.swapSubjObjOrder(); | |||
if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) | |||
return true; | |||
return false; | |||
} | |||
else | |||
{ | |||
//var & var | |||
if(t.subject.startsWith("?") && t.object.startsWith("?")) | |||
{ | |||
Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord(); | |||
TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord); | |||
//based on whether the two variables' type fragments contain the predicate among their in/out edges, decide whether the order needs to change
//just a vote
int nowOrderCnt = 0, reverseOrderCnt = 0; | |||
if(subjTf == null || subjTf.outEdges.contains(t.predicateID)) | |||
nowOrderCnt ++; | |||
if(objTf == null || objTf.inEdges.contains(t.predicateID)) | |||
nowOrderCnt ++; | |||
if(subjTf == null || subjTf.inEdges.contains(t.predicateID)) | |||
reverseOrderCnt ++; | |||
if(objTf == null || objTf.outEdges.contains(t.predicateID)) | |||
reverseOrderCnt ++; | |||
if(nowOrderCnt<2 && reverseOrderCnt<2) | |||
return false; | |||
else if(nowOrderCnt > reverseOrderCnt) | |||
{ | |||
// do nothing | |||
} | |||
else if(reverseOrderCnt > nowOrderCnt) | |||
{ | |||
t.swapSubjObjOrder(); | |||
} | |||
else //both the current and the reversed order passed the type-fragment check; need to SELECT one
{ | |||
//rule1: ?inventor <occupation> ?occupation || ... <name> ?name -> the string more similar to the predicate is placed later (as object)
String p = Globals.pd.getPredicateById(t.predicateID); | |||
int ed1 = EntityFragment.calEditDistance(subjWord.baseForm, p); | |||
int ed2 = EntityFragment.calEditDistance(objWord.baseForm, p); | |||
if(ed1 < ed2) | |||
{ | |||
t.swapSubjObjOrder(); | |||
} | |||
} | |||
return true; | |||
} | |||
//ent & ent || var & ent
else | |||
{ | |||
boolean flag = false; | |||
if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) { | |||
flag = true; | |||
} | |||
else if (fragmentCompatible(t.objId, t.predicateID, t.subjId)) { | |||
t.swapSubjObjOrder(); | |||
flag = true; | |||
} | |||
// Var & Ent | ?city <type1> <City> & <Chile_Route_68> <country> ?city : <country> is invalid for City | Notice: the data is often dirty, so pruning may not be fully correct.
if(flag == true && (t.subject.startsWith("?") || t.object.startsWith("?"))) | |||
{ | |||
Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord(); | |||
TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord); | |||
if(subjTf != null) | |||
{ | |||
if(subjTf.outEdges.contains(t.predicateID)) | |||
flag = true; | |||
else if(subjTf.inEdges.contains(t.predicateID)) | |||
{ | |||
t.swapSubjObjOrder(); | |||
flag = true; | |||
} | |||
else | |||
flag = false; | |||
} | |||
else if(objTf != null) | |||
{ | |||
if(objTf.inEdges.contains(t.predicateID)) | |||
flag = true; | |||
else if(objTf.outEdges.contains(t.predicateID)) | |||
{ | |||
t.swapSubjObjOrder(); | |||
flag = true; | |||
} | |||
else | |||
flag = false; | |||
} | |||
} | |||
return flag; | |||
} | |||
} | |||
} | |||
public boolean isTripleCompatibleNotSwap (Triple t) { | |||
if (t.predicateID == Globals.pd.typePredicateID) { | |||
return true; | |||
} | |||
else if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) { | |||
return true; | |||
} | |||
else { | |||
return false; | |||
} | |||
} | |||
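/*
 * fragmentCompatible treats a constant argument as compatible when its entity
 * fragment lists the predicate on the matching side (outEdges for the subject,
 * inEdges for the object). A hypothetical example: <Sigmund_Freud> <spouse> ?x
 * is accepted iff Freud's fragment contains <spouse> among its outEdges.
 * For SELECT questions every constant must match; ASK questions tolerate one mismatch.
 * */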
public boolean fragmentCompatible (int id1, int pid, int id2) { | |||
EntityFragment ef1 = efd.getEntityFragmentByEid(id1); | |||
EntityFragment ef2 = efd.getEntityFragmentByEid(id2); | |||
// a valid entity MUST have a fragment
if (id1!=Triple.TYPE_ROLE_ID && id1!=Triple.VAR_ROLE_ID && ef1 == null) return false; | |||
if (id2!=Triple.TYPE_ROLE_ID && id2!=Triple.VAR_ROLE_ID && ef2 == null) return false; | |||
boolean ef1_constant = (ef1 != null);
boolean ef2_constant = (ef2 != null);
int entityCnt=0,compatibleCnt=0; | |||
if(ef1_constant) { | |||
entityCnt++; | |||
if (ef1.outEdges.contains(pid)) | |||
compatibleCnt++; | |||
// else // <e1,p> is a false pair
// {
// falseEntPres.add(new Pair(id1,pid));
// }
} | |||
if (ef2_constant) { | |||
entityCnt++; | |||
if (ef2.inEdges.contains(pid)) | |||
compatibleCnt++; | |||
// else // <p,e2> is a false pair
// {
// falsePreEnts.add(new Pair(pid,id2));
// }
} | |||
// for SELECT sparql, require an EXACT match between the predicate and both subject and object; ASK sparql can be relaxed
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) | |||
return entityCnt-compatibleCnt<=1; | |||
else | |||
return entityCnt==compatibleCnt; | |||
} | |||
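/*
 * fragmentCompatible2 returns the NUMBER of incompatible constant ends (0..2)
 * instead of a boolean, so callers can compare the current order against the
 * swapped one. Illustrative values: if fragmentCompatible2(subjId, pid, objId) = 1
 * and fragmentCompatible2(objId, pid, subjId) = 0, the swapped order is preferred.
 * */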
public int fragmentCompatible2 (int id1, int pid, int id2) { | |||
EntityFragment ef1 = efd.getEntityFragmentByEid(id1); | |||
EntityFragment ef2 = efd.getEntityFragmentByEid(id2); | |||
int entityCnt=0,compatibleCnt=0; | |||
if(id1 != Triple.VAR_ROLE_ID && id1 != Triple.TYPE_ROLE_ID) { | |||
entityCnt++; | |||
if (ef1!=null && ef1.outEdges.contains(pid)) | |||
compatibleCnt++; | |||
} | |||
if (id2 != Triple.VAR_ROLE_ID && id2 != Triple.TYPE_ROLE_ID) { | |||
entityCnt++; | |||
if (ef2!=null && ef2.inEdges.contains(pid)) | |||
compatibleCnt++; | |||
} | |||
return entityCnt-compatibleCnt; | |||
} | |||
public boolean checkConstantConsistency (Sparql spql) { | |||
HashMap<String, String> constants = new HashMap<String, String>(); | |||
for (Triple t : spql.tripleList) { | |||
if (!t.subject.startsWith("?")) { | |||
String e = t.getSubjectWord().getFullEntityName(); | |||
if (!constants.containsKey(e)) | |||
constants.put(e, t.subject); | |||
else { | |||
if (!constants.get(e).equals(t.subject)) | |||
return false; | |||
} | |||
} | |||
if (!t.object.startsWith("?")) { | |||
String e = t.getObjectWord().getFullEntityName(); | |||
if (!constants.containsKey(e)) | |||
constants.put(e, t.object); | |||
else { | |||
if (!constants.get(e).equals(t.object)) | |||
return false; | |||
} | |||
} | |||
} | |||
return true; | |||
} | |||
public void reviseScoreByTripleOrders(Sparql spq) | |||
{ | |||
Triple shouldDel = null; | |||
for(Triple triple: spq.tripleList) | |||
{ | |||
// e.g., ?who <president> <United_States_Navy> needs to be punished (or dropped).
if(triple.subject.toLowerCase().equals("?who")) | |||
{ | |||
String rel = Globals.pd.id_2_predicate.get(triple.predicateID); | |||
if(rel.equals("president") || rel.equals("starring") || rel.equals("producer")) | |||
{ | |||
spq.score -= triple.score; | |||
triple.score /= 10; | |||
spq.score += triple.score; | |||
if(triple.semRltn!=null && triple.semRltn.isSteadyEdge == false) | |||
shouldDel = triple; | |||
} | |||
} | |||
} | |||
if(shouldDel != null) | |||
spq.delTriple(shouldDel); | |||
} | |||
// enumerate subject/object orders with fragment checks
// also modifies scores for ASK queries ("ask one triple")
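/*
 * A sketch of the enumeration below: this is a depth-first search over per-triple
 * choices. At each level the triple is first tried in its default order; a type
 * triple is additionally tried as dropped, and any other triple is also tried in
 * swapped order (discounted by 0.8) when the swapped form passes the single-triple
 * check. For n triples this explores up to 2^n candidate SPARQLs, which stays
 * cheap because n is small for natural-language questions.
 * */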
public boolean enumerateSubjObjOrders (Sparql originalSpq, Sparql currentSpq, int level) | |||
{ | |||
if (level == originalSpq.tripleList.size()) | |||
{ | |||
if(currentSpq.tripleList.size() == 0) | |||
return false; | |||
CompatibilityChecker cc = new CompatibilityChecker(efd); | |||
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) //ASK sparql: fragment check is not required; passing it only rewards the score
{ | |||
if(cc.isSparqlCompatible3(currentSpq)) //reward score for "TRUE" | |||
{ | |||
for(Triple triple: currentSpq.tripleList) | |||
triple.addScore(triple.getScore()); | |||
} | |||
rankedSparqls.add(currentSpq.copy()); | |||
return true; | |||
} | |||
try | |||
{ | |||
sparqlCheckId++; | |||
if (cc.isSparqlCompatible3(currentSpq)) | |||
{ | |||
//e.g., ?who <president> <United_States_Navy>
//when the query graph contains a cycle, we just prune this edge
Sparql sparql = currentSpq.copy(); | |||
reviseScoreByTripleOrders(sparql); | |||
if(!rankedSparqls.contains(sparql)) | |||
rankedSparqls.add(sparql); | |||
return true; | |||
} | |||
} | |||
catch (Exception e) { | |||
System.out.println("[CompatibilityChecker ERROR]"+currentSpq); | |||
e.printStackTrace(); | |||
} | |||
return false; | |||
} | |||
Triple cur_t = originalSpq.tripleList.get(level); | |||
// first try default order | |||
currentSpq.addTriple(cur_t); | |||
boolean flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); | |||
currentSpq.removeLastTriple(); | |||
// !deprecated: do not change the triple order for a [literal relation]
// if (RelationFragment.isLiteral(cur_t.predicateID)) return false;
// Enumerate keeping/dropping the type info
if (cur_t.predicateID == Globals.pd.typePredicateID) | |||
{ | |||
flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); | |||
return flag; | |||
} | |||
else | |||
{ | |||
// single triple check after swap | |||
Triple swapped_t = cur_t.copySwap(); | |||
swapped_t.score = swapped_t.score*0.8; | |||
if (isTripleCompatibleNotSwap(swapped_t)) | |||
{ | |||
currentSpq.addTriple(swapped_t); | |||
flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1); | |||
currentSpq.removeLastTriple(); | |||
} | |||
return flag; | |||
} | |||
} | |||
} |
@@ -0,0 +1,208 @@ | |||
package qa.parsing; | |||
import org.maltparser.core.exception.MaltChainedException; | |||
import log.QueryLogger; | |||
import nlp.ds.DependencyTree; | |||
import nlp.ds.DependencyTreeNode; | |||
import nlp.ds.Word; | |||
import nlp.ds.Sentence.SentenceType; | |||
import qa.Globals; | |||
import rdf.Sparql; | |||
import rdf.Triple; | |||
public class QuestionParsing { | |||
public void process(QueryLogger qlog) { | |||
getDependenciesAndNER(qlog); | |||
recognizeSentenceType(qlog); | |||
} | |||
public void getDependenciesAndNER (QueryLogger qlog) { | |||
long t1 = System.currentTimeMillis(); | |||
try { | |||
qlog.s.dependencyTreeStanford = new DependencyTree(qlog.s, Globals.stanfordParser); | |||
}catch(Exception e){ | |||
e.printStackTrace(); | |||
} | |||
long t2 = System.currentTimeMillis(); | |||
try{ | |||
qlog.s.dependencyTreeMalt = new DependencyTree(qlog.s, Globals.maltParser); | |||
}catch(Exception e){ | |||
//if errors occur, abandon malt tree | |||
qlog.s.dependencyTreeMalt = qlog.s.dependencyTreeStanford; | |||
System.err.println("MALT parser error! Use stanford parser instead."); | |||
} | |||
try { | |||
long t3 = System.currentTimeMillis(); | |||
Globals.nerRecognizer.recognize(qlog.s); | |||
long t4 = System.currentTimeMillis(); | |||
System.out.println("====StanfordDependencies("+(t2-t1)+"ms)===="); | |||
System.out.println(qlog.s.dependencyTreeStanford); | |||
System.out.println("====MaltDependencies("+(t3-t2)+"ms)===="); | |||
System.out.println(qlog.s.dependencyTreeMalt); | |||
System.out.println("====NameEntityRecognition("+(t4-t3)+"ms)===="); | |||
qlog.s.printNERResult(); | |||
qlog.timeTable.put("StanfordParser", (int)(t2-t1)); | |||
qlog.timeTable.put("MaltParser", (int)(t3-t2)); | |||
qlog.timeTable.put("NER", (int)(t4-t3)); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
public void recognizeSentenceType(QueryLogger qlog) | |||
{ | |||
boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford)|| | |||
recognizeImperativeSentence(qlog.s.dependencyTreeMalt); | |||
if (IsImperativeSentence) | |||
{ | |||
qlog.s.sentenceType = SentenceType.ImperativeSentence; | |||
//the ignored words of the two dependency trees should agree
for (DependencyTreeNode sNode : qlog.s.dependencyTreeStanford.nodesList) | |||
for (DependencyTreeNode mNode : qlog.s.dependencyTreeMalt.nodesList) | |||
if (sNode.equals(mNode) && (sNode.word.isIgnored||mNode.word.isIgnored)) | |||
sNode.word.isIgnored = mNode.word.isIgnored = true; | |||
return; | |||
} | |||
boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford)|| | |||
recognizeSpecialQuestion(qlog.s.dependencyTreeMalt); | |||
if (IsSpecialQuestion) | |||
{ | |||
qlog.s.sentenceType = SentenceType.SpecialQuestion; | |||
return; | |||
} | |||
boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford)|| | |||
recognizeGeneralQuestion(qlog.s.dependencyTreeMalt); | |||
if (IsGeneralQuestion) | |||
{ | |||
qlog.s.sentenceType = SentenceType.GeneralQuestion; | |||
return; | |||
} | |||
//default is special | |||
qlog.s.sentenceType = SentenceType.SpecialQuestion; | |||
} | |||
//if imperative, omit the polite words
private boolean recognizeImperativeSentence(DependencyTree tree) { | |||
if(tree.getRoot().word.posTag.startsWith("V") || tree.getRoot().word.posTag.startsWith("NN")) { | |||
DependencyTreeNode dobj = null; | |||
DependencyTreeNode iobj = null; | |||
for (DependencyTreeNode n : tree.getRoot().childrenList) { | |||
if (n.dep_father2child.equals("dobj")) { | |||
dobj = n; | |||
} | |||
else if (n.dep_father2child.equals("iobj")) { | |||
iobj = n; | |||
} | |||
} | |||
if (dobj != null && iobj != null) { | |||
tree.getRoot().word.isIgnored = true; | |||
iobj.word.isIgnored = true; | |||
// give me a list of .. | |||
if (dobj.word.baseForm.equals("list")) | |||
{ | |||
dobj.word.isIgnored = true; | |||
} | |||
return true; | |||
} | |||
//start with "List": List all games by GMT. | |||
if (dobj != null && tree.getRoot().word.baseForm.equals("list")) | |||
{ | |||
//System.out.println("isListSentence!"); | |||
tree.getRoot().word.isIgnored = true; | |||
return true; | |||
} | |||
} | |||
return false; | |||
} | |||
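/*
 * Special questions start with a wh-word ("Which city ..."), possibly preceded by
 * a fronted preposition ("In which city ..."); the check below skips one leading
 * IN/TO preposition node and then tests for a W* POS tag (examples illustrative).
 * */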
private boolean recognizeSpecialQuestion(DependencyTree tree) | |||
{ | |||
DependencyTreeNode firstNode = null; | |||
for (DependencyTreeNode dtn : tree.nodesList) | |||
if (dtn.word.position == 1) | |||
{ | |||
firstNode = dtn; | |||
break; | |||
} | |||
//eg. In which city... | |||
if (firstNode!=null && | |||
(firstNode.word.posTag.equals("IN")||firstNode.word.posTag.equals("TO"))&& | |||
firstNode.dep_father2child.startsWith("prep")) | |||
{ | |||
firstNode = null; | |||
for (DependencyTreeNode dtn : tree.nodesList) | |||
if (dtn.word.position == 2) | |||
{ | |||
firstNode = dtn; | |||
break; | |||
} | |||
} | |||
if (firstNode != null) | |||
{ | |||
if (firstNode.word.posTag.startsWith("W")) | |||
return true; | |||
} | |||
return false; | |||
} | |||
private boolean recognizeGeneralQuestion(DependencyTree tree) | |||
{ | |||
DependencyTreeNode firstNode = null; | |||
for (DependencyTreeNode dtn : tree.nodesList) | |||
if (dtn.word.position == 1) | |||
{ | |||
firstNode = dtn; | |||
break; | |||
} | |||
if (firstNode != null) | |||
{ | |||
String dep = firstNode.dep_father2child; | |||
String pos = firstNode.word.posTag; | |||
String baseform = firstNode.word.baseForm; | |||
if ((baseform.equals("be")||baseform.equals("do")) && | |||
pos.startsWith("VB") && | |||
(dep.equals("root")||dep.equals("cop")||dep.startsWith("aux"))) | |||
return true; | |||
} | |||
return false; | |||
} | |||
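/*
 * detectQuestionFocus picks the earliest-position variable as the answer variable.
 * A hypothetical example: for "Who directed Inception?" with the triple
 * ?who <director> <Inception>, the focus is "?who" since it occurs first.
 * */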
public static String detectQuestionFocus(Sparql spq) { | |||
String ret = null; | |||
int posi = Integer.MAX_VALUE; | |||
for (Triple t : spq.tripleList) { | |||
if (!t.isSubjConstant()) { | |||
Word subj = t.getSubjectWord(); | |||
if (subj!=null && subj.position < posi) { | |||
posi = subj.position; | |||
ret = t.subject; | |||
} | |||
} | |||
if (!t.isObjConstant()) { | |||
Word obj = t.getObjectWord(); | |||
if (obj!=null && obj.position < posi) { | |||
posi = obj.position; | |||
ret = t.object; | |||
} | |||
} | |||
} | |||
if (ret != null) return ret.replace(' ', '_'); | |||
else return null; | |||
} | |||
} |
@@ -0,0 +1,40 @@ | |||
package rdf; | |||
import fgmt.EntityFragment; | |||
public class EntityMapping implements Comparable<EntityMapping> { | |||
public int entityID = -1; | |||
public String entityName = null; | |||
public double score = 0; | |||
public EntityFragment entityFragment = null; | |||
public EntityMapping(int eid, String en, double sco) { | |||
entityID = eid; | |||
entityName = en; | |||
score = sco; | |||
//penalty if the entity name starts with "?"
if (entityName.startsWith("?")) | |||
score *=0.5; | |||
} | |||
// In descending order: big --> small | |||
public int compareTo(EntityMapping o) { | |||
double diff = this.score - o.score; | |||
if (diff > 0) return -1; | |||
else if (diff < 0) return 1; | |||
else return 0; | |||
} | |||
public int hashCode() | |||
{ | |||
return new Integer(entityID).hashCode(); | |||
} | |||
public String toString() | |||
{ | |||
StringBuilder res = new StringBuilder(entityName+"("+score+")"); | |||
return res.toString(); | |||
} | |||
} |
@@ -0,0 +1,77 @@ | |||
package rdf; | |||
import fgmt.TypeFragment; | |||
import qa.Globals; | |||
import lcn.EntityFragmentFields; | |||
public class ImplicitRelation { | |||
public String subj = null; | |||
public String obj = null; | |||
public int pId = -1; | |||
public double score = 0; | |||
//Role: ENTITY | TYPE_CONSTANT | TYPE_VARIABLE | VARIABLE
public enum roleEnum {ENTITY, TYPE_CONSTANT, TYPE_VARIABLE, VARIABLE}; | |||
public int subjRole = -1; | |||
public int objRole = -1; | |||
public int subjId = -1; | |||
public int objId = -1; | |||
public ImplicitRelation(String s, String o, int pid, double sc) | |||
{ | |||
pId = pid; | |||
subj = s; | |||
obj = o; | |||
score = sc; | |||
subjId = EntityFragmentFields.entityName2Id.get(s); | |||
if(pId != Globals.pd.typePredicateID) | |||
objId = EntityFragmentFields.entityName2Id.get(o); | |||
else | |||
objId = TypeFragment.typeShortName2IdList.get(o).get(0); | |||
} | |||
public ImplicitRelation(Integer sId, Integer oId, int pid, double sc) | |||
{ | |||
pId = pid; | |||
subjId = sId; | |||
objId = oId; | |||
score = sc; | |||
} | |||
public void setSubjectId(Integer s) | |||
{ | |||
subjId = s; | |||
} | |||
public void setObjectId(Integer o) | |||
{ | |||
objId = o; | |||
} | |||
public void setSubject(String s) | |||
{ | |||
subj = s; | |||
} | |||
public void setObject(String o) | |||
{ | |||
obj = o; | |||
} | |||
public int hashCode() | |||
{ | |||
return new Integer(pId).hashCode() ^ new Integer(subjId).hashCode() ^ new Integer(objId).hashCode(); | |||
} | |||
@Override | |||
public boolean equals(Object ir) | |||
{ | |||
ImplicitRelation tmpIr = (ImplicitRelation) ir; | |||
if (pId == tmpIr.pId && subjId == tmpIr.subjId && objId == tmpIr.objId) | |||
return true; | |||
else return false; | |||
} | |||
} |
@@ -0,0 +1,41 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
import rdf.EntityMapping; | |||
import rdf.TypeMapping; | |||
public class MergedWord implements Comparable<MergedWord> | |||
{ | |||
//original position | |||
public int st,ed; | |||
//position after merge (unselected is -1) | |||
public int mergedPos = -1; | |||
public String name; | |||
public boolean mayCategory = false; | |||
public boolean mayLiteral = false; | |||
public boolean mayEnt = false; | |||
public boolean mayType = false; | |||
public ArrayList<EntityMapping> emList = null; | |||
public ArrayList<TypeMapping> tmList = null; | |||
public String category = null; | |||
public MergedWord(int s,int e,String n) | |||
{ | |||
st = s; | |||
ed = e; | |||
name = n; | |||
} | |||
@Override | |||
//longer spans first
public int compareTo(MergedWord o) | |||
{ | |||
int lenDiff = (this.ed-this.st) - (o.ed-o.st); | |||
if (lenDiff > 0) return -1; | |||
else if (lenDiff < 0) return 1; | |||
return 0; | |||
} | |||
} |
@@ -0,0 +1,24 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
public class NodeSelectedWithScore implements Comparable<NodeSelectedWithScore> | |||
{ | |||
public ArrayList<Integer> selected; | |||
int size; //split key to st and ed | |||
public double score = 0; | |||
public NodeSelectedWithScore(ArrayList<Integer> a, double b) | |||
{ | |||
selected = a; | |||
score = b; | |||
} | |||
// In descending order: big --> small | |||
public int compareTo(NodeSelectedWithScore o) { | |||
double diff = this.score - o.score; | |||
if (diff > 0) return -1; | |||
else if (diff < 0) return 1; | |||
else return 0; | |||
} | |||
} |
@@ -0,0 +1,28 @@ | |||
package rdf; | |||
public class PredicateMapping implements Comparable<PredicateMapping> { | |||
public int pid = -1; | |||
public double score = 0; | |||
public String parapharase = null; | |||
public PredicateMapping (int pid, double sco, String para) { | |||
this.pid = pid; | |||
score = sco; | |||
parapharase = para; | |||
} | |||
// In descending order: big --> small | |||
public int compareTo(PredicateMapping o) { | |||
double diff = this.score - o.score; | |||
if (diff > 0) return -1; | |||
else if (diff < 0) return 1; | |||
else return 0; | |||
} | |||
@Override | |||
public String toString() { | |||
String ret = ""; | |||
ret = "<"+pid+" : "+parapharase+" : "+score+">"; | |||
return ret; | |||
} | |||
} |
@@ -0,0 +1,180 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import qa.Globals; | |||
import nlp.ds.Word; | |||
public class SemanticQueryGraph implements Comparable<SemanticQueryGraph> | |||
{ | |||
public ArrayList<SemanticUnit> semanticUnitList = null; | |||
public HashMap<Integer, SemanticRelation> semanticRelations = new HashMap<>(); | |||
public double score = 0; | |||
public SemanticQueryGraph(ArrayList<SemanticUnit> suList) | |||
{ | |||
semanticUnitList = suList; //TODO: need copy? | |||
// Calculate Score by a reward function (TODO: using SVM-Rank) | |||
} | |||
public SemanticQueryGraph(SemanticQueryGraph head) | |||
{ | |||
semanticUnitList = new ArrayList<>(); | |||
for(SemanticUnit su: head.semanticUnitList) | |||
semanticUnitList.add(su.copy()); | |||
score = head.score; | |||
} | |||
public void connect(SemanticUnit u, SemanticUnit v) | |||
{ | |||
if(u.equals(v)) | |||
return; | |||
SemanticUnit su1 = null, su2 = null; | |||
for(SemanticUnit su: this.semanticUnitList) | |||
if(su.equals(u)) | |||
su1 = su; | |||
else if(su.equals(v)) | |||
su2 = su; | |||
if(su1 != null && su2 != null) | |||
if(!su1.neighborUnitList.contains(su2) && !su2.neighborUnitList.contains(su1)) | |||
{ | |||
su1.neighborUnitList.add(su2); | |||
su2.neighborUnitList.add(su1); | |||
} | |||
} | |||
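/*
 * merge(u, v) collapses node u into node v: every other node that lists u as a
 * neighbor is re-pointed at v (multi-edges are rejected for now), then u is
 * removed from the graph. E.g., merging the unit for "movie" into the unit for
 * "film" leaves one node inheriting the incoming adjacencies (illustrative).
 * */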
public void merge(SemanticUnit u, SemanticUnit v) | |||
{ | |||
SemanticUnit su1 = null, su2 = null; | |||
for(SemanticUnit su: this.semanticUnitList) | |||
if(su.equals(u)) | |||
su1 = su; | |||
else if(su.equals(v)) | |||
su2 = su; | |||
if(su1 != null && su2 != null) | |||
{ | |||
for(SemanticUnit su: this.semanticUnitList) | |||
if(su != su2 && su.neighborUnitList.contains(su1) && !su.neighborUnitList.contains(su2)) //TODO: multi-edges are currently REJECTED; SR's hash function would need modification to allow them.
su.neighborUnitList.add(su2); | |||
this.semanticUnitList.remove(su1); | |||
su2.neighborUnitList.remove(su1); | |||
} | |||
} | |||
@Override | |||
public int hashCode() { | |||
int code = 0; | |||
for(SemanticUnit su: this.semanticUnitList) | |||
code ^= su.hashCode(); | |||
return code; | |||
} | |||
@Override | |||
public boolean equals(Object o) | |||
{ | |||
if (o instanceof SemanticQueryGraph) | |||
{ | |||
int matchCnt = 0; | |||
for(SemanticUnit su1: ((SemanticQueryGraph) o).semanticUnitList) | |||
for(SemanticUnit su2: this.semanticUnitList) | |||
{ | |||
if(su1.equals(su2)) | |||
{ | |||
if(su1.neighborUnitList.containsAll(su2.neighborUnitList) && su2.neighborUnitList.containsAll(su1.neighborUnitList)) | |||
matchCnt++; | |||
} | |||
} | |||
if(matchCnt == ((SemanticQueryGraph) o).semanticUnitList.size() && matchCnt == this.semanticUnitList.size()) | |||
return true; | |||
} | |||
return false; | |||
} | |||
@Override | |||
public int compareTo(SemanticQueryGraph o) | |||
{ | |||
double diff = this.score - o.score; | |||
if (diff > 0) return -1; | |||
else if (diff < 0) return 1; | |||
else return 0; | |||
} | |||
public boolean isFinalState() | |||
{ | |||
if(semanticUnitList == null || semanticUnitList.isEmpty()) | |||
return false; | |||
// Basic assumption: a final Semantic Query Graph should be Connected. | |||
HashSet<SemanticUnit> visited = new HashSet<>(); | |||
SemanticUnit start = semanticUnitList.get(0); | |||
visited.add(start); | |||
dfs(start, visited); | |||
if(visited.size() == semanticUnitList.size()) | |||
return true; | |||
return false; | |||
} | |||
private void dfs(SemanticUnit headNode, HashSet<SemanticUnit> visited) | |||
{ | |||
for(SemanticUnit curNode: headNode.neighborUnitList) | |||
if(!visited.contains(curNode)) | |||
{ | |||
visited.add(curNode); | |||
dfs(curNode, visited); | |||
} | |||
for(SemanticUnit curNode: semanticUnitList) | |||
{ | |||
if(curNode.neighborUnitList.contains(headNode) || headNode.neighborUnitList.contains(curNode)) | |||
{ | |||
if(!visited.contains(curNode)) | |||
{ | |||
visited.add(curNode); | |||
dfs(curNode, visited); | |||
} | |||
} | |||
} | |||
} | |||
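/*
 * Score sketch for calculateScore below: score = entSco + relSco, where entSco
 * sums each node's top entity mapping (weighted x100) plus its top type mapping,
 * and relSco is the average best predicate-mapping score over all edges. The
 * x100 weight presumably reflects that entity linking is more reliable than
 * relation paraphrasing here.
 * */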
public void calculateScore(HashMap<Integer, SemanticRelation> potentialSemanticRelations) | |||
{ | |||
// 1. entity/type score | |||
double entSco = 0; | |||
for(SemanticUnit su: this.semanticUnitList) | |||
{ | |||
Word w = su.centerWord; | |||
if(w.mayEnt && w.emList.size()>0) | |||
entSco += w.emList.get(0).score * 100; | |||
if(w.mayType && w.tmList.size()>0) | |||
entSco += w.tmList.get(0).score; | |||
} | |||
// 2. relation score | |||
double relSco = 0; | |||
int relCnt = 0; | |||
for(SemanticUnit su1: this.semanticUnitList) | |||
for(SemanticUnit su2: su1.neighborUnitList) | |||
{ | |||
//Deduplicate | |||
if(su1.centerWord.position > su2.centerWord.position) | |||
continue; | |||
relCnt++; | |||
int key = su1.centerWord.getNnHead().hashCode() ^ su2.centerWord.getNnHead().hashCode(); | |||
SemanticRelation sr = potentialSemanticRelations.get(key); | |||
if(sr == null) | |||
System.err.println("No semantic relation for: " + su1 + " & " + su2); | |||
else | |||
{ | |||
relSco += sr.predicateMappings.get(0).score; | |||
semanticRelations.put(key, sr); | |||
} | |||
} | |||
if(relCnt > 0) //guard against division by zero when there are no edges
relSco /= relCnt; //average
this.score = entSco + relSco; | |||
} | |||
} |
@@ -0,0 +1,171 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
import rdf.SimpleRelation; | |||
import nlp.ds.Word; | |||
public class SemanticRelation { | |||
public Word arg1Word = null; | |||
public Word arg2Word = null; | |||
public String relationParaphrase = null; // longest match | |||
public double LongestMatchingScore = 0; // longest match score | |||
//used to distinguish copies when a semantic relation is duplicated from a special pattern
public int arg1SuffixId = 0; | |||
public int arg2SuffixId = 0; | |||
public Word arg1Word_beforeCRR = null; | |||
public Word arg2Word_beforeCRR = null; | |||
public ArrayList<PredicateMapping> predicateMappings = null; | |||
public boolean isArg1Constant = false; | |||
public boolean isArg2Constant = false; | |||
public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules | |||
public SemanticRelation dependOnSemanticRelation = null; | |||
public Word preferredSubj = null; | |||
public boolean isSteadyEdge = true; | |||
public SemanticRelation(SemanticRelation r2) { | |||
arg1Word = r2.arg1Word; | |||
arg2Word = r2.arg2Word; | |||
relationParaphrase = r2.relationParaphrase; | |||
LongestMatchingScore = r2.LongestMatchingScore; | |||
arg1SuffixId = r2.arg1SuffixId; | |||
arg2SuffixId = r2.arg2SuffixId; | |||
arg1Word_beforeCRR = r2.arg1Word_beforeCRR; | |||
arg2Word_beforeCRR = r2.arg2Word_beforeCRR; | |||
arg1Word.emList = r2.arg1Word.emList; | |||
arg2Word.emList = r2.arg2Word.emList; | |||
predicateMappings = r2.predicateMappings; | |||
// arg1Types = r2.arg1Types; | |||
// arg2Types = r2.arg2Types; | |||
isArg1Constant = r2.isArg1Constant; | |||
isArg2Constant = r2.isArg2Constant; | |||
extractingMethod = r2.extractingMethod; | |||
dependOnSemanticRelation = r2.dependOnSemanticRelation; | |||
preferredSubj = r2.preferredSubj; | |||
} | |||
public void swapArg1Arg2() | |||
{ | |||
Word tmpWord = arg1Word; | |||
arg1Word = arg2Word; | |||
arg2Word = tmpWord; | |||
int tmpSuffixId = arg1SuffixId; | |||
arg1SuffixId = arg2SuffixId; | |||
arg2SuffixId = tmpSuffixId; | |||
tmpWord = arg1Word_beforeCRR; | |||
arg1Word_beforeCRR = arg2Word_beforeCRR; | |||
arg2Word_beforeCRR = tmpWord; | |||
boolean tmpBool = isArg1Constant; | |||
isArg1Constant = isArg2Constant; | |||
isArg2Constant = tmpBool; | |||
} | |||
public SemanticRelation (SimpleRelation simr) { | |||
if (simr.preferredSubj == null) { | |||
if (simr.arg1Word.compareTo(simr.arg2Word) < 0) { | |||
this.arg1Word = simr.arg1Word; | |||
this.arg2Word = simr.arg2Word; | |||
this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR; | |||
this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR; | |||
} | |||
else { | |||
this.arg1Word = simr.arg2Word; | |||
this.arg2Word = simr.arg1Word; | |||
this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR; | |||
this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR; | |||
} | |||
this.extractingMethod = simr.extractingMethod; | |||
} | |||
else { | |||
if (simr.arg1Word == simr.preferredSubj) { | |||
this.arg1Word = simr.arg1Word; | |||
this.arg2Word = simr.arg2Word; | |||
this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR; | |||
this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR; | |||
this.preferredSubj = simr.preferredSubj; | |||
} | |||
else { | |||
this.arg1Word = simr.arg2Word; | |||
this.arg2Word = simr.arg1Word; | |||
this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR; | |||
this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR; | |||
this.preferredSubj = simr.preferredSubj; | |||
} | |||
this.extractingMethod = simr.extractingMethod; | |||
} | |||
} | |||
@Override | |||
public int hashCode() { | |||
return arg1Word.hashCode() ^ arg2Word.hashCode() + arg1SuffixId + arg2SuffixId; | |||
} | |||
@Override | |||
public boolean equals(Object o) { | |||
if (o instanceof SemanticRelation) { | |||
SemanticRelation sr2 = (SemanticRelation) o; | |||
if (this.arg1Word.equals(sr2.arg1Word) | |||
&& this.arg2Word.equals(sr2.arg2Word) | |||
&& this.arg1SuffixId == sr2.arg1SuffixId | |||
&& this.arg2SuffixId == sr2.arg2SuffixId | |||
&& this.relationParaphrase.equals(sr2.relationParaphrase) | |||
&& this.LongestMatchingScore == sr2.LongestMatchingScore) { | |||
return true; | |||
} | |||
} | |||
return false; | |||
} | |||
@Override | |||
public String toString() { | |||
return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]"; | |||
// return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]"; | |||
} | |||
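/*
 * normalizeScore rescales each mapping list (the entity mappings of both
 * arguments and the predicate mappings) so the best candidate scores 1.0,
 * making scores comparable across words and relations.
 * */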
public void normalizeScore() | |||
{ | |||
double maxScore; | |||
if (arg1Word.emList!=null && !arg1Word.emList.isEmpty()) | |||
{ | |||
maxScore=0.0; | |||
for (EntityMapping em : arg1Word.emList) | |||
maxScore = Math.max(maxScore, em.score); | |||
for (EntityMapping em : arg1Word.emList) | |||
em.score = em.score/maxScore; | |||
} | |||
if (arg2Word.emList!=null && !arg2Word.emList.isEmpty()) | |||
{ | |||
maxScore=0.0; | |||
for (EntityMapping em : arg2Word.emList) | |||
maxScore = Math.max(maxScore, em.score); | |||
for (EntityMapping em : arg2Word.emList) | |||
em.score = em.score/maxScore; | |||
} | |||
if (predicateMappings!=null && !predicateMappings.isEmpty()) | |||
{ | |||
maxScore=0.0; | |||
for (PredicateMapping pm : predicateMappings) | |||
maxScore = Math.max(maxScore, pm.score); | |||
for (PredicateMapping pm : predicateMappings) | |||
pm.score = pm.score/maxScore; | |||
} | |||
} | |||
} |
@@ -0,0 +1,61 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import rdf.SemanticRelation; | |||
import nlp.ds.DependencyTreeNode; | |||
import nlp.ds.Word; | |||
public class SemanticUnit | |||
{ | |||
public Word centerWord = null; | |||
public ArrayList<DependencyTreeNode> describeNodeList = new ArrayList<DependencyTreeNode>(); | |||
public ArrayList<SemanticUnit> neighborUnitList = new ArrayList<SemanticUnit>(); | |||
public HashMap<Word, SemanticRelation> RelationList = new HashMap<Word, SemanticRelation>(); | |||
public boolean isSubj = true; | |||
public Integer prefferdType = null; | |||
public SemanticUnit(Word center, boolean isSubJ) | |||
{ | |||
centerWord = center; | |||
isSubj = isSubJ; | |||
} | |||
public SemanticUnit copy() | |||
{ | |||
SemanticUnit su = new SemanticUnit(this.centerWord, this.isSubj); | |||
su.describeNodeList = (ArrayList<DependencyTreeNode>) this.describeNodeList.clone(); | |||
su.neighborUnitList = (ArrayList<SemanticUnit>) this.neighborUnitList.clone(); | |||
su.RelationList = (HashMap<Word, SemanticRelation>) this.RelationList.clone(); | |||
return su; | |||
} | |||
@Override | |||
public int hashCode() { | |||
return centerWord.hashCode(); | |||
} | |||
@Override | |||
public boolean equals(Object o) { | |||
if (o instanceof SemanticUnit) { | |||
SemanticUnit su2 = (SemanticUnit) o; | |||
if(this.centerWord.equals(su2.centerWord)) | |||
return true; | |||
} | |||
return false; | |||
} | |||
@Override | |||
public String toString() | |||
{ | |||
String ret = "<" + centerWord + ", {"; | |||
for(SemanticUnit su: neighborUnitList) | |||
ret += su.centerWord + ", "; | |||
ret += "}>"; | |||
return ret; | |||
} | |||
} |
@@ -0,0 +1,88 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import paradict.PredicateIDAndSupport; | |||
import qa.Globals; | |||
import nlp.ds.DependencyTree; | |||
import nlp.ds.DependencyTreeNode; | |||
import nlp.ds.Word; | |||
// allow repetition | |||
public class SimpleRelation { | |||
public Word arg1Word = null; | |||
public Word arg2Word = null; | |||
public String relationParaphrase = null; | |||
public double matchingScore = 0; | |||
public Word arg1Word_beforeCRR = null; | |||
public Word arg2Word_beforeCRR = null; | |||
public HashMap<Integer, Double> pasList = new HashMap<Integer, Double>(); | |||
public Word preferredSubj = null; | |||
public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules | |||
public SimpleRelation() | |||
{ | |||
} | |||
public SimpleRelation(SimpleRelation sr) | |||
{ | |||
arg1Word = sr.arg1Word; | |||
arg2Word = sr.arg2Word; | |||
relationParaphrase = sr.relationParaphrase; | |||
matchingScore = sr.matchingScore; | |||
arg1Word_beforeCRR = sr.arg1Word_beforeCRR; | |||
arg2Word_beforeCRR = sr.arg2Word_beforeCRR; | |||
pasList = sr.pasList; | |||
preferredSubj = sr.preferredSubj; | |||
extractingMethod = 'R'; | |||
} | |||
@Override | |||
public String toString() { | |||
return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]"; | |||
//return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]"; | |||
} | |||
public int getHashCode() { | |||
return arg1Word.hashCode() ^ arg2Word.hashCode(); | |||
} | |||
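/*
 * Scoring sketch for setPasList below: for each predicate the pattern maps to,
 * score = matchingScore * (sum of selectivities of the matched words) * support,
 * boosted by 1.5 when the predicate belongs to dbo; only the best score per
 * predicate id is kept in pasList.
 * */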
public void setPasList (String pattern, double matchingScore, boolean[] matchedFlag) { | |||
ArrayList<PredicateIDAndSupport> list = Globals.pd.nlPattern_2_predicateList.get(pattern); | |||
for (PredicateIDAndSupport pidsup : list) { | |||
double sumSelectivity = 0; | |||
for (int i = 0; i < matchedFlag.length; i ++) { | |||
if (matchedFlag[i]) { | |||
sumSelectivity += pidsup.wordSelectivity[i]; | |||
} | |||
} | |||
sumSelectivity = matchingScore*sumSelectivity*pidsup.support; | |||
int pid = pidsup.predicateID; | |||
if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; //prefer predicates in dbo; note pid may not be in dbo
if (!pasList.containsKey(pid)) | |||
pasList.put(pid, sumSelectivity); | |||
else if (sumSelectivity > pasList.get(pid)) | |||
pasList.put(pid, sumSelectivity); | |||
} | |||
} | |||
public void setPreferedSubjObjOrder(DependencyTree tree) { | |||
DependencyTreeNode n1 = tree.getNodeByIndex(this.arg1Word.position).getNNTopTreeNode(tree); | |||
DependencyTreeNode n2 = tree.getNodeByIndex(this.arg2Word.position).getNNTopTreeNode(tree); | |||
if (n1.father != null && n1.father.word.baseForm.equals("of") && n1.dep_father2child.equals("pobj")) { | |||
this.preferredSubj = this.arg1Word; | |||
} | |||
else if (n2.father != null && n2.father.word.baseForm.equals("of") && n2.dep_father2child.equals("pobj")) { | |||
this.preferredSubj = this.arg2Word; | |||
} | |||
} | |||
} |
@@ -0,0 +1,305 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
//import java.util.Comparator; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import log.QueryLogger; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Sentence.SentenceType; | |||
import qa.Globals; | |||
public class Sparql implements Comparable<Sparql> | |||
{ | |||
public ArrayList<Triple> tripleList = new ArrayList<Triple>(); | |||
public boolean countTarget = false; | |||
public String mostStr = null; | |||
public String moreThanStr = null; | |||
public double score = 0; | |||
public String questionFocus = null; // The answer variable | |||
public HashSet<String> variables = new HashSet<String>(); | |||
public enum QueryType {Select,Ask} | |||
public QueryType queryType = QueryType.Select; | |||
public HashMap<Integer, SemanticRelation> semanticRelations = null; | |||
public void addTriple(Triple t) | |||
{ | |||
if(!tripleList.contains(t)) | |||
{ | |||
tripleList.add(t); | |||
score += t.score; | |||
} | |||
} | |||
public void delTriple(Triple t) | |||
{ | |||
if(tripleList.contains(t)) | |||
{ | |||
tripleList.remove(t); | |||
score -= t.score; | |||
} | |||
} | |||
@Override | |||
public String toString() | |||
{ | |||
String ret = ""; | |||
for (Triple t : tripleList) { | |||
ret += t.toString(); | |||
ret += '\n'; | |||
} | |||
return ret; | |||
} | |||
public void deduplicate() | |||
{ | |||
HashSet<String> set = new HashSet<String>(); | |||
ArrayList<Triple> list = new ArrayList<Triple>(); | |||
for(Triple t: tripleList) | |||
{ | |||
String st = t.toStringWithoutScore(); | |||
if(set.contains(st)) | |||
list.add(t); | |||
set.add(st); | |||
} | |||
for(Triple t: list) | |||
this.delTriple(t); | |||
} | |||
// Is it a Basic Graph Pattern without filter and aggregation? | |||
public boolean isBGP() | |||
{ | |||
if(moreThanStr != null || mostStr != null || countTarget) | |||
return false; | |||
return true; | |||
} | |||
//Used for display (cannot be executed)
public String toStringForGStore() | |||
{ | |||
String ret = ""; | |||
for (Triple t : tripleList) | |||
{ | |||
// !Omit obvious LITERAL | |||
if(t.object.equals("literal_HRZ")) | |||
continue; | |||
// !Omit some bad TYPEs | |||
if(t.predicateID==Globals.pd.typePredicateID && Globals.pd.bannedTypes.contains(t.object)) | |||
continue; | |||
ret += t.toStringForGStore(); | |||
ret += '\n'; | |||
} | |||
return ret; | |||
} | |||
/**
 * @description:
 * 1. Select all variables for BGP queries to display specific information.
 * 2. Do NOT select all variables when using aggregations like "HAVING" or "COUNT" ...
 * (They may involve too many results; e.g., for "which countries have more than 1000 caves?", the caves need not be displayed.)
 * @param: NULL.
 * @return: A SPARQL query that can be executed by GStore (NO prefix on entities/predicates).
 */
public String toStringForGStore2() | |||
{ | |||
String ret = ""; | |||
variables.clear(); | |||
for(Triple t: tripleList) | |||
{ | |||
if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_")); | |||
if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_")); | |||
} | |||
if(variables.size() == 0) | |||
queryType = QueryType.Ask; | |||
// part1: select / ask ... | |||
if (queryType==QueryType.Ask) | |||
ret += "ask"; | |||
else if(countTarget) | |||
ret += ("select COUNT(DISTINCT " + questionFocus + ")"); | |||
else | |||
{ | |||
if(!isBGP()) // AGG: select question focus | |||
ret += ("select DISTINCT " + questionFocus); | |||
else // BGP: select all variables | |||
{ | |||
ret += "select DISTINCT "; | |||
for (String v : variables) | |||
ret += v + " "; | |||
} | |||
} | |||
// part2: triples | |||
ret += " where\n{\n"; | |||
for(Triple t : tripleList) | |||
{ | |||
if (!t.object.equals("literal_HRZ")) { // need not display literal | |||
ret += t.toStringForGStore(); | |||
ret += " .\n"; | |||
} | |||
} | |||
ret += "}\n"; | |||
// part3: order by / group by ... | |||
if(moreThanStr != null) | |||
ret += moreThanStr+"\n"; | |||
if(mostStr != null) | |||
ret += mostStr+"\n"; | |||
// part4: limit | |||
if(queryType != QueryType.Ask && (mostStr == null || !mostStr.contains("LIMIT"))) | |||
ret += "LIMIT " + Globals.MaxAnswerNum; | |||
return ret; | |||
} | |||
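/*
 * A hypothetical output of toStringForGStore2() for a BGP query (entity and
 * predicate names and the limit value are illustrative):
 *   select DISTINCT ?film ?director where
 *   {
 *   ?film <director> ?director .
 *   ?film <type> <Film> .
 *   }
 *   LIMIT 100
 * */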
//Used for execution (selects all variables; formats 'aggregation' and 'ask')
public String toStringForVirtuoso() | |||
{ | |||
String ret = ""; | |||
HashSet<String> variables = new HashSet<String>(); | |||
// prefix | |||
if (queryType==QueryType.Ask) | |||
ret += "ask where"; | |||
else if(countTarget) | |||
ret += ("select COUNT(DISTINCT " + questionFocus + ") where"); | |||
else | |||
{ | |||
// AGG: select question focus | |||
if(moreThanStr != null || mostStr != null) | |||
ret += ("select DISTINCT " + questionFocus + " where"); | |||
// BGP: select all variables | |||
else | |||
{ | |||
for (Triple t: tripleList) | |||
{ | |||
if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_")); | |||
if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_")); | |||
} | |||
ret += "select "; | |||
for (String v : variables) | |||
ret += v + " "; | |||
ret += "where"; | |||
} | |||
} | |||
ret += "\n{\n"; | |||
if(variables.size() == 0) | |||
variables.add(questionFocus); | |||
// triples | |||
for (Triple t : tripleList) | |||
{ | |||
if (!t.object.equals("literal_HRZ")) { | |||
ret += t.toStringForGStore(); | |||
ret += " .\n"; | |||
} | |||
} | |||
ret += "}\n"; | |||
// suffix | |||
if(moreThanStr != null) | |||
{ | |||
ret += moreThanStr+"\n"; | |||
} | |||
if(mostStr != null) | |||
{ | |||
ret += mostStr+"\n"; | |||
} | |||
return ret; | |||
} | |||
public int getVariableNumber() | |||
{ | |||
int res = 0; | |||
for (Triple t: tripleList) | |||
{ | |||
if (!t.isSubjConstant()) res++; | |||
if (!t.isObjConstant()) res++; | |||
} | |||
return res; | |||
} | |||
public void adjustTriplesOrder() | |||
{ | |||
Collections.sort(this.tripleList); | |||
} | |||
public int compareTo(Sparql o) | |||
{ | |||
double diff = this.score - o.score; | |||
if (diff > 0) | |||
return -1; | |||
else if (diff < 0) | |||
return 1; | |||
else | |||
return 0; | |||
} | |||
@Override | |||
public int hashCode() | |||
{ | |||
int key = 0; | |||
for(Triple t: this.tripleList) | |||
key ^= t.hashCode(); | |||
return key; | |||
} | |||
@Override
public boolean equals(Object spq)
{
Sparql tempSparql = (Sparql) spq;
return this.toStringForGStore2().equals(tempSparql.toStringForGStore2());
}
public Sparql(){} | |||
public Sparql(HashMap<Integer, SemanticRelation> semanticRelations) | |||
{ | |||
this.semanticRelations = semanticRelations; | |||
} | |||
public Sparql copy() | |||
{ | |||
Sparql spq = new Sparql(this.semanticRelations); | |||
for (Triple t : this.tripleList) | |||
spq.addTriple(t); | |||
return spq; | |||
} | |||
public void removeLastTriple() | |||
{ | |||
int idx = tripleList.size()-1; | |||
score -= tripleList.get(idx).score; | |||
tripleList.remove(idx); | |||
} | |||
public Sparql removeAllTypeInfo () | |||
{ | |||
score = 0; | |||
ArrayList<Triple> newTripleList = new ArrayList<Triple>(); | |||
for (Triple t : tripleList) | |||
{ | |||
if (t.predicateID != Globals.pd.typePredicateID) | |||
{ | |||
newTripleList.add(t); | |||
score += t.score; | |||
} | |||
} | |||
tripleList = newTripleList; | |||
return this; | |||
} | |||
}; |
@@ -0,0 +1,257 @@ | |||
package rdf; | |||
import nlp.ds.Word; | |||
import qa.Globals; | |||
public class Triple implements Comparable<Triple>{ | |||
public String subject = null; // subject/object after disambiguation. | |||
public String object = null; | |||
static public int TYPE_ROLE_ID = -5; | |||
static public int VAR_ROLE_ID = -2; | |||
static public int CAT_ROLE_ID = -8; // Category | |||
static public String VAR_NAME = "?xxx"; | |||
// subjId/objId: entity id | TYPE_ROLE_ID | VAR_ROLE_ID | |||
public int subjId = -1; | |||
public int objId = -1; | |||
public int predicateID = -1; | |||
public Word subjWord = null; // only be used when semRltn == null | |||
public Word objWord = null; | |||
public SemanticRelation semRltn = null; | |||
public double score = 0; | |||
public boolean isSubjObjOrderSameWithSemRltn = true; | |||
public boolean isSubjObjOrderPrefered = false; | |||
public Word typeSubjectWord = null; // for "type" triples only | |||
public Triple (Triple t) { | |||
subject = t.subject; | |||
object = t.object; | |||
subjId = t.subjId; | |||
objId = t.objId; | |||
predicateID = t.predicateID; | |||
semRltn = t.semRltn; | |||
score = t.score; | |||
isSubjObjOrderSameWithSemRltn = t.isSubjObjOrderSameWithSemRltn; | |||
isSubjObjOrderPrefered = t.isSubjObjOrderPrefered; | |||
} | |||
// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); built from one word (type variable | embedded info)
public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco) { | |||
subjId = sId; | |||
objId = oId; | |||
subject = s; | |||
predicateID = p; | |||
object = o; | |||
semRltn = sr; | |||
score = sco; | |||
} | |||
// A triple translated from a semantic relation (subject/object order can be changed later)
public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, boolean isSwap) { | |||
subjId = sId; | |||
objId = oId; | |||
subject = s; | |||
predicateID = p; | |||
object = o; | |||
semRltn = sr; | |||
score = sco; | |||
isSubjObjOrderSameWithSemRltn = isSwap; | |||
} | |||
// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); built from two words (implicit relations of a modifier)
public Triple(int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, Word subj, Word obj) { | |||
subjId = sId; | |||
objId = oId; | |||
subject = s; | |||
predicateID = p; | |||
object = o; | |||
semRltn = sr; | |||
score = sco; | |||
subjWord = subj; | |||
objWord = obj; | |||
} | |||
public Triple copy() { | |||
Triple t = new Triple(this); | |||
return t; | |||
} | |||
public Triple copySwap() { | |||
Triple t = new Triple(this); | |||
String temp; | |||
int tmpId; | |||
tmpId = t.subjId; | |||
t.subjId = t.objId; | |||
t.objId = tmpId; | |||
temp = t.subject; | |||
t.subject = t.object; | |||
t.object = temp; | |||
t.isSubjObjOrderSameWithSemRltn = !this.isSubjObjOrderSameWithSemRltn; | |||
t.isSubjObjOrderPrefered = !this.isSubjObjOrderPrefered; | |||
return t; | |||
} | |||
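// copySwap() produces the reversed-order variant used by enumerateSubjObjOrders,
// e.g. (?x <spouse> <Sigmund_Freud>) becomes (<Sigmund_Freud> <spouse> ?x) with
// both order flags negated (entity name is illustrative).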
public void addScore(double s) { | |||
score += s; | |||
} | |||
public double getScore() { | |||
return score; | |||
} | |||
@Override | |||
public int hashCode() | |||
{ | |||
return new Integer(subjId).hashCode() ^ new Integer(objId).hashCode() ^ new Integer(predicateID).hashCode(); | |||
} | |||
@Override | |||
public String toString() { | |||
return subjId+":<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> "+objId+":<" + object + ">" + " : " + score; | |||
} | |||
public String toStringForGStore() { | |||
StringBuilder sb = new StringBuilder(""); | |||
String _subject = subject; | |||
if(_subject.startsWith("?")) | |||
sb.append(_subject+"\t"); | |||
else | |||
sb.append("<" + _subject + ">\t"); | |||
sb.append("<" + Globals.pd.getPredicateById(predicateID) + ">\t"); | |||
String _object; | |||
if(predicateID == Globals.pd.typePredicateID && object.contains("|")) | |||
_object = object.substring(0, object.indexOf('|')); | |||
else | |||
_object = object; | |||
if(_object.startsWith("?")) | |||
sb.append(_object); | |||
else | |||
sb.append("<" + _object + ">"); | |||
return sb.toString().replace(' ', '_'); | |||
} | |||
public String toStringWithoutScore() { | |||
return "<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> <" + object + ">"; | |||
} | |||
public Word getSubjectWord () { | |||
if (predicateID == Globals.pd.typePredicateID) { | |||
return typeSubjectWord; | |||
} | |||
else if(semRltn == null) | |||
{ | |||
return subjWord; | |||
} | |||
else { | |||
if (isSubjObjOrderSameWithSemRltn) return semRltn.arg1Word; | |||
else return semRltn.arg2Word; | |||
} | |||
} | |||
public Word getObjectWord () { | |||
if (predicateID == Globals.pd.typePredicateID) { | |||
return typeSubjectWord; | |||
} | |||
else if(semRltn == null) | |||
{ | |||
return objWord; | |||
} | |||
else { | |||
if (isSubjObjOrderSameWithSemRltn) return semRltn.arg2Word; | |||
else return semRltn.arg1Word; | |||
} | |||
} | |||
public boolean isSubjConstant () { | |||
if (predicateID == Globals.pd.typePredicateID) { | |||
return !subject.startsWith("?"); | |||
} | |||
else { | |||
// Triple from semantic (obvious) relation | |||
if(semRltn != null) | |||
{ | |||
if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg1Constant; | |||
else return semRltn.isArg2Constant; | |||
} | |||
// Triple from implicit relation (no semantic relation), it is final triple | |||
else | |||
{ | |||
if(subjId != Triple.VAR_ROLE_ID && subjId != Triple.TYPE_ROLE_ID) | |||
return true; | |||
else | |||
return false; | |||
} | |||
} | |||
} | |||
public boolean isObjConstant () { | |||
if (predicateID == Globals.pd.typePredicateID) { | |||
return !object.startsWith("?"); | |||
} | |||
else { | |||
if(semRltn != null) | |||
{ | |||
if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg2Constant; | |||
else return semRltn.isArg1Constant; | |||
} | |||
else | |||
{ | |||
if(objId != Triple.VAR_ROLE_ID && objId != Triple.TYPE_ROLE_ID) | |||
return true; | |||
else | |||
return false; | |||
} | |||
} | |||
} | |||
public int compareTo(Triple o) | |||
{ | |||
//Order: Type, Ent&Ent, Ent&Var, Var&Var | |||
if(this.predicateID == Globals.pd.typePredicateID) | |||
{ | |||
if(o.predicateID == Globals.pd.typePredicateID) | |||
return 0; | |||
else | |||
return -1; | |||
} | |||
int cnt1 = 0, cnt2 = 0; | |||
if(!this.subject.startsWith("?")) | |||
cnt1++; | |||
if(!this.object.startsWith("?")) | |||
cnt1++; | |||
if(!o.subject.startsWith("?")) | |||
cnt2++; | |||
if(!o.object.startsWith("?")) | |||
cnt2++; | |||
if(cnt1 == cnt2) | |||
return 0; | |||
else if(cnt1 > cnt2) | |||
return -1; | |||
else | |||
return 1; | |||
} | |||
public void swapSubjObjOrder() { | |||
String temp = subject; | |||
int tmpId = subjId; | |||
subject = object; | |||
subjId = objId; | |||
object = temp; | |||
objId = tmpId; | |||
isSubjObjOrderSameWithSemRltn = !isSubjObjOrderSameWithSemRltn; | |||
} | |||
}; |
@@ -0,0 +1,53 @@ | |||
package rdf; | |||
import qa.Globals; | |||
public class TypeMapping implements Comparable<TypeMapping> | |||
{ | |||
public Integer typeID = null; | |||
public String typeName = null; | |||
public double score = 0; | |||
/*
 * 1. For a standard type (a DBO type in DBpedia), relation = typePredicateID (rdf:type)
 * 2. For a nonstandard type, typeID = -1
 * 3. If the type needs to be added into the triples, a relation is needed | e.g., Which professional surfers were born in Australia? (?uri dbo:occupation res:Surfing) relation = dbo:occupation
 * 4. If no type needs to be added, relation = -1 | e.g., Who was the father of [Queen] Elizabeth II
 * */
public int prefferdRelation = Globals.pd.typePredicateID; | |||
public TypeMapping(Integer tid, String type, double sco) | |||
{ | |||
typeID = tid; | |||
typeName = type; | |||
score = sco; | |||
} | |||
public TypeMapping(Integer tid, String type, Integer relation, double sco) | |||
{ | |||
typeID = tid; | |||
typeName = type.replace("_", ""); | |||
score = sco; | |||
prefferdRelation = relation; | |||
} | |||
// In descending order: big --> small | |||
public int compareTo(TypeMapping o) | |||
{ | |||
double diff = this.score - o.score; | |||
if (diff > 0) return -1; | |||
else if (diff < 0) return 1; | |||
else return 0; | |||
} | |||
public int hashCode() | |||
{ | |||
return typeID.hashCode(); | |||
} | |||
public String toString() | |||
{ | |||
StringBuilder res = new StringBuilder(typeName+"("+score+")"); | |||
return res.toString(); | |||
} | |||
} |
@@ -0,0 +1,91 @@ | |||
package utils; | |||
import java.io.*; | |||
import java.util.ArrayList; | |||
import java.util.HashSet; | |||
import java.util.List; | |||
import java.util.Set; | |||
public class FileUtil { | |||
public static List<String> readFile(String filePath){ | |||
List<String> lines = new ArrayList<String>(); | |||
try { | |||
BufferedReader br = new BufferedReader(new FileReader(filePath)); | |||
String line = null; | |||
while( (line = br.readLine()) != null ){ | |||
lines.add(line); | |||
} | |||
br.close(); | |||
}catch(Exception e){
e.printStackTrace();
}
// return outside finally: returning from a finally block can swallow errors
return lines;
} | |||
public static Set<String> readFileAsSet(String filePath){ | |||
Set<String> lines = new HashSet<String>(); | |||
try { | |||
BufferedReader br = new BufferedReader(new FileReader(filePath)); | |||
String line = null; | |||
while( (line = br.readLine()) != null ){ | |||
lines.add(line); | |||
} | |||
br.close(); | |||
}catch(Exception e){
e.printStackTrace();
}
// return outside finally: returning from a finally block can swallow errors
return lines;
} | |||
public static List<String> readFile(InputStream is){ | |||
List<String> lines = new ArrayList<String>(); | |||
try { | |||
BufferedReader br = new BufferedReader(new InputStreamReader(is)); | |||
String line = null; | |||
while( (line = br.readLine()) != null ){ | |||
lines.add(line); | |||
} | |||
br.close(); | |||
}catch(Exception e){
e.printStackTrace();
}
// return outside finally: returning from a finally block can swallow errors
return lines;
} | |||
public static String readFileAsALine(InputStream is){ | |||
List<String> lines = readFile(is); | |||
StringBuffer buffer = new StringBuffer(); | |||
for(String line : lines){ | |||
buffer.append(line); | |||
} | |||
return buffer.toString(); | |||
} | |||
public static void writeFile(List<String> lines, String filePath){ | |||
try{ | |||
BufferedWriter bw = new BufferedWriter(new FileWriter(filePath)); | |||
for(String line : lines){ | |||
bw.write(line+"\n"); | |||
} | |||
bw.close(); | |||
}catch(Exception e){ | |||
e.printStackTrace(); | |||
} | |||
} | |||
public static void writeFile(List<String> lines, String filePath, boolean ifContinueWrite){ | |||
try{ | |||
BufferedWriter bw = new BufferedWriter(new FileWriter(filePath, ifContinueWrite)); | |||
for(String line : lines){ | |||
bw.write(line+"\n"); | |||
} | |||
bw.close(); | |||
}catch(Exception e){ | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,114 @@ | |||
package utils; | |||
import java.io.BufferedReader; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.io.InputStreamReader; | |||
import java.io.PrintWriter; | |||
import java.net.URL; | |||
import java.net.URLConnection; | |||
import java.util.List; | |||
import java.util.Map; | |||
public class HttpRequest | |||
{ | |||
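/*
 * Minimal usage sketch (URL and parameters are hypothetical):
 *   String getResult = HttpRequest.sendGet("http://localhost:8080/search", "q=test");
 *   String postResult = HttpRequest.sendPost("http://localhost:8080/search", "q=test");
 * Both methods return the response body concatenated into a single string
 * (line breaks are dropped).
 * */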
public static String sendGet(String url, String param) { | |||
String result = ""; | |||
BufferedReader in = null; | |||
try { | |||
String urlNameString = url + "?" + param; | |||
URL realUrl = new URL(urlNameString); | |||
URLConnection connection = realUrl.openConnection(); | |||
connection.setRequestProperty("accept", "*/*"); | |||
connection.setRequestProperty("connection", "Keep-Alive"); | |||
connection.setRequestProperty("user-agent", | |||
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); | |||
connection.connect(); | |||
Map<String, List<String>> map = connection.getHeaderFields(); | |||
for (String key : map.keySet()) { | |||
System.out.println(key + "--->" + map.get(key)); | |||
} | |||
in = new BufferedReader(new InputStreamReader( | |||
connection.getInputStream())); | |||
String line; | |||
while ((line = in.readLine()) != null) { | |||
result += line; | |||
} | |||
} catch (Exception e) { | |||
System.out.println("Error when sending GET request: " + e); | |||
e.printStackTrace(); | |||
} | |||
finally { | |||
try { | |||
if (in != null) { | |||
in.close(); | |||
} | |||
} catch (Exception e2) { | |||
e2.printStackTrace(); | |||
} | |||
} | |||
return result; | |||
} | |||
public static String sendPost(String url, String param) { | |||
PrintWriter out = null; | |||
BufferedReader in = null; | |||
String result = ""; | |||
try { | |||
URL realUrl = new URL(url); | |||
URLConnection conn = realUrl.openConnection(); | |||
conn.setRequestProperty("accept", "*/*"); | |||
conn.setRequestProperty("connection", "Keep-Alive"); | |||
conn.setRequestProperty("user-agent", | |||
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); | |||
conn.setDoOutput(true); | |||
conn.setDoInput(true); | |||
out = new PrintWriter(conn.getOutputStream()); | |||
out.print(param); | |||
out.flush(); | |||
in = new BufferedReader( | |||
new InputStreamReader(conn.getInputStream())); | |||
String line; | |||
while ((line = in.readLine()) != null) { | |||
result += line; | |||
} | |||
} catch (Exception e) { | |||
System.out.println("Error when sending POST request: "+e); | |||
e.printStackTrace(); | |||
} | |||
finally{ | |||
try{ | |||
if(out!=null){ | |||
out.close(); | |||
} | |||
if(in!=null){ | |||
in.close(); | |||
} | |||
} | |||
catch(IOException ex){ | |||
ex.printStackTrace(); | |||
} | |||
} | |||
return result; | |||
} | |||
public static String getPostData(InputStream in, int size, String charset) { | |||
if (in != null && size > 0) { | |||
byte[] buf = new byte[size]; | |||
try { | |||
// read() may return fewer bytes than requested; loop until 'size' bytes or EOF
int off = 0;
while (off < size) {
int n = in.read(buf, off, size - off);
if (n < 0) break;
off += n;
}
if (charset == null || charset.length() == 0) | |||
return new String(buf); | |||
else { | |||
return new String(buf, charset); | |||
} | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
return null; | |||
} | |||
} |