
First commit

The first commit of the gAnswer project.
tags/v0.1.0
knightmarehs committed 6 years ago
commit 8a8cbbe85f
59 changed files with 12399 additions and 0 deletions
  1. src/addition/AddtionalFix.java (+238, -0)
  2. src/addition/AggregationRecognition.java (+155, -0)
  3. src/fgmt/EntityFragment.java (+312, -0)
  4. src/fgmt/Fragment.java (+8, -0)
  5. src/fgmt/RelationFragment.java (+105, -0)
  6. src/fgmt/TypeFragment.java (+179, -0)
  7. src/fgmt/VariableFragment.java (+56, -0)
  8. src/jgsc/GstoreConnector.java (+489, -0)
  9. src/lcn/BuildIndexForEntityFragments.java (+133, -0)
  10. src/lcn/BuildIndexForTypeShortName.java (+107, -0)
  11. src/lcn/EntityFragmentFields.java (+64, -0)
  12. src/lcn/EntityNameAndScore.java (+31, -0)
  13. src/lcn/Main.java (+58, -0)
  14. src/lcn/SearchInEntityFragments.java (+84, -0)
  15. src/lcn/SearchInTypeShortName.java (+176, -0)
  16. src/log/QueryLogger.java (+116, -0)
  17. src/nlp/ds/DependencyTree.java (+402, -0)
  18. src/nlp/ds/DependencyTreeNode.java (+150, -0)
  19. src/nlp/ds/Sentence.java (+88, -0)
  20. src/nlp/ds/Word.java (+126, -0)
  21. src/nlp/tool/CoreNLP.java (+202, -0)
  22. src/nlp/tool/Main.java (+42, -0)
  23. src/nlp/tool/MaltParser.java (+70, -0)
  24. src/nlp/tool/MaltParserCon.java (+73, -0)
  25. src/nlp/tool/NERecognizer.java (+53, -0)
  26. src/nlp/tool/StanfordParser.java (+51, -0)
  27. src/nlp/tool/StopWordsList.java (+614, -0)
  28. src/paradict/ParaphraseDictionary.java (+441, -0)
  29. src/paradict/PredicateIDAndSupport.java (+24, -0)
  30. src/qa/Answer.java (+105, -0)
  31. src/qa/GAnswer.java (+376, -0)
  32. src/qa/Globals.java (+118, -0)
  33. src/qa/Matches.java (+9, -0)
  34. src/qa/Query.java (+128, -0)
  35. src/qa/extract/CorefResolution.java (+153, -0)
  36. src/qa/extract/EntityRecognition.java (+918, -0)
  37. src/qa/extract/ExtractImplicitRelation.java (+467, -0)
  38. src/qa/extract/ExtractRelation.java (+472, -0)
  39. src/qa/extract/TypeRecognition.java (+358, -0)
  40. src/qa/mapping/CompatibilityChecker.java (+690, -0)
  41. src/qa/mapping/DBpediaLookup.java (+164, -0)
  42. src/qa/mapping/EntityFragmentDict.java (+44, -0)
  43. src/qa/mapping/SemanticItemMapping.java (+811, -0)
  44. src/qa/parsing/BuildQueryGraph.java (+1201, -0)
  45. src/qa/parsing/QuestionParsing.java (+208, -0)
  46. src/rdf/EntityMapping.java (+40, -0)
  47. src/rdf/ImplicitRelation.java (+77, -0)
  48. src/rdf/MergedWord.java (+41, -0)
  49. src/rdf/NodeSelectedWithScore.java (+24, -0)
  50. src/rdf/PredicateMapping.java (+28, -0)
  51. src/rdf/SemanticQueryGraph.java (+180, -0)
  52. src/rdf/SemanticRelation.java (+171, -0)
  53. src/rdf/SemanticUnit.java (+61, -0)
  54. src/rdf/SimpleRelation.java (+88, -0)
  55. src/rdf/Sparql.java (+305, -0)
  56. src/rdf/Triple.java (+257, -0)
  57. src/rdf/TypeMapping.java (+53, -0)
  58. src/utils/FileUtil.java (+91, -0)
  59. src/utils/HttpRequest.java (+114, -0)

src/addition/AddtionalFix.java (+238, -0)

@@ -0,0 +1,238 @@
package addition;

import java.util.ArrayList;
import java.util.HashMap;

import paradict.PredicateIDAndSupport;
import log.QueryLogger;
//import nlp.ds.DependencyTree;
//import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;
import nlp.ds.Sentence.SentenceType;
import qa.Globals;
//import qa.extract.TypeRecognition;
//import qa.mapping.SemanticItemMapping;
//import rdf.EntityMapping;
import rdf.SemanticUnit;
import rdf.Sparql;
import rdf.Sparql.QueryType;
import rdf.Triple;
//import fgmt.TypeFragment;


public class AddtionalFix
{
public HashMap<String, String> pattern2category = new HashMap<String, String>();
public AddtionalFix()
{
// Some category mappings for DBpedia; try automatic linking methods later. Keys are base forms.
pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters");
pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World");
pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus");
pattern2category.put("13_british_colony", "Thirteen_Colonies");
}
public void process(QueryLogger qlog)
{
fixCategory(qlog);
oneTriple(qlog);
oneNode(qlog);
//aggregation
AggregationRecognition ar = new AggregationRecognition();
ar.recognize(qlog);
//query type
decideQueryType(qlog);
}
public void decideQueryType(QueryLogger qlog)
{
for(Sparql spq: qlog.rankedSparqls)
if(qlog.s.sentenceType == SentenceType.GeneralQuestion)
spq.queryType = QueryType.Ask;
}
public void fixCategory(QueryLogger qlog)
{
if(qlog == null || qlog.semanticUnitList == null)
return;
String var = null, category = null;
for(SemanticUnit su: qlog.semanticUnitList)
{
if(su.centerWord.mayCategory)
{
var = "?"+su.centerWord.originalForm;
category = su.centerWord.category;
}
}
if(category != null && var != null)
for(Sparql spq: qlog.rankedSparqls)
{
boolean occured = false;
for(Triple tri: spq.tripleList)
{
if(tri.subject.equals(var))
{
occured = true;
break;
}
}
String oName = category;
String pName = "subject";
int pid = Globals.pd.predicate_2_id.get(pName);
Triple triple = new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100);
spq.addTriple(triple);
}
}
/* Recognize one-node queries.
 * Two cases: (1) special questions / imperative sentences; (2) general (yes/no) questions.
 * 1-1: how many [], highest [] ... | a single variable; add a constraint (aggregation).
 * 1-2: What is backgammon? | What is a bipolar syndrome? | search an entity (return itself or its type/description ...).
 * 1-3: Give me all Seven Wonders of the Ancient World. | Note: "Seven Wonders of the Ancient World" should already be recognized as ENT (in fact it is a CATEGORY in DBpedia).
 * 2-1: Are there any [castles_in_the_United_States](yago:type)?
 * 2-2: Was Sigmund Freud married? | no variable node.
 * 2-3: Are penguins endangered? | no suitable relation match; a transition is needed.
 */
public void oneNode(QueryLogger qlog)
{
if(qlog == null || qlog.semanticUnitList == null || qlog.semanticUnitList.size()>1)
return;
Word target = qlog.target;
Word[] words = qlog.s.words;
if(qlog.s.sentenceType != SentenceType.GeneralQuestion)
{
//1-1: how many [type] are there | List all [type]
if(target.mayType && target.tmList != null)
{
String subName = "?"+target.originalForm;
String typeName = target.tmList.get(0).typeName;
Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
//1-2: What is [ent]?
else if(target.mayEnt && target.emList != null)
{
if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be"))
{
int eid = target.emList.get(0).entityID;
String subName = target.emList.get(0).entityName;
Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+target.originalForm, null, target.emList.get(0).score);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
//1-3: Give me all Seven Wonders of the Ancient World.
else if(target.mayCategory && target.category != null)
{
String oName = target.category;
String pName = "subject";
int pid = Globals.pd.predicate_2_id.get(pName);
Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
else
{
if(target.mayEnt && target.emList != null)
{
//2-2:Was Sigmund Freud married?
String relMention = "";
for(Word word: words)
if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?"))
relMention += word.baseForm+" ";
if(relMention.length() > 1)
relMention = relMention.substring(0, relMention.length()-1);
ArrayList<PredicateIDAndSupport> pmList = null;
if(Globals.pd.nlPattern_2_predicateList.containsKey(relMention))
pmList = Globals.pd.nlPattern_2_predicateList.get(relMention);
if(pmList != null && pmList.size() > 0)
{
int pid = pmList.get(0).predicateID;
int eid = target.emList.get(0).entityID;
String subName = target.emList.get(0).entityName;
Triple triple = new Triple(eid, subName, pid, Triple.VAR_ROLE_ID, "?x", null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
//2-3:Are penguins endangered?
else
{
if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm))
{
String oName = pattern2category.get(words[target.position].baseForm);
String pName = "subject";
int pid = Globals.pd.predicate_2_id.get(pName);
int eid = target.emList.get(0).entityID;
String subName = target.emList.get(0).entityName;
Triple triple = new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
}
//2-1: Are there any [castles_in_the_United_States](yago:type)
else if(target.mayType && target.tmList != null)
{
String typeName = target.tmList.get(0).typeName;
String subName = "?" + target.originalForm;
//System.out.println("typeName="+typeName+" subName="+subName);
Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
}
/*
 * One triple is recognized but no suitable relation is found.
 */
public void oneTriple (QueryLogger qlog)
{
if(qlog == null || qlog.semanticUnitList == null)
return;
if(qlog.s.sentenceType == SentenceType.SpecialQuestion)
{
Word[] words = qlog.s.words;
if(qlog.semanticUnitList.size() == 2)
{
Word entWord = null, whWord = null;
for(int i=0;i<qlog.semanticUnitList.size();i++)
{
if(qlog.semanticUnitList.get(i).centerWord.baseForm.startsWith("wh"))
whWord = qlog.semanticUnitList.get(i).centerWord;
if(qlog.semanticUnitList.get(i).centerWord.mayEnt)
entWord = qlog.semanticUnitList.get(i).centerWord;
}
// 1-1: (what) is [ent] | we guess users may want the type of ent.
if(entWord!=null && whWord!= null && words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be"))
{
int eid = entWord.emList.get(0).entityID;
String subName = entWord.emList.get(0).entityName;
Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+whWord.originalForm, null, entWord.emList.get(0).score);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
}
}
}
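A minimal sketch of where this fix pass sits in the overall pipeline (hypothetical driver code, not part of this commit; the QueryLogger is assumed to arrive from the question-parsing stage with semanticUnitList and rankedSparqls already populated):

// FixPassSketch.java (illustration only)
import addition.AddtionalFix;
import log.QueryLogger;

public class FixPassSketch {
    public static void run(QueryLogger qlog) {
        AddtionalFix fix = new AddtionalFix();
        // category fix, one-triple/one-node fallbacks, aggregation,
        // then the ASK-vs-SELECT decision for every ranked SPARQL
        fix.process(qlog);
    }
}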


src/addition/AggregationRecognition.java (+155, -0)

@@ -0,0 +1,155 @@
package addition;

import nlp.ds.DependencyTree;
import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;
import qa.Globals;
import rdf.SemanticRelation;
import rdf.Sparql;
import rdf.Triple;
import log.QueryLogger;

public class AggregationRecognition {

// Numbers
static String x[]={"zero","one","two","three","four","five","six","seven","eight","nine"};
static String y[]={"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen"};
static String z[]={"twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"};
static int b;

public static Integer translateNumbers(String str) // 1~100
{
int flag;
str = str.toLowerCase(); // accept capitalized words such as "Twelve" (see main below)
try {
b=Integer.valueOf(str);
flag=1;
}
catch (Exception e){
flag=2;
}
int i,j;
switch(flag)
{
case 1:
return b;
case 2: // Words need to be translated into numbers
for(i=0;i<8;i++) // 20~99
{
for(j=0;j<10;j++)
{
String str1=z[i],str2=x[j];
if(str.equals((str1))){
return i*10+20; // exact tens word, e.g. "forty" -> 40
}
else if(str.equals((str1+" "+str2))){
return i*10+j+20;
}
}
}
for(i=0;i<10;i++){
if(str.equals(x[i])){
return i;
}
else if(str.equals(y[i])){
return 10+i;
}
}
System.out.println("Warning: Can not Translate Number: " + str);
}
return 1;
}

public void recognize(QueryLogger qlog)
{
DependencyTree ds = qlog.s.dependencyTreeStanford;
if(qlog.isMaltParserUsed)
ds = qlog.s.dependencyTreeMalt;
Word[] words = qlog.s.words;
// how often | how many
if(qlog.s.plainText.indexOf("How many")!=-1||qlog.s.plainText.indexOf("How often")!=-1||qlog.s.plainText.indexOf("how many")!=-1||qlog.s.plainText.indexOf("how often")!=-1)
{
for(Sparql sp: qlog.rankedSparqls)
{
sp.countTarget = true;
// How many pages does War and Peace have? --> res:War_and_Peace dbo:numberOfPages ?n .
// ?uri dbo:populationTotal ?inhabitants .
for(Triple triple: sp.tripleList)
{
String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase();
if(p.contains("number") || p.contains("total") || p.contains("calories") || p.contains("satellites"))
{
sp.countTarget = false;
}
}
}
}
// more than [num] [node]
for(DependencyTreeNode dtn: ds.nodesList)
{
if(dtn.word.baseForm.equals("more"))
{
if(dtn.father!=null && dtn.father.word.baseForm.equals("than"))
{
DependencyTreeNode tmp = dtn.father;
if(tmp.father!=null && tmp.father.word.posTag.equals("CD") && tmp.father.father!=null && tmp.father.father.word.posTag.startsWith("N"))
{
DependencyTreeNode target = tmp.father.father;
// Which caves have more than 3 entrances | entranceCount | filter
for(Sparql sp: qlog.rankedSparqls)
{
if(target.father !=null && target.father.word.baseForm.equals("have"))
{
sp.moreThanStr = "GROUP BY ?" + qlog.target.originalForm + "\nHAVING (COUNT(?"+target.word.originalForm + ") > "+tmp.father.word.baseForm+")";
}
else
{
int num = translateNumbers(tmp.father.word.baseForm);
sp.moreThanStr = "FILTER (?"+target.word.originalForm+"> " + num + ")";
}
}
}
}
}
}
// most
for(Word word: words)
{
if(word.baseForm.equals("most"))
{
Word modifiedWord = word.modifiedWord;
if(modifiedWord != null)
{
for(Sparql sp: qlog.rankedSparqls)
{
// Which Indian company has the most employees? --> ... dbo:numberOfEmployees ?n . || ?employees dbo:company ...
sp.mostStr = "ORDER BY DESC(COUNT(?"+modifiedWord.originalForm+"))\nOFFSET 0 LIMIT 1";
for(Triple triple: sp.tripleList)
{
String p = Globals.pd.getPredicateById(triple.predicateID).toLowerCase();
if(p.contains("number") || p.contains("total"))
{
sp.mostStr = "ORDER BY DESC(?"+modifiedWord.originalForm+")\nOFFSET 0 LIMIT 1";
}
}
}
}
}
}
}
public static void main(String[] args) {
System.out.println(translateNumbers("Twelve"));
System.out.println(translateNumbers("thirty two"));
}

}

src/fgmt/EntityFragment.java (+312, -0)

@@ -0,0 +1,312 @@
package fgmt;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;

import rdf.EntityMapping;
import lcn.EntityFragmentFields;
import lcn.EntityNameAndScore;
import lcn.SearchInEntityFragments;

public class EntityFragment extends Fragment {
public int eId;
public HashSet<Integer> inEdges = new HashSet<Integer>();
public HashSet<Integer> outEdges = new HashSet<Integer>();
public HashSet<Integer> types = new HashSet<Integer>();
// in/out entities and the connecting edges. E.g., given <eId><director><tom> and <eId><star><tom>, the outEntMap of eId contains <tom, <director, star>>.
public HashMap<Integer, ArrayList<Integer>> inEntMap = new HashMap<Integer, ArrayList<Integer>>(); // note: the input file should contain no redundant triples.
public HashMap<Integer, ArrayList<Integer>> outEntMap = new HashMap<Integer, ArrayList<Integer>>();
static double thres1 = 0.4;
static double thres2 = 0.8;
static int thres3 = 3;
static int k = 50;
/**
 * Map a mention (phrase) to candidate entities using the Lucene index.
 *
 * Rule: take the top-k results for the phrase;
 * (1) within the top-k, drop results whose score < thres1;
 * (2) beyond the top-k, keep only results whose score > thres2.
 *
 * Exact match:
 * (1) Lucene score = 1.
 * (2) string match (lowercase): edit distance <= thres3.
 *
 * Score: the Lucene score, damped by edit distance (see getScore).
 *
 * @param phrase
 * @return
 */
public static HashMap<Integer, Double> getCandEntityNames2(String phrase) {
HashMap<Integer, Double> ret = new HashMap<Integer, Double>();
ArrayList<EntityNameAndScore> list1 = getCandEntityNames_subject(phrase, thres1, thres2, k);
if(list1 == null)
return ret;
int iter_size = 0;
if (list1.size() <= k) {
iter_size = list1.size();
}
else if (list1.size() > k) {
if (list1.get(k-1).score >= thres2) {
iter_size = list1.size();
}
else {
iter_size = k;
}
}
for(int i = 0; i < iter_size; i ++) {
if (i < k) {
ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score));
}
else if (list1.get(i).score >= thres2) {
ret.put(list1.get(i).entityID, getScore(phrase, list1.get(i).entityName, list1.get(i).score));
}
else {
break;
}
}

return ret;
}
public static ArrayList<EntityMapping> getEntityMappingList (String n)
{
HashMap<Integer, Double> map = getCandEntityNames2(n);
ArrayList<EntityMapping> ret = new ArrayList<EntityMapping>();
for (int eid : map.keySet())
{
String s = EntityFragmentFields.entityId2Name.get(eid);
ret.add(new EntityMapping(eid, s, map.get(eid)));
}
Collections.sort(ret);
return ret;
}
public static double getScore (String s1, String s2, double luceneScore) {
double ret = luceneScore*100.0/(Math.log(calEditDistance(s1, s2)*1.5+1)+1);
return ret;
}
/**
* Edit distance (all lowercase)
* @param s1
* @param s2
* @return
*/
public static int calEditDistance (String s1, String s2) {
s1 = s1.toLowerCase();
s2 = s2.toLowerCase();
int d[][];
int n = s1.length();
int m = s2.length();
int i, j, temp;
char ch1, ch2;
if(n == 0) {
return m;
}
if(m == 0) {
return n;
}

d = new int[n+1][m+1];
for(i=0; i<=n; i++) {
d[i][0] = i;
}
for(j=0; j<=m; j++) {
d[0][j] = j;
}

for(i=1; i<=n; i++) {
ch1 = s1.charAt(i-1);
for(j=1; j<=m; j++) {
ch2 = s2.charAt(j-1);
if(ch1 == ch2) {
temp = 0;
} else {
temp = 1;
}
d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp);
}
}

return d[n][m];
}
private static int min(int a, int b, int c) {
int ab = a<b?a:b;
return ab<c?ab:c;
}
public static ArrayList<EntityNameAndScore> getCandEntityNames_subject(String phrase, double thres1, double thres2, int k) {
SearchInEntityFragments sf = new SearchInEntityFragments();
//System.out.println("EntityFragment.getCandEntityNames_subject() ...");
ArrayList<EntityNameAndScore> ret_sf = null;
try {
ret_sf = sf.searchName(phrase, thres1, thres2, k);
} catch (IOException e) {
//e.printStackTrace();
System.err.println("Reading lcn index error");
}
return ret_sf;
}

public static EntityFragment getEntityFragmentByEntityId(Integer entityId)
{
if(!EntityFragmentFields.entityFragmentString.containsKey(entityId))
return null;
String fgmt = EntityFragmentFields.entityFragmentString.get(entityId);
EntityFragment ef = new EntityFragment(entityId, fgmt);
return ef;
}
public static String getEntityFgmtStringByName(String entityName)
{
int id = EntityFragmentFields.entityName2Id.get(entityName);
String fgmt = EntityFragmentFields.entityFragmentString.get(id);
return fgmt;
}
public EntityFragment(int eid, String fgmt)
{
eId = eid;
fragmentType = typeEnum.ENTITY_FRAGMENT;
//eg: 11 |3961112:2881;410;,4641020:2330;,
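// Five '|'-separated fields (rewritten to '#' below):
// [0] inEntMap entries of the form "entId:edgeId;edgeId;,",
// [1] outEntMap entries in the same form,
// [2] inEdges as a comma-separated list, [3] outEdges likewise, [4] type ids likewise.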
fgmt = fgmt.replace('|', '#');
String[] fields = fgmt.split("#");
if(fields.length > 0 && fields[0].length() > 0)
{
String[] entEdgesArr = fields[0].split(",");
for(int i = 0; i < entEdgesArr.length; i ++)
{
String[] nums = entEdgesArr[i].split(":");
if(nums.length != 2)
continue;
int intEntId = Integer.valueOf(nums[0]);
String[] intEdges = nums[1].split(";");
ArrayList<Integer> intEdgeList = new ArrayList<Integer>();
for(String outEdge: intEdges)
{
intEdgeList.add(Integer.valueOf(outEdge));
}
if(intEdgeList.size()>0)
inEntMap.put(intEntId, intEdgeList);
}
}
if(fields.length > 1 && fields[1].length() > 0)
{
String[] entEdgesArr = fields[1].split(",");
for(int i = 0; i < entEdgesArr.length; i ++)
{
String[] nums = entEdgesArr[i].split(":");
if(nums.length != 2)
continue;
int outEntId = Integer.valueOf(nums[0]);
String[] outEdges = nums[1].split(";");
ArrayList<Integer> outEdgeList = new ArrayList<Integer>();
for(String outEdge: outEdges)
{
outEdgeList.add(Integer.valueOf(outEdge));
}
if(outEdgeList.size()>0)
outEntMap.put(outEntId, outEdgeList);
}
}
if(fields.length > 2 && fields[2].length() > 0) {
String[] nums = fields[2].split(",");
for(int i = 0; i < nums.length; i ++) {
if (nums[i].length() > 0) {
inEdges.add(Integer.parseInt(nums[i]));
}
}
}
if(fields.length > 3 && fields[3].length() > 0) {
String[] nums = fields[3].split(",");
for(int i = 0; i < nums.length; i ++) {
if (nums[i].length() > 0) {
outEdges.add(Integer.parseInt(nums[i]));
}
}
}
if(fields.length > 4 && fields[4].length() > 0) {
String[] nums = fields[4].split(",");
for(int i = 0; i < nums.length; i ++) {
if (nums[i].length() > 0) {
types.add(Integer.parseInt(nums[i]));
}
}
}
//TODO: patch data for DBpedia 2014 (remove when the dataset is updated)
if(eid==2640237) //Barack_Obama
{
inEdges.add(8432); //spouse
outEdges.add(8432);
ArrayList<Integer> outEdgeList = new ArrayList<Integer>();
outEdgeList.add(8432);
inEntMap.put(4953443, outEdgeList);
outEntMap.put(4953443, outEdgeList);
}
}
@Override
public String toString()
{
StringBuilder ret = new StringBuilder("");
for(Integer inEnt: inEntMap.keySet())
{
ArrayList<Integer> inEdgeList = inEntMap.get(inEnt);
if(inEdgeList==null || inEdgeList.size()==0)
continue;
ret.append(inEnt+":");
for(int inEdge: inEdgeList)
ret.append(inEdge+";");
ret.append(",");
}
ret.append('|');
for(Integer outEnt: outEntMap.keySet())
{
ArrayList<Integer> outEdgeList = outEntMap.get(outEnt);
if(outEdgeList==null || outEdgeList.size()==0)
continue;
ret.append(outEnt+":");
for(int outEdge: outEdgeList)
ret.append(outEdge+";");
ret.append(",");
}
ret.append('|');
for(Integer p : inEdges) {
ret.append(p);
ret.append(',');
}
ret.append('|');
for(Integer p : outEdges) {
ret.append(p);
ret.append(',');
}
ret.append('|');
for(Integer t : types) {
ret.append(t);
ret.append(',');
}
return ret.toString();
}
}
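For orientation, a small worked example of the scoring above (a hypothetical driver, not part of this commit; it assumes the compiled gAnswer classes are on the classpath). With luceneScore 1.0, an exact case-insensitive match (edit distance 0) scores 100.0, and the score falls as the edit distance grows:

// ScoreDemo.java (illustration only)
import fgmt.EntityFragment;

public class ScoreDemo {
    public static void main(String[] args) {
        // the comparison is lowercased, so a case-insensitive exact match has distance 0
        System.out.println(EntityFragment.calEditDistance("Berlin", "berlin")); // 0
        // luceneScore 1.0 at distance 0 gives the maximum score of 100.0
        System.out.println(EntityFragment.getScore("berlin", "Berlin", 1.0));   // 100.0
        // distance 5 ("berlin" vs "berlin wall") damps it to 100/(ln(8.5)+1), roughly 32
        System.out.println(EntityFragment.getScore("berlin", "Berlin Wall", 1.0));
    }
}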

src/fgmt/Fragment.java (+8, -0)

@@ -0,0 +1,8 @@
package fgmt;

public abstract class Fragment {
public enum typeEnum {ENTITY_FRAGMENT, RELATION_FRAGMENT, TYPE_FRAGMENT, VAR_FRAGMENT};
public typeEnum fragmentType;
public int fragmentId;
};

src/fgmt/RelationFragment.java (+105, -0)

@@ -0,0 +1,105 @@
package fgmt;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import qa.Globals;
import utils.FileUtil;

public class RelationFragment extends Fragment
{
public static HashMap<Integer, ArrayList<RelationFragment>> relFragments = null;
public static HashMap<String, ArrayList<Integer>> relationShortName2IdList = null;
public static HashSet<Integer> literalRelationSet = null;
public HashSet<Integer> inTypes = new HashSet<Integer>();
public HashSet<Integer> outTypes = new HashSet<Integer>();
public static final int literalTypeId = -176;
public RelationFragment(String inFgmt, String outFgmt, int fid)
{
fragmentId = fid;
fragmentType = typeEnum.RELATION_FRAGMENT;
String[] nums;
// in
nums = inFgmt.split(",");
for(String s: nums)
if(s.length() > 0)
inTypes.add(Integer.parseInt(s));
// out
if(outFgmt.equals("itera"))
outTypes.add(literalTypeId);
else
{
nums = outFgmt.split(",");
for(String s: nums)
if(s.length() > 0)
outTypes.add(Integer.parseInt(s));
}
}
public static void load() throws Exception
{
String filename = Globals.localPath + "data/DBpedia2016/fragments/predicate_RDF_fragment/predicate_fragment.txt";
List<String> inputs = FileUtil.readFile(filename);
relFragments = new HashMap<Integer, ArrayList<RelationFragment>>();
literalRelationSet = new HashSet<Integer>();
for(String line: inputs)
{
String[] lines = line.split("\t");
String inString = lines[0].substring(1, lines[0].length()-1);
int pid = Integer.parseInt(lines[1]);
String outString = lines[2].substring(1, lines[2].length()-1);
// Record which relations can connect LITERAL objects.
if(outString.equals("itera")) // "literal".substring(1, length()-1)
literalRelationSet.add(pid);
if(!relFragments.containsKey(pid))
relFragments.put(pid, new ArrayList<RelationFragment>());
relFragments.get(pid).add(new RelationFragment(inString, outString, pid));
}

loadId();
}
public static void loadId() throws IOException
{
String filename = Globals.localPath + "data/DBpedia2016/fragments/id_mappings/16predicate_id.txt";
List<String> inputs = FileUtil.readFile(filename);
relationShortName2IdList = new HashMap<String, ArrayList<Integer>>();

for(String line: inputs)
{
String[] lines = line.split("\t");
String rlnShortName = lines[0];
if (!relationShortName2IdList.containsKey(rlnShortName))
relationShortName2IdList.put(rlnShortName, new ArrayList<Integer>());
relationShortName2IdList.get(rlnShortName).add(Integer.parseInt(lines[1]));
}
}
public static boolean isLiteral (String p)
{
for (Integer i : relationShortName2IdList.get(p))
if (literalRelationSet.contains(i))
return true;
return false;
}
public static boolean isLiteral (int pid)
{
if (literalRelationSet.contains(pid))
return true;
else
return false;
}
}

src/fgmt/TypeFragment.java (+179, -0)

@@ -0,0 +1,179 @@
package fgmt;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import qa.Globals;


public class TypeFragment extends Fragment {

public static HashMap<Integer, TypeFragment> typeFragments = null;
public static HashMap<String, ArrayList<Integer>> typeShortName2IdList = null;
public static HashMap<Integer, String> typeId2ShortName = null;
public static final int NO_RELATION = -24232;
public static HashSet<String> yagoTypeList = null;
public HashSet<Integer> inEdges = new HashSet<Integer>();
public HashSet<Integer> outEdges = new HashSet<Integer>();
public HashSet<Integer> entSet = new HashSet<Integer>();
/*
 * Eliminate some bad YAGO types that conflict with:
 * 1. ENT: amazon, earth, the_hunger_game, sparkling_wine
 * 2. TYPE: type
 * 3. RELATION: flow, owner, series, shot, part, care
 * 4. others: peace, vice
 */
public static ArrayList<String> stopYagoTypeList = null;
static void loadStopYagoTypeList()
{
stopYagoTypeList = new ArrayList<String>();
stopYagoTypeList.add("Amazon");
stopYagoTypeList.add("Earth");
stopYagoTypeList.add("TheHungerGames");
stopYagoTypeList.add("SparklingWine");
stopYagoTypeList.add("Type");
stopYagoTypeList.add("Flow");
stopYagoTypeList.add("Owner");
stopYagoTypeList.add("Series");
stopYagoTypeList.add("Shot");
stopYagoTypeList.add("Part");
stopYagoTypeList.add("Care");
stopYagoTypeList.add("Peace");
stopYagoTypeList.add("Vice");
stopYagoTypeList.add("Dodo");
stopYagoTypeList.add("CzechFilms");
stopYagoTypeList.add("ChineseFilms");
}
public TypeFragment(String fgmt, int fid)
{
fragmentId = fid;
fragmentType = typeEnum.TYPE_FRAGMENT;
fgmt = fgmt.replace('|', '#');
String[] ss = fgmt.split("#");
String[] nums;
if (ss[0].length() > 0) {
nums = ss[0].split(",");
for(int i = 0; i < nums.length; i ++) {
if (nums[i].length() > 0) {
inEdges.add(Integer.parseInt(nums[i]));
}
}
}
else {
inEdges.add(NO_RELATION);
}

if (ss.length > 1 && ss[1].length() > 0) {
nums = ss[1].split(",");
for(int i = 0; i < nums.length; i ++) {
if (nums[i].length() > 0) {
outEdges.add(Integer.parseInt(nums[i]));
}
}
}
else {
outEdges.add(NO_RELATION);
}
if(ss.length > 2 && ss[2].length() > 0)
{
nums = ss[2].split(",");
for(int i = 0; i < nums.length; i ++) {
if (nums[i].length() > 0) {
entSet.add(Integer.parseInt(nums[i]));
}
}
}
}
public static void load() throws Exception
{
String filename = Globals.localPath+"data/DBpedia2016/fragments/class_RDF_fragment/16type_fragment.txt";
File file = new File(filename);
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8");
BufferedReader br = new BufferedReader(in);

typeFragments = new HashMap<Integer, TypeFragment>();
System.out.println("Loading type IDs and Fragments ...");
String line;
while((line = br.readLine()) != null) {
String[] lines = line.split("\t");
TypeFragment tfgmt = null;
if(lines[0].length() > 0 && !lines[0].equals("literal")) {
int tid = Integer.parseInt(lines[0]);
try{tfgmt = new TypeFragment(lines[1], tid);}
catch(Exception e){}
typeFragments.put(tid, tfgmt);
}
}
br.close();
// data fixes could be applied here if needed
// load Type Id
loadId();
System.out.println("Load "+typeId2ShortName.size()+" basic types and "+yagoTypeList.size()+" yago types.");
}
public static void loadId() throws IOException
{
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16basic_types_id.txt";
String yagoFileName = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16yago_types_list.txt";

File file = new File(filename);
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8");
BufferedReader br = new BufferedReader(in);

typeShortName2IdList = new HashMap<String, ArrayList<Integer>>();
typeId2ShortName = new HashMap<Integer, String>();

String line;
while((line = br.readLine()) != null) {
String[] lines = line.split("\t");
String typeShortName = lines[0];
// preserve typeShortName's capitalization
if (!typeShortName2IdList.containsKey(typeShortName)) {
typeShortName2IdList.put(typeShortName, new ArrayList<Integer>());
}
typeShortName2IdList.get(typeShortName).add(Integer.parseInt(lines[1]));
typeId2ShortName.put(Integer.parseInt(lines[1]), typeShortName);
}
// literalType
typeShortName2IdList.put("literal_HRZ", new ArrayList<Integer>());
typeShortName2IdList.get("literal_HRZ").add(RelationFragment.literalTypeId);
typeId2ShortName.put(RelationFragment.literalTypeId, "literal_HRZ");
br.close();
//load YAGO types
in = new InputStreamReader(new FileInputStream(yagoFileName),"utf-8");
br = new BufferedReader(in);
yagoTypeList = new HashSet<String>();
while((line = br.readLine())!=null)
{
String[] lines = line.split("\t");
String typeName = lines[0];
yagoTypeList.add(typeName);
}
loadStopYagoTypeList();
yagoTypeList.removeAll(stopYagoTypeList);
}
}

src/fgmt/VariableFragment.java (+56, -0)

@@ -0,0 +1,56 @@
package fgmt;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;

public class VariableFragment extends Fragment {
public static final int magic_number = -265;

public ArrayList<HashSet<Integer>> candTypes = null;
public HashSet<Integer> candEntities = null;
public boolean mayLiteral = false;
public VariableFragment()
{
fragmentType = typeEnum.VAR_FRAGMENT;
candTypes = new ArrayList<HashSet<Integer>>();
candEntities = new HashSet<Integer>();
}
@Override
public String toString()
{
return "("+ candEntities.size() +")";
}
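/*
 * candTypes holds alternative type-sets for this variable. A stored set that
 * contains magic_number acts as a wildcard: any overlap with s1 matches;
 * otherwise the stored set must equal s1 exactly (see containsAll below).
 */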
public boolean containsAll(HashSet<Integer> s1) {
Iterator<HashSet<Integer>> it = candTypes.iterator();
while(it.hasNext()) {
HashSet<Integer> s2 = it.next();
if (s2.contains(magic_number)) {
if (!Collections.disjoint(s1, s2)) {
return true;
}
}
else {
if (s1.containsAll(s2) && s2.containsAll(s1)) {
return true;
}
}
}
return false;
}
public boolean contains(Integer i) {
Iterator<HashSet<Integer>> it = candTypes.iterator();
while(it.hasNext()) {
HashSet<Integer> s = it.next();
if (s.contains(i)) {
return true;
}
}
return false;
}
}

src/jgsc/GstoreConnector.java (+489, -0)

@@ -0,0 +1,489 @@
package jgsc;

import java.io.*;
import java.net.*;
import java.lang.*;
import java.net.URLEncoder;
import java.net.URLDecoder;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.Map;

public class GstoreConnector {

public static final String defaultServerIP = "127.0.0.1";
public static final int defaultServerPort = 9000;

private String serverIP;
private int serverPort;
//private Socket socket = null;

public GstoreConnector() {
this.serverIP = GstoreConnector.defaultServerIP;
this.serverPort = GstoreConnector.defaultServerPort;
}

public GstoreConnector(int _port) {
this.serverIP = GstoreConnector.defaultServerIP;
this.serverPort = _port;
}

public GstoreConnector(String _ip, int _port) {
this.serverIP = _ip;
this.serverPort = _port;
}

//PERFORMANCE: what if the query result is too large? receive and save to file directly at once
//In addition, set the -Xmx larger(maybe in scale of Gs) if the query result could be very large,
//this may help to reduce the GC cost
public String sendGet(String param) {
String url = "http://" + this.serverIP + ":" + this.serverPort;
StringBuffer result = new StringBuffer();
BufferedReader in = null;
System.out.println("parameter: "+param);

try {
param = URLEncoder.encode(param, "UTF-8");
}
catch (UnsupportedEncodingException ex) {
throw new RuntimeException("Broken VM does not support UTF-8");
}

try {
String urlNameString = url + "/" + param;
System.out.println("request: "+urlNameString);
URL realUrl = new URL(urlNameString);
// open the connection to the URL
URLConnection connection = realUrl.openConnection();
// set common request properties
connection.setRequestProperty("accept", "*/*");
connection.setRequestProperty("connection", "Keep-Alive");
//set agent to avoid: speed limited by server if server think the client not a browser
connection.setRequestProperty("user-agent",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
// establish the actual connection
connection.connect();

long t0 = System.currentTimeMillis(); //ms

// get all the response header fields
Map<String, List<String>> map = connection.getHeaderFields();
// 闁秴宸婚幍锟介張澶屾畱閸濆秴绨叉径鏉戠摟濞堬拷
//for (String key : map.keySet()) {
// System.out.println(key + "--->" + map.get(key));
//}

long t1 = System.currentTimeMillis(); //ms
//System.out.println("Time to get header: "+(t1 - t0)+" ms");
//System.out.println("============================================");

// create a BufferedReader input stream to read the URL's response
in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8"));
String line;
while ((line = in.readLine()) != null) {
//PERFORMANCE: this can be very costly if result is very large, because many temporary Strings are produced
//In this case, just print the line directly will be much faster
result.append(line+"\n");
//System.out.println("get data size: " + line.length());
//System.out.println(line);
}

long t2 = System.currentTimeMillis(); //ms
//System.out.println("Time to get data: "+(t2 - t1)+" ms");
} catch (Exception e) {
System.out.println("error in get request: " + e);
e.printStackTrace();
}
// use a finally block to close the input stream
finally {
try {
if (in != null) {
in.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
}
return result.toString();
}

public void sendGet(String param, String filename) {
String url = "http://" + this.serverIP + ":" + this.serverPort;
BufferedReader in = null;
System.out.println("parameter: "+param);
if (filename == null)
return;

FileWriter fw = null;
try {
fw = new FileWriter(filename);
} catch (IOException e) {
System.out.println("can not open " + filename + "!");
}

try {
param = URLEncoder.encode(param, "UTF-8");
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException("Broken VM does not support UTF-8");
}

try {
String urlNameString = url + "/" + param;
System.out.println("request: "+urlNameString);
URL realUrl = new URL(urlNameString);
// open the connection to the URL
URLConnection connection = realUrl.openConnection();
// set common request properties
connection.setRequestProperty("accept", "*/*");
connection.setRequestProperty("connection", "Keep-Alive");
//set agent to avoid: speed limited by server if server think the client not a browser
connection.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
// establish the actual connection
connection.connect();
long t0 = System.currentTimeMillis(); //ms
// get all the response header fields
Map<String, List<String>> map = connection.getHeaderFields();
// 闁秴宸婚幍锟介張澶屾畱閸濆秴绨叉径鏉戠摟濞堬拷
//for (String key : map.keySet()) {
// System.out.println(key + "--->" + map.get(key));
//}

long t1 = System.currentTimeMillis(); // ms
//System.out.println("Time to get header: "+(t1 - t0)+" ms");

// create a BufferedReader input stream to read the URL's response
in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8"));
char chars[] = new char[2048];
int b;
while ((b = in.read(chars, 0, 2048)) != -1) {
if (fw != null)
fw.write(chars, 0, b); // write only the chars actually read, not the whole buffer
}

long t2 = System.currentTimeMillis(); //ms
//System.out.println("Time to get data: "+(t2 - t1)+" ms");
} catch (Exception e) {
//System.out.println("error in get request: " + e);
e.printStackTrace();
}
// use a finally block to close the input stream
finally {
try {
if (in != null) {
in.close();
}
if (fw != null) {
fw.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
}
return;
}


//NOTICE: no need to connect now, HTTP connection is kept by default
public boolean load(String _db_name, String _username, String _password) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.load");
return false;
}

String cmd = "?operation=load&db_name=" + _db_name + "&username=" + _username + "&password=" + _password;
String msg = this.sendGet(cmd);
//if (!send_return) {
//System.err.println("send load command error. @GstoreConnector.load");
//return false;
//}

this.disconnect();

System.out.println(msg);
if (msg.equals("load database done.")) {
return true;
}

return false;
}

public boolean unload(String _db_name,String _username, String _password) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.unload");
return false;
}

String cmd = "?operation=unload&db_name=" + _db_name + "&username=" + _username + "&password=" + _password;
String msg = this.sendGet(cmd);

this.disconnect();

System.out.println(msg);
if (msg.equals("unload database done.")) {
return true;
}

return false;
}

public boolean build(String _db_name, String _rdf_file_path, String _username, String _password) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.build");
return false;
}

//TODO: also use encode to support spaces?
//Consider change format into ?name=DBname
String cmd = "?operation=build&db_name=" + _db_name + "&ds_path=" + _rdf_file_path + "&username=" + _username + "&password=" + _password;;
String msg = this.sendGet(cmd);

this.disconnect();

System.out.println(msg);
if (msg.equals("import RDF file to database done.")) {
return true;
}

return false;
}

//TODO: not implemented
public boolean drop(String _db_name) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.drop");
return false;
}

String cmd = "drop/" + _db_name;
String msg = this.sendGet(cmd);

this.disconnect();

System.out.println(msg);
return msg.equals("drop database done.");
}

public String query(String _username, String _password, String _db_name, String _sparql) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.query");
return "connect to server error.";
}

//URL encode should be used here
//try {
//_sparql = URLEncoder.encode("\""+_sparql+"\"", "UTF-8");
//}
//catch (UnsupportedEncodingException ex) {
//throw new RuntimeException("Broken VM does not support UTF-8");
//}

String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=txt&sparql=" + _sparql;
//String cmd = "query/\"" + _sparql + "\"";
String msg = this.sendGet(cmd);

this.disconnect();

return msg;
}
public void query(String _username, String _password, String _db_name, String _sparql, String _filename) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.query");
return;
}

String cmd = "?operation=query&username=" + _username + "&password=" + _password + "&db_name=" + _db_name + "&format=json&sparql=" + _sparql;
this.sendGet(cmd, _filename);
this.disconnect();
return;
}


// public String show() {
// return this.show(false);
// }

//show all databases
public String show() {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.show");
return "connect to server error.";
}

String cmd = "?operation=show";
String msg = this.sendGet(cmd);
this.disconnect();
return msg;
}
public String user(String type, String username1, String password1, String username2, String addtion) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.show");
return "connect to server error.";
}

String cmd = "?operation=user&type=" + type + "&username1=" + username1 + "&password1=" + password1 + "&username2=" + username2 + "&addtion=" + addtion;
String msg = this.sendGet(cmd);
this.disconnect();
return msg;
}
public String showUser() {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.show");
return "connect to server error.";
}

String cmd = "?operation=showUser";
String msg = this.sendGet(cmd);
this.disconnect();
return msg;
}
public String monitor(String db_name) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.show");
return "connect to server error.";
}

String cmd = "?operation=monitor&db_name=" + db_name;
String msg = this.sendGet(cmd);
this.disconnect();
return msg;
}
public String checkpoint(String db_name) {
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.show");
return "connect to server error.";
}

String cmd = "?operation=checkpoint&db_name=" + db_name;
String msg = this.sendGet(cmd);
this.disconnect();
return msg;
}
public String test_download(String filepath)
{
boolean connect_return = this.connect();
if (!connect_return) {
System.err.println("connect to server error. @GstoreConnector.query");
return "connect to server error.";
}

//TEST: a small file, a large file
String cmd = "?operation=delete&download=true&filepath=" + filepath;
String msg = this.sendGet(cmd);

this.disconnect();

return msg;
}

private boolean connect() {
return true;
}

private boolean disconnect() {
return true;
}

private static byte[] packageMsgData(String _msg) {
//byte[] data_context = _msg.getBytes();
byte[] data_context = null;
try {
data_context = _msg.getBytes("utf-8");
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.err.println("utf-8 charset is unsupported.");
data_context = _msg.getBytes();
}
int context_len = data_context.length + 1; // 1 byte for '\0' at the end of the context.
int data_len = context_len + 4; // 4 byte for one int(data_len at the data's head).
byte[] data = new byte[data_len];

// padding head(context_len).
byte[] head = GstoreConnector.intToByte4(context_len);
for (int i = 0; i < 4; i++) {
data[i] = head[i];
}

// padding context.
for (int i = 0; i < data_context.length; i++) {
data[i + 4] = data_context[i];
}
// in C there should be a '\0' terminator at the end of a char array, so we need to add '\0' at the end of the message being sent.
data[data_len - 1] = 0;

return data;
}

private static byte[] intToByte4(int _x) // with Little Endian format.
{
byte[] ret = new byte[4];
ret[0] = (byte) (_x);
ret[1] = (byte) (_x >>> 8);
ret[2] = (byte) (_x >>> 16);
ret[3] = (byte) (_x >>> 24);

return ret;
}

private static int byte4ToInt(byte[] _b) // with Little Endian format.
{
int byte0 = _b[0] & 0xFF, byte1 = _b[1] & 0xFF, byte2 = _b[2] & 0xFF, byte3 = _b[3] & 0xFF;
int ret = (byte0) | (byte1 << 8) | (byte2 << 16) | (byte3 << 24);

return ret;
}
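// Round-trip example: intToByte4(0x01020304) yields {0x04, 0x03, 0x02, 0x01}
// (least significant byte first), and byte4ToInt turns that back into 0x01020304.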

public static void main(String[] args) {
// initialize the GStore server's IP address and port.
GstoreConnector gc = new GstoreConnector("172.31.222.90", 9001);

// build a new database by a RDF file.
// note that the relative path is related to gserver.
//gc.build("db_LUBM10", "example/rdf_triple/LUBM_10_GStore.n3");
String sparql = "select ?x where {"
+ "<Area_51> <location> ?x"
+ "}";
sparql = "select ?countries where { ?countries <type> <Country> . ?caves <type> <Cave> . ?caves <location> ?countries . } "
+ "GROUP BY ?countries HAVING(COUNT(?caves) > 1000)";
sparql = "ASK where { <Proinsulin> <type> <Protein> .}";
sparql = "select DISTINCT ?film ?budget where { ?film <type> <Film> . ?film <director> <Paul_W._S._Anderson> . ?film <budget> ?budget . }";

// boolean flag = gc.load("dbpedia16", "root", "123456");
//System.out.println(flag);
String answer = gc.query("root", "123456", "dbpedia16", sparql);
System.out.println(answer);

//To count the time cost
//long startTime=System.nanoTime(); //ns
//long startTime=System.currentTimeMillis(); //ms
//doSomeThing(); // run the code being timed
//long endTime=System.currentTimeMillis(); // get the end time
//System.out.println("Elapsed time: "+(endTime-startTime)+"ms");
}
}


src/lcn/BuildIndexForEntityFragments.java (+133, -0)

@@ -0,0 +1,133 @@
package lcn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
//import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import qa.Globals;

//import qa.Globals;

/**
 * Lucene's basic indexing unit is the Document; Fields can be added to it as needed.
 *
 * A Document is one record/entry, comparable to a row in a database table, and is what
 * the inverted index is built over. E.g., to search files on your own machine you would
 * create Fields (like table columns), combine them into Documents, and Lucene turns
 * those into index files. (This Document is not the file-system notion of a document.)
 *
 * StandardAnalyzer is Lucene's built-in "standard analyzer". It:
 * 1. tokenizes the input on whitespace;
 * 2. lowercases all letters;
 * 3. removes stop words such as "is", "the", "are", and strips punctuation.
 */
public class BuildIndexForEntityFragments{
public void indexforentity() throws Exception
{
if(EntityFragmentFields.entityId2Name == null)
EntityFragmentFields.load();
long startTime = new Date().getTime();
//Try update KB index to DBpedia2015. by husen 2016-04-08
//Try update KB index to DBpedia2016. by husen 2018-8-22
File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index");
File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt");
Analyzer luceneAnalyzer_en = new StandardAnalyzer();
IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true);
int mergeFactor = 100000; //default 10
int maxBufferedDoc = 1000; //default 10
int maxMergeDoc = Integer.MAX_VALUE; //INF
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor;
indexWriter_en.setMergeFactor(mergeFactor);
indexWriter_en.setMaxBufferedDocs(maxBufferedDoc);
indexWriter_en.setMaxMergeDocs(maxMergeDoc);
FileInputStream file = new FileInputStream(sourceDir_en);
InputStreamReader in = new InputStreamReader(file,"UTF-8");
BufferedReader br = new BufferedReader(in);
int count = 0;
while(true)
{
String _line = br.readLine();
if(_line == null)
break;
count++;
if(count % 100000 == 0)
System.out.println(count);
String line = _line;
String temp[] = line.split("\t");
if(temp.length != 2)
continue;
else
{
int entity_id = Integer.parseInt(temp[0]);
if(!EntityFragmentFields.entityId2Name.containsKey(entity_id))
continue;
String entity_name = EntityFragmentFields.entityId2Name.get(entity_id);
String entity_fragment = temp[1];
entity_name = entity_name.replace("____", " ");
entity_name = entity_name.replace("__", " ");
entity_name = entity_name.replace("_", " ");
Document document = new Document();
Field EntityName = new Field("EntityName", entity_name, Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS);
Field EntityId = new Field("EntityId", String.valueOf(entity_id),
Field.Store.YES, Field.Index.NO);
Field EntityFragment = new Field("EntityFragment", entity_fragment,
Field.Store.YES, Field.Index.NO);
document.add(EntityName);
document.add(EntityId);
document.add(EntityFragment);
indexWriter_en.addDocument(document);
}
}
indexWriter_en.optimize();
indexWriter_en.close();
br.close();

// report the time taken to build the index
long endTime = new Date().getTime();
System.out.println("entity_name index has been built -> " + count + " " + "Time:" + (endTime - startTime));
}
public static void main(String[] args)
{
BuildIndexForEntityFragments bef = new BuildIndexForEntityFragments();
try
{
Globals.localPath="D:/husen/gAnswer/";
bef.indexforentity();
}
catch (Exception e)
{
e.printStackTrace();
}
}
}



src/lcn/BuildIndexForTypeShortName.java (+107, -0)

@@ -0,0 +1,107 @@
package lcn;

import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import qa.Globals;
import fgmt.TypeFragment;

public class BuildIndexForTypeShortName {
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception
{
long startTime = new Date().getTime();
File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index");
Analyzer luceneAnalyzer_li = new StandardAnalyzer();
IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true);
int mergeFactor = 100000;
int maxBufferedDoc = 1000;
int maxMergeDoc = Integer.MAX_VALUE;
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor;
indexWriter_li.setMergeFactor(mergeFactor);
indexWriter_li.setMaxBufferedDocs(maxBufferedDoc);
indexWriter_li.setMaxMergeDocs(maxMergeDoc);
int count = 0;
Iterator<String> it = typeShortName2IdList.keySet().iterator();
while (it.hasNext())
{
String sn = it.next();
if (sn.length() == 0) {
continue;
}
count ++;
StringBuilder splittedSn = new StringBuilder("");
if(sn.contains("_"))
{
String nsn = sn.replace("_", " ");
splittedSn.append(nsn.toLowerCase());
}
else
{
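// camel-case split, e.g. "SoccerPlayer" -> "soccer player"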
int last = 0, i = 0;
for(i = 0; i < sn.length(); i ++)
{
// split before any character that is not a lowercase letter
if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z'))
{
splittedSn.append(sn.substring(last, i).toLowerCase());
splittedSn.append(' ');
last = i;
}
}
splittedSn.append(sn.substring(last, i).toLowerCase());
while(splittedSn.charAt(0) == ' ') {
splittedSn.deleteCharAt(0);
}
}
System.out.println("SplitttedType: "+splittedSn);
Document document = new Document();

Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(),
Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS);
Field TypeShortName = new Field("TypeShortName", sn,
Field.Store.YES, Field.Index.NO);
document.add(SplittedTypeShortName);
document.add(TypeShortName);
indexWriter_li.addDocument(document);
}
indexWriter_li.optimize();
indexWriter_li.close();

// report the time taken to build the index
long endTime = new Date().getTime();
System.out.println("TypeShortName index has been built -> " + count + " " + "Time:" + (endTime - startTime));
}
public static void main (String[] args) {
try {
Globals.localPath="D:/husen/gAnswer/";
TypeFragment.load();
BuildIndexForTypeShortName.buildIndex(TypeFragment.typeShortName2IdList);
} catch (Exception e) {
e.printStackTrace();
}
}

}

src/lcn/EntityFragmentFields.java (+64, -0)

@@ -0,0 +1,64 @@
package lcn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;

import qa.Globals;

public class EntityFragmentFields {
// entity dictionary
public static HashMap<String, Integer> entityName2Id = null;
public static HashMap<Integer, String> entityId2Name = null;
public static HashMap<Integer, String> entityFragmentString = null;
public static void load() throws IOException
{
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16entity_id.txt";
String fragmentFileName = Globals.localPath+"data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt";
File file = new File(filename);
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8"));

entityName2Id = new HashMap<String, Integer>();
entityId2Name = new HashMap<Integer, String>();

long t1, t2, t3;
t1 = System.currentTimeMillis();
// load entity id
System.out.println("Loading entity id ...");
String line;
while((line = br.readLine()) != null)
{
String[] lines = line.split("\t");
String entName = lines[0].substring(1, lines[0].length()-1);
entityName2Id.put(entName, Integer.parseInt(lines[1]));
entityId2Name.put(Integer.parseInt(lines[1]), entName);
}
br.close();
t2 = System.currentTimeMillis();
System.out.println("Load "+entityId2Name.size()+" entity ids in "+ (t2-t1) + "ms.");
// load entity fragment
System.out.println("Loading entity fragments ...");
br = new BufferedReader(new InputStreamReader(new FileInputStream(fragmentFileName),"utf-8"));
entityFragmentString = new HashMap<Integer, String>();
while((line = br.readLine()) != null)
{
String[] lines = line.split("\t");
if(lines.length != 2)
continue;
int eId = Integer.parseInt(lines[0]);
entityFragmentString.put(eId, lines[1]);
}
t3 = System.currentTimeMillis();
System.out.println("Load "+entityFragmentString.size()+" entity fragments in "+ (t3-t2) + "ms.");
br.close();
}
}

src/lcn/EntityNameAndScore.java (+31, -0)

@@ -0,0 +1,31 @@
package lcn;

public class EntityNameAndScore implements Comparable<EntityNameAndScore> {
public int entityID;
public String entityName;
public double score;
public EntityNameAndScore(int id, String n, double s) {
entityID = id;
entityName = n;
score = s;
}
@Override
public String toString() {
return entityID + ":<" + entityName + ">\t" + score;
}

public int compareTo(EntityNameAndScore o) {
if(this.score < o.score) {
return 1;
}
else if (this.score > o.score) {
return -1;
}
else {
return 0;
}
}

}

src/lcn/Main.java (+58, -0)

@@ -0,0 +1,58 @@
package lcn;

//import java.io.IOException;
//import java.util.ArrayList;
import java.util.ArrayList;
import java.util.Scanner;

import fgmt.EntityFragment;
import qa.Globals;
import qa.mapping.EntityFragmentDict;


public class Main {
//Test: searching Entities and Types through Lucene Index.
public static void main(String[] aStrings) throws Exception{
//SearchInLiteralSubset se = new SearchInLiteralSubset();
SearchInTypeShortName st = new SearchInTypeShortName();
SearchInEntityFragments sf = new SearchInEntityFragments();
EntityFragmentDict efd = new EntityFragmentDict();
EntityFragmentFields eff = null;
Globals.localPath = "D:/husen/gAnswer/";
Scanner sc = new Scanner(System.in);
System.out.print("input name: ");
while(sc.hasNextLine())
{
String literal = sc.nextLine();
System.out.println(literal);
//literal = cnlp.getBaseFormOfPattern(literal);
//search Type
ArrayList<String> result = st.searchType(literal, 0.4, 0.8, 10);
System.out.println("TypeShortName-->RESULT:");
for (String s : result) {
System.out.println("<"+s + ">");
}

//search Ent Fragment
// int eId = EntityFragmentFields.entityName2Id.get(literal);
// EntityFragment ef = EntityFragment.getEntityFragmentByEntityId(eId);
// System.out.println(ef);

//search Ent Name
// ArrayList<EntityNameAndScore> result = sf.searchName(literal, 0.4, 0.8, 50);
// System.out.println("EntityName-->RESULT:");
// for(EntityNameAndScore enas: result)
// {
// System.out.println(enas);
// }
System.out.print("input name: ");
}
sc.close();
}

}

src/lcn/SearchInEntityFragments.java (+84, -0)

@@ -0,0 +1,84 @@
package lcn;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import qa.Globals;


public class SearchInEntityFragments {

/*
* Search entity in Lucene
* */
public ArrayList<EntityNameAndScore> searchName(String literal, double thres1, double thres2, int k) throws IOException {
Hits hits = null;
String queryString = null;
Query query = null;
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/entity_fragment_index");
ArrayList<EntityNameAndScore> result = new ArrayList<EntityNameAndScore>();

queryString = literal;
Analyzer analyzer = new StandardAnalyzer();
try
{
QueryParser qp = new QueryParser("EntityName", analyzer);
query = qp.parse(queryString);
} catch (ParseException e)
{
e.printStackTrace();
}
if (searcher != null)
{
hits = searcher.search(query);
//System.out.println("search for entity fragment hits.length=" + hits.length());
if (hits.length() > 0)
{
//System.out.println("find " + hits.length() + " result!");
for (int i=0; i<hits.length(); i++) {
//System.out.println(i+": <"+hits.doc(i).get("EntityName") +">;"
// +hits.doc(i).get("EntityFragment")
// + "; Score: " + hits.score(i)
// + "; Score2: " + hits.score(i)*(literalLength/hits.doc(i).get("EntityName").length()));
if(i<k) {
if (hits.score(i) >= thres1) {
String en = hits.doc(i).get("EntityName");
int id = Integer.parseInt(hits.doc(i).get("EntityId"));
result.add(new EntityNameAndScore(id, en, hits.score(i)));
}
else {
break;
}
}
else {
if (hits.score(i) >= thres2) {
String en = hits.doc(i).get("EntityName");
int id = Integer.parseInt(hits.doc(i).get("EntityId"));
result.add(new EntityNameAndScore(id, en, hits.score(i)));
}
else {
break;
}
}
}
}
}
//Collections.sort(result);
return result;

}

}
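
A usage sketch, assuming Globals.localPath points at a gAnswer installation with the prebuilt entity_fragment_index (the path, thresholds and k are illustrative):

import java.util.ArrayList;
import lcn.EntityNameAndScore;
import lcn.SearchInEntityFragments;
import qa.Globals;

public class SearchNameDemo {
    public static void main(String[] args) throws Exception {
        Globals.localPath = "/path/to/gAnswer/";  // hypothetical install path
        SearchInEntityFragments sf = new SearchInEntityFragments();
        ArrayList<EntityNameAndScore> hits = sf.searchName("Julia Roberts", 0.4, 0.8, 50);
        for (EntityNameAndScore h : hits)
            System.out.println(h);
    }
}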

+ 176
- 0
src/lcn/SearchInTypeShortName.java View File

@@ -0,0 +1,176 @@
package lcn;

import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import fgmt.TypeFragment;
import qa.Globals;
import rdf.TypeMapping;

public class SearchInTypeShortName {
// get id and score -- husen
public ArrayList<TypeMapping> searchTypeScore(String s, double thres1, double thres2, int k) throws Exception
{
Hits hits = null;
String queryString = s;
Query query = null;
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index");

ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>();

Analyzer analyzer = new StandardAnalyzer();
try {
QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer);
query = qp.parse(queryString);
} catch (ParseException e) {
e.printStackTrace();
}
if (searcher != null) {
hits = searcher.search(query);
//System.out.println("find " + hits.length() + " matched type.");
if (hits.length() > 0) {
for (int i=0; i<hits.length(); i++) {
if (i < k) {
//System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
if(hits.score(i) >= thres1)
{
//System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
String type = hits.doc(i).get("TypeShortName");
System.out.println("Matched type: " + type + " : " + hits.score(i));
ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(type);
if(ret_in!=null)
{
for(Integer tid: ret_in)
{
TypeMapping typeMapping = new TypeMapping(tid, hits.doc(i).get("TypeShortName"), hits.score(i));
tmList.add(typeMapping);
}
}
}
else {
break;
}
}
else {
if(hits.score(i) >= thres2)
{
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));

ArrayList<Integer> ret_in = TypeFragment.typeShortName2IdList.get(s);
if(ret_in!=null)
{
for(Integer tid: ret_in)
{
TypeMapping typeMapping = new TypeMapping(tid, hits.doc(i).get("TypeShortName"), hits.score(i));
tmList.add(typeMapping);
}
}
}
else {
break;
}
}
}
}
}
return tmList;
}
public ArrayList<String> searchType(String s, double thres1, double thres2, int k) throws Exception
{
Hits hits = null;
String queryString = null;
Query query = null;
IndexSearcher searcher = new IndexSearcher(Globals.localPath+"data/DBpedia2016/lucene/type_fragment_index");
ArrayList<String> typeNames = new ArrayList<String>();
//String[] array = s.split(" ");
//queryString = array[array.length-1];
queryString = s;

Analyzer analyzer = new StandardAnalyzer();
try {
QueryParser qp = new QueryParser("SplittedTypeShortName", analyzer);
query = qp.parse(queryString);
} catch (ParseException e) {
e.printStackTrace();
}
if (searcher != null) {
hits = searcher.search(query);
System.out.println("find " + hits.length() + " answars!");
if (hits.length() > 0) {
for (int i=0; i<hits.length(); i++) {
if (i < k) {
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
if(hits.score(i) >= thres1){
System.out.println("Score>=thres1("+thres1+") ---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
typeNames.add(hits.doc(i).get("TypeShortName"));
//if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName"));
}
else {
//break;
}
}
else {
if(hits.score(i) >= thres2){
System.out.println("<<<<---" + hits.doc(i).get("TypeShortName") + " : " + hits.score(i));
typeNames.add(hits.doc(i).get("TypeShortName"));
//if (satisfiedStrictly(hits.doc(i).get("SplittedTypeShortName"), queryString)) typeNames.add(hits.doc(i).get("TypeShortName"));
}
else {
break;
}
}
}
}
}
return typeNames;
}
private boolean satisfiedStrictly (String splittedTypeShortName, String queryString)
{
String[] tnames = splittedTypeShortName.toLowerCase().split(" ");
String[] qnames = queryString.toLowerCase().split(" ");
for (int i = 0; i < tnames.length; i ++) {
if (tnames[i].length() == 0) continue;
boolean matched = false;
for (int j = 0; j < qnames.length; j ++) {
if (tnames[i].equals(qnames[j])) {
matched = true;
break;
}
}
if (!matched && !Globals.stopWordsList.isStopWord(tnames[i])) {
return false;
}
}
String qlast = qnames[qnames.length-1];
boolean flag = false;
for (int i = 0; i < tnames.length; i ++) {
if (tnames[i].length() == 0) continue;
if (tnames[i].equals(qlast)) {
flag = true;
break;
}
}
return flag;
}

}
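
Both methods share the two-threshold scheme documented in searchName above. A sketch of searchTypeScore, assuming the type_fragment_index and TypeFragment.typeShortName2IdList have been loaded:

import java.util.ArrayList;
import lcn.SearchInTypeShortName;
import qa.Globals;
import rdf.TypeMapping;

public class SearchTypeDemo {
    public static void main(String[] args) throws Exception {
        Globals.localPath = "/path/to/gAnswer/";  // hypothetical install path
        SearchInTypeShortName st = new SearchInTypeShortName();
        ArrayList<TypeMapping> mappings = st.searchTypeScore("film", 0.4, 0.8, 10);
        System.out.println("matched " + mappings.size() + " type ids");
    }
}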

+ 116
- 0
src/log/QueryLogger.java View File

@@ -0,0 +1,116 @@
package log;

//import java.io.File;
//import java.io.FileNotFoundException;
//import java.io.FileOutputStream;
//import java.io.OutputStreamWriter;
//import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;

import javax.servlet.http.HttpServletRequest;

//import qa.Globals;
import qa.Matches;
import qa.Query;
import rdf.EntityMapping;
import rdf.SemanticRelation;
import rdf.Sparql;
import rdf.MergedWord;
import rdf.SemanticUnit;
import qa.Answer;
import nlp.ds.Sentence;
import nlp.ds.Word;

public class QueryLogger {
public Sentence s = null;
public String ipAdress = null;
public Word target = null;
public Sparql sparql = null;
public Matches match = null;
public ArrayList<Answer> answers = null;
public boolean MODE_debug = false;
public boolean MODE_log = true;
public boolean MODE_fragment = true;
public boolean isMaltParserUsed = true; // Note: MaltParser is the default parser now, unlike the older version. TODO: some coref rules need to be changed to fit MaltParser.
public HashMap<String, Integer> timeTable = null;
public ArrayList<MergedWord> mWordList = null;
public ArrayList<SemanticUnit> semanticUnitList = null;
public HashMap<Integer, SemanticRelation> semanticRelations = null;
public HashMap<Integer, SemanticRelation> potentialSemanticRelations = null;
public HashMap<Word, ArrayList<EntityMapping>> entityDictionary = null;
public ArrayList<Sparql> rankedSparqls = null;
public String NRlog = "";
public String SQGlog = "";
public int gStoreCallTimes = 0;
public QueryLogger (Query query)
{
timeTable = new HashMap<String, Integer>();
rankedSparqls = new ArrayList<Sparql>();
mWordList = query.mWordList;
}
public void reloadSentence(Sentence sentence)
{
this.s = sentence;
if(this.semanticUnitList != null)
this.semanticUnitList.clear();
if(this.semanticRelations != null)
this.semanticRelations.clear();
if(this.rankedSparqls != null)
this.rankedSparqls.clear();
}
// Source code: http://edu.21cn.com/java/g_189_755584-1.htm
public static String getIpAddr(HttpServletRequest request) {
String ip = request.getHeader("x-forwarded-for");
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) {
ip = request.getHeader("Proxy-Client-IP");
}
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) {
ip = request.getHeader("WL-Proxy-Client-IP");
}
if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) {
ip = request.getRemoteAddr();
}
int idx;
if((idx = ip.indexOf(',')) != -1) {
ip = ip.substring(0, idx);
}
return ip;
}
public void reviseAnswers()
{
System.out.println("Revise Answers:");
answers = new ArrayList<Answer>();
if (match == null || sparql == null || match.answers == null || sparql.questionFocus == null)
return;
HashSet<Answer> answerSet = new HashSet<Answer>();
String questionFocus = sparql.questionFocus;
String sparqlString = sparql.toStringForGStore();
//System.out.println("mal="+match.answers.length);
for (int i=0;i<match.answers.length;i++)
{
Answer ans = new Answer(questionFocus, match.answers[i]);
if (!sparqlString.contains(ans.questionFocusValue))
answerSet.add(ans);
}
for (Answer ans : answerSet)
answers.add(ans);
Collections.sort(answers);
}
}
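
A sketch of the logger's intended lifecycle; the single-String Query constructor shown is an assumption, and match/sparql are normally filled in by the answering pipeline before reviseAnswers deduplicates the bindings and drops answers that merely echo a constant from the SPARQL string:

import log.QueryLogger;
import qa.Query;

public class QueryLoggerDemo {
    public static void main(String[] args) {
        Query query = new Query("Who directed Pretty_Woman?");  // assumed constructor
        QueryLogger qlog = new QueryLogger(query);
        // ... the pipeline sets qlog.match and qlog.sparql here ...
        qlog.reviseAnswers();             // fills qlog.answers, sorted
        System.out.println(qlog.answers);
    }
}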

+ 402
- 0
src/nlp/ds/DependencyTree.java View File

@@ -0,0 +1,402 @@
package nlp.ds;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Stack;

import nlp.tool.CoreNLP;
import nlp.tool.MaltParser;
import nlp.tool.StanfordParser;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.syntaxgraph.DependencyStructure;
import org.maltparser.core.syntaxgraph.node.DependencyNode;

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;

public class DependencyTree {
public DependencyTreeNode root = null;
public ArrayList<DependencyTreeNode> nodesList = null;
public SemanticGraph dependencies = null; // Method 1: CoreNLP (discarded)
public GrammaticalStructure gs = null; // Method 2: Stanford Parser
public DependencyStructure maltGraph = null; // Method 3: MaltParser
public HashMap<String, ArrayList<DependencyTreeNode>> wordBaseFormIndex = null;
public DependencyTree (Sentence sentence, CoreNLP coreNLPparser) {
SemanticGraph dependencies = coreNLPparser.getBasicDependencies(sentence.plainText);
this.dependencies = dependencies;
Stack<IndexedWord> stack = new Stack<IndexedWord>();
IndexedWord iwRoot = dependencies.getFirstRoot();
HashMap<IndexedWord, DependencyTreeNode> map = new HashMap<IndexedWord, DependencyTreeNode>();
nodesList = new ArrayList<DependencyTreeNode>();

stack.push(iwRoot);
root = this.setRoot(sentence.getWordByIndex(iwRoot.index()));
map.put(iwRoot, root);

while (!stack.empty())
{
IndexedWord curIWNode = stack.pop();
DependencyTreeNode curDTNode = map.get(curIWNode);
for (IndexedWord iwChild : dependencies.getChildList(curIWNode)) {
Word w = sentence.getWordByIndex(iwChild.index());
DependencyTreeNode newDTNode = this.insert(
curDTNode,
w,
dependencies.reln(curIWNode, iwChild).getShortName());
map.put(iwChild, newDTNode);
stack.push(iwChild);
}
curDTNode.sortChildrenList();
nodesList.add(curDTNode);
}
}
public DependencyTree (Sentence sentence, StanfordParser stanfordParser) {
this.gs = stanfordParser.getGrammaticalStructure(sentence.plainText);
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>();
nodesList = new ArrayList<DependencyTreeNode>();
List<TypedDependency> tdl = gs.typedDependencies(false);
// 1. generate all nodes.
for (TypedDependency td : tdl) {
// gov
if (!map.containsKey(td.gov().index()) && !td.reln().getShortName().equals("root")) {
Word w = sentence.getWordByIndex(td.gov().index());
DependencyTreeNode newNode = new DependencyTreeNode(w);
map.put(td.gov().index(), newNode);
nodesList.add(newNode);
}
// dep
if (!map.containsKey(td.dep().index())) {
Word w = sentence.getWordByIndex(td.dep().index());
DependencyTreeNode newNode = new DependencyTreeNode(w);
map.put(td.dep().index(), newNode);
nodesList.add(newNode);
}
}
// 2. add edges.
for (TypedDependency td : tdl) {
if (td.reln().getShortName().equals("root")) {
this.root = map.get(td.dep().index());
this.root.levelInTree = 0;
this.root.dep_father2child = "root";
}
else {
DependencyTreeNode gov = map.get(td.gov().index());
DependencyTreeNode dep = map.get(td.dep().index());
dep.father = gov;
gov.childrenList.add(dep);
dep.dep_father2child = td.reln().getShortName();
}
}
// add levelInTree, sort childrenList & nodesList
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>();
stack.push(this.root);
while (!stack.empty()) {
DependencyTreeNode dtn = stack.pop();
if (dtn.father != null) {
dtn.levelInTree = dtn.father.levelInTree + 1;
dtn.sortChildrenList();
}
for (DependencyTreeNode chd : dtn.childrenList) {
stack.push(chd);
}
}
Collections.sort(nodesList, new DependencyTreeNodeComparator());
for (DependencyTreeNode dtn : nodesList) {
dtn.linkNN(this);
}
}
public DependencyTree (Sentence sentence, MaltParser maltParser)throws MaltChainedException {
try {
// the tokens are parsed in the following line
DependencyStructure graph = maltParser.getDependencyStructure(sentence);
this.maltGraph = graph;
//System.out.println(graph);
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>();
ArrayList<DependencyTreeNode> list = new ArrayList<DependencyTreeNode>();
Stack<DependencyNode> stack = new Stack<DependencyNode>();
DependencyNode nroot = graph.getDependencyRoot();
stack.add(nroot);
// 1. generate all nodes.
while (!stack.isEmpty()) {
DependencyNode n = stack.pop();
DependencyNode sib = n.getRightmostDependent();
int key = n.getIndex();
//System.out.println("[current node][key="+key+"] "+n+" <"+n.getHeadEdge()+">");
boolean flag = true;
while (sib != null) {
flag = false;
stack.push(sib);
sib = sib.getLeftSibling();
}
if (flag) {
sib = n.getLeftmostDependent();
while (sib != null) {
stack.push(sib);
sib = sib.getRightSibling();
}
}
if (n.hasHead() && !map.containsKey(key)) {
//String snode = n.toString();
String sedge = n.getHeadEdge().toString();
//System.out.println("[" + snode + "] <" + sedge + ">");

/*int position = 0;
String wordOriginal = null;
String wordBase;
String postag = null;*/
String dep = null;
int idx1, idx2;
/*// position
idx1 = snode.indexOf("ID:")+3;
idx2 = snode.indexOf(' ', idx1);
position = Integer.parseInt(snode.substring(idx1, idx2));
// word
idx1 = snode.indexOf("FORM:", idx2)+5;
idx2 = snode.indexOf(' ', idx1);
wordOriginal = snode.substring(idx1, idx2);
wordBase = Globals.coreNLP.getBaseFormOfPattern(wordOriginal.toLowerCase());
// postag
idx1 = snode.indexOf("POSTAG:", idx2)+7;
idx2 = snode.indexOf(' ', idx1);
postag = snode.substring(idx1, idx2);*/
// dep
idx1 = sedge.lastIndexOf(':')+1;
idx2 = sedge.lastIndexOf(' ');
dep = sedge.substring(idx1, idx2);
if (dep.equals("null")) {
dep = null;
}
else if (dep.equals("punct")) {// No consider about punctuation
continue;
}
DependencyTreeNode newNode = new DependencyTreeNode(sentence.getWordByIndex(key));
newNode.dep_father2child = dep;
map.put(key, newNode);
list.add(newNode);
}
}
// 2. add edges
for (Integer k : map.keySet()) {
DependencyNode n = graph.getDependencyNode(k);
DependencyTreeNode dtn = map.get(k);
if (dtn.dep_father2child == null) {
this.setRoot(dtn);
this.root.levelInTree = 0;
this.root.dep_father2child = "root";
}
else {
DependencyTreeNode father = map.get(n.getHead().getIndex());
DependencyTreeNode child = map.get(n.getIndex());
child.father = father;
father.childrenList.add(child);
}
}
// Fix the tree for some special cases (hard-coded patch for a known mis-parse of "... film/movie ... starring ..." questions).
if(list.size() > 11)
{
DependencyTreeNode dt1 = list.get(11), dt2 = list.get(5);
if(dt1!=null && dt2!=null && dt1.word.baseForm.equals("star") && dt1.father.word.baseForm.equals("be"))
{
if (dt2.word.baseForm.equals("film") || dt2.word.baseForm.equals("movie"))
{
dt1.father.childrenList.remove(dt1);
dt1.father = dt2;
dt2.childrenList.add(dt1);
}
}
}
// add levelInTree, sort childrenList & nodesList
for (DependencyTreeNode dtn : list) {
if (dtn.father != null) {
dtn.levelInTree = dtn.father.levelInTree + 1;
dtn.sortChildrenList();
}
}
nodesList = list;
Collections.sort(nodesList, new DependencyTreeNodeComparator());
for (DependencyTreeNode dtn : nodesList) {
dtn.linkNN(this);
}
} catch (MaltChainedException e) {
//e.printStackTrace();
//System.err.println("MaltParser exception: " + e.getMessage());
throw e;
}
}
public DependencyTreeNode setRoot(Word w) {
root = new DependencyTreeNode(w, "root", null);
return root;
}
public DependencyTreeNode setRoot(DependencyTreeNode root) {
this.root = root;
return this.root;
}
public void buildWordBaseFormIndex () {
wordBaseFormIndex = new HashMap<String, ArrayList<DependencyTreeNode>>();
for (DependencyTreeNode dtn: nodesList) {
String w = dtn.word.baseForm;
if (!wordBaseFormIndex.keySet().contains(w))
wordBaseFormIndex.put(w, new ArrayList<DependencyTreeNode>());
wordBaseFormIndex.get(w).add(dtn);
}
}
public DependencyTreeNode insert(DependencyTreeNode father, Word w, String dep_father2child) {
if (father == null || w == null)
return null;
DependencyTreeNode newNode = new DependencyTreeNode(w, dep_father2child, father);
father.childrenList.add(newNode);
return newNode;
}
public DependencyTreeNode getRoot() {
return root;
}
public ArrayList<DependencyTreeNode> getNodesList(){
return nodesList;
}

public ArrayList<DependencyTreeNode> getShortestNodePathBetween(DependencyTreeNode n1, DependencyTreeNode n2)
{
if(n1 == n2) {
return new ArrayList<DependencyTreeNode>();
}
ArrayList<DependencyTreeNode> path1 = getPath2Root(n1);
ArrayList<DependencyTreeNode> path2 = getPath2Root(n2);
int idx1 = path1.size()-1;
int idx2 = path2.size()-1;
DependencyTreeNode curNode1 = path1.get(idx1);
DependencyTreeNode curNode2 = path2.get(idx2);
while (curNode1 == curNode2) {
idx1 --;
idx2 --;
if(idx1 < 0 || idx2 < 0) break;
curNode1 = path1.get(idx1);
curNode2 = path2.get(idx2);
}
ArrayList<DependencyTreeNode> shortestPath = new ArrayList<DependencyTreeNode>();
for (int i = 0; i <= idx1; i ++) {
shortestPath.add(path1.get(i));
}
for (int i = idx2+1; i >= 0; i --) {
shortestPath.add(path2.get(i));
}
System.out.println("Shortest Path between <" + n1 + "> and <" + n2 + ">:");
System.out.print("\t-");
for (DependencyTreeNode dtn : shortestPath) {
System.out.print("<" + dtn + ">-");
}
System.out.println();
return shortestPath;
}
public ArrayList<DependencyTreeNode> getPath2Root(DependencyTreeNode n1) {
ArrayList<DependencyTreeNode> path = new ArrayList<DependencyTreeNode>();
DependencyTreeNode curNode = n1;
path.add(curNode);
while (curNode.father != null) {
curNode = curNode.father;
path.add(curNode);
}
return path;
}
public ArrayList<DependencyTreeNode> getTreeNodesListContainsWords(String words) {
ArrayList<DependencyTreeNode> ret = new ArrayList<DependencyTreeNode>();
for (DependencyTreeNode dtn : nodesList) {
if (dtn.word.originalForm.equalsIgnoreCase(words)
|| dtn.word.baseForm.equalsIgnoreCase(words)
|| words.contains(dtn.word.originalForm)
|| words.contains(dtn.word.baseForm))
ret.add(dtn);
}
return ret;
}
public DependencyTreeNode getNodeByIndex (int posi) {
for (DependencyTreeNode dt : nodesList) {
if (dt.word.position == posi) {
return dt;
}
}
return null;
}
public DependencyTreeNode getFirstPositionNodeInList(ArrayList<DependencyTreeNode> list) {
int firstPosi = Integer.MAX_VALUE;
DependencyTreeNode firstNode = null;
for (DependencyTreeNode dtn : list) {
if (dtn.word.position < firstPosi) {
firstPosi = dtn.word.position;
firstNode = dtn;
}
}
return firstNode;
}
@Override
public String toString() {
String ret = "";

Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>();
stack.push(root);
while(!stack.empty()) {
DependencyTreeNode curNode = stack.pop();
for (int i = 0; i <= curNode.levelInTree; i ++)
ret += " ";
ret += "-> ";
ret += curNode.word.baseForm;
ret += "-";
ret += curNode.word.posTag;
ret += " (";
ret += curNode.dep_father2child;
ret += ")";
ret += "[" + curNode.word.position + "]\n";
for (DependencyTreeNode child : curNode.childrenList) {
stack.push(child);
}
}
return ret;
}
}
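
getShortestNodePathBetween walks both nodes' paths to the root and trims the common suffix, keeping the lowest common ancestor exactly once. A sketch, assuming Globals.init() loads the parsers as in nlp.tool.Main below (word positions are illustrative):

import nlp.ds.DependencyTree;
import nlp.ds.DependencyTreeNode;
import nlp.ds.Sentence;
import qa.Globals;

public class PathDemo {
    public static void main(String[] args) throws Exception {
        Globals.init();
        Sentence s = new Sentence("Who directed the movie starring Julia Roberts?");
        DependencyTree tree = new DependencyTree(s, Globals.stanfordParser);
        DependencyTreeNode a = tree.getNodeByIndex(2);  // "directed"
        DependencyTreeNode b = tree.getNodeByIndex(5);  // "starring"
        tree.getShortestNodePathBetween(a, b);          // also prints the path
    }
}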

+ 150
- 0
src/nlp/ds/DependencyTreeNode.java View File

@@ -0,0 +1,150 @@
package nlp.ds;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Stack;

public class DependencyTreeNode {
public Word word = null;
public String dep_father2child = null;
public DependencyTreeNode father = null;
public ArrayList<DependencyTreeNode> childrenList = null;
public int levelInTree = -1;
/**
* Constructor used when the father node is known.
*
* @param w
* @param dep_father2child
* @param father
*/
public DependencyTreeNode(Word w, String dep_father2child, DependencyTreeNode father)
{
word = w;
this.dep_father2child = dep_father2child;
this.father = father;
this.childrenList = new ArrayList<DependencyTreeNode>();
if(father==null) levelInTree = 0;
else levelInTree = father.levelInTree+1;
}

/**
* Constructor used when the father node is not yet known.
*
* @param w
*/
public DependencyTreeNode(Word w)
{
this.word = w;
this.childrenList = new ArrayList<DependencyTreeNode>();
}
public void sortChildrenList () {
childrenList.trimToSize();
Collections.sort(childrenList, new DependencyTreeNodeComparator());
}
@Override
public String toString(){
return word.originalForm + "-" + word.posTag + "(" + dep_father2child + ")[" + word.position + "]";
}
public static void sortArrayList(ArrayList<DependencyTreeNode> list) {
Collections.sort(list, new DependencyTreeNodeComparator());
}
public DependencyTreeNode containDependencyWithChildren (String dep) {
for (DependencyTreeNode son : childrenList) {
if (son.dep_father2child.equals(dep)) return son;
}
return null;
}

/**
* equal_or_startWith = true: the child's POS tag must equal posChild
* equal_or_startWith = false: the child's POS tag must start with posChild
*
* @param posChild
* @param equal_or_startWith
* @return
*/
public DependencyTreeNode containPosInChildren (String posChild, boolean equal_or_startWith) {
for (DependencyTreeNode son : childrenList) {
if (equal_or_startWith) {
if (son.word.posTag.equals(posChild)) return son;
}
else {
if (son.word.posTag.startsWith(posChild)) return son;
}
}
return null;
}
public DependencyTreeNode containWordBaseFormInChildren (String wordBaseFormChild) {
for (DependencyTreeNode son : childrenList) {
if (son.word.baseForm.equals(wordBaseFormChild)) return son;
}
return null;
}
public DependencyTreeNode getNNTopTreeNode (DependencyTree T) {
if(this.father != null && (this.dep_father2child.equals("nn") || (this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep")))) {
return this.father.getNNTopTreeNode(T);
}
else return this;
}
public Word linkNN(DependencyTree T) {
// (Now useless) backtracking the NN connections.
ArrayList<DependencyTreeNode> nn = new ArrayList<DependencyTreeNode>();
nn.add(this);

if(this.father != null && (this.dep_father2child.equals("nn")
|| (this.word.posTag.startsWith("NN") && this.dep_father2child.equals("dep") && this.father.word.posTag.startsWith("NN")))) {
nn.add(this.father);
for(DependencyTreeNode son : this.father.childrenList) {
if (son != this && son.dep_father2child.equals("nn")) {
nn.add(son);
}
}
}
Stack<DependencyTreeNode> stack = new Stack<DependencyTreeNode>();
stack.push(this);
while (!stack.empty()) {
DependencyTreeNode curNode = stack.pop();
for(DependencyTreeNode son : curNode.childrenList) {
if (son.dep_father2child.equals("nn")
|| (son.word.posTag.startsWith("NN") && son.dep_father2child.equals("dep") && son.father.word.posTag.startsWith("NN"))) {
nn.add(son);
stack.push(son);
}
}
}
DependencyTreeNode.sortArrayList(nn);

int size = nn.size() - 1;
for (int i = 0; i < size; i ++) {
nn.get(i).word.nnNext = nn.get(i+1).word;
nn.get(i+1).word.nnPrev = nn.get(i).word;
}
return this.word.getNnHead();
}

};


class DependencyTreeNodeComparator implements Comparator<DependencyTreeNode> {
public int compare(DependencyTreeNode n1, DependencyTreeNode n2) {
return n1.word.position - n2.word.position;
}
}

+ 88
- 0
src/nlp/ds/Sentence.java View File

@@ -0,0 +1,88 @@
package nlp.ds;

import java.util.ArrayList;
import java.util.HashMap;

import qa.Globals;
import qa.Query;
import rdf.MergedWord;

public class Sentence {
public String plainText = null;
public Word[] words = null;
public HashMap<String, Word> map = null;
public DependencyTree dependencyTreeStanford = null;
public DependencyTree dependencyTreeMalt = null;
public enum SentenceType {SpecialQuestion,GeneralQuestion,ImperativeSentence}
public SentenceType sentenceType = SentenceType.SpecialQuestion;
public Sentence (String s)
{
plainText = s;
words = Globals.coreNLP.getTaggedWords(plainText);
map = new HashMap<String, Word>();
for (Word w : words)
map.put(w.key, w);
}
public Sentence (Query query, String s)
{
plainText = s;
words = Globals.coreNLP.getTaggedWords(plainText);
// inherit NodeRecognition's information
for(Word word: words)
{
for(MergedWord mWord: query.mWordList)
{
if(word.originalForm.equals(mWord.name))
{
word.mayLiteral = mWord.mayLiteral;
word.mayEnt = mWord.mayEnt;
word.mayType = mWord.mayType;
word.mayCategory = mWord.mayCategory;
word.tmList = mWord.tmList;
word.emList = mWord.emList;
word.category = mWord.category;
}
}
}
map = new HashMap<String, Word>();
for (Word w : words)
map.put(w.key, w);
}
public ArrayList<Word> getWordsByString (String w) {
ArrayList<Word> ret = new ArrayList<Word>();
for (Word wo: words) {
if (wo.originalForm.equals(w)) ret.add(wo);
}
return ret;
}
public Word getWordByIndex (int idx) {
return words[idx-1];
}
public Word getWordByKey (String k) {
return map.get(k);
}
public boolean hasModifier(Word w)
{
for(Word word: words)
if(word!=w && word.modifiedWord==w)
return true;
return false;
}
public void printNERResult () {
for (Word word : words) {
System.out.print(word + " ");
System.out.println("ner=" + word.ner);
}
}
}
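
Note that getWordByIndex is 1-based, matching Word.position. A small sketch, assuming Globals.init() has loaded CoreNLP:

import nlp.ds.Sentence;
import nlp.ds.Word;
import qa.Globals;

public class SentenceDemo {
    public static void main(String[] args) {
        Globals.init();
        Sentence s = new Sentence("Who directed Philadelphia?");
        Word first = s.getWordByIndex(1);                        // 1-based: "Who"
        System.out.println(first.key);                           // Who[1]
        System.out.println(s.getWordByKey(first.key) == first);  // true
    }
}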




+ 126
- 0
src/nlp/ds/Word.java View File

@@ -0,0 +1,126 @@
package nlp.ds;

import java.util.ArrayList;

import rdf.EntityMapping;
import rdf.Triple;
import rdf.TypeMapping;

public class Word implements Comparable<Word>
{
public boolean mayCategory = false;
public boolean mayLiteral = false;
public boolean mayEnt = false;
public boolean mayType = false;
public boolean mayExtendVariable = false;
public String category = null;
public ArrayList<EntityMapping> emList = null;
public ArrayList<TypeMapping> tmList = null;
public Triple embbededTriple = null;
public String baseForm = null;
public String originalForm = null;
public String posTag = null;
public int position = -1; // Notice the first word's position = 1
public String key = null;
public boolean isCovered = false;
public boolean isIgnored = false;
//Note: these variables are unused now because a phrase that forms a node is merged into a single word.
public String ner = null; // record NER result
public Word nnNext = null;
public Word nnPrev = null;
public Word crr = null; // coreference resolution result
public Word represent = null; // This word is represented by another word, e.g., "which" in "which book is ..."
public boolean omitNode = false; // This word cannot be a node
public Word modifiedWord = null; // The word that this word modifies (a word modifies itself when it is not a modifier)
public Word (String base, String original, String pos, int posi) {
baseForm = base;
originalForm = original;
posTag = pos;
position = posi;
key = originalForm + "[" + position + "]";
}
@Override
public String toString() {
return key;
}

public int compareTo(Word another) {
return this.position-another.position;
}
@Override
public int hashCode() {
return key.hashCode();
}
@Override
public boolean equals(Object o) {
return (o instanceof Word)
&& originalForm.equals(((Word)o).originalForm)
&& position == ((Word)o).position;
}
// We now discard all NN information and return the word itself. | husen 2016
public Word getNnHead() {
Word w = this;
return w;
// if(w.mayEnt || w.mayType)
// return w;
//
// while (w.nnPrev != null) {
// w = w.nnPrev;
// }
// return w;
}
public String getFullEntityName() {
Word w = this.getNnHead();
return w.originalForm;
// if(w.mayEnt || w.mayType)
// return w.originalForm;
//
// StringBuilder sb = new StringBuilder("");
// while (w != null) {
// sb.append(w.originalForm);
// sb.append(' ');
// w = w.nnNext;
// }
// sb.deleteCharAt(sb.length()-1);
// return sb.toString();
}
public String getBaseFormEntityName() {
Word w = this.getNnHead();
if(w.mayEnt || w.mayType)
return w.baseForm;
StringBuilder sb = new StringBuilder("");
while (w != null) {
sb.append(w.baseForm);
sb.append(' ');
w = w.nnNext;
}
sb.deleteCharAt(sb.length()-1);
return sb.toString();
}
public String isNER () {
return this.getNnHead().ner;
}
public void setIsCovered () {
Word w = this.getNnHead();
while (w != null) {
w.isCovered = true;
w = w.nnNext;
}
}
}
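
Word identity hinges on the original form plus the 1-based position, so equal spellings at different positions stay distinct. A self-contained illustration (constructor arguments: base form, original form, POS tag, position):

import nlp.ds.Word;

public class WordDemo {
    public static void main(String[] args) {
        Word a = new Word("be", "is", "VBZ", 2);
        Word b = new Word("be", "is", "VBZ", 5);
        System.out.println(a.key + " vs " + b.key);  // is[2] vs is[5]
        System.out.println(a.equals(b));             // false: positions differ
        System.out.println(a.compareTo(b) < 0);      // true: sentence order
    }
}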

+ 202
- 0
src/nlp/tool/CoreNLP.java View File

@@ -0,0 +1,202 @@
package nlp.tool;

import java.util.List;
import java.util.Properties;

import nlp.ds.Word;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.util.CoreMap;

public class CoreNLP {

// CoreNLP can also recognize TIME and NUMBER (see SUTime)
private StanfordCoreNLP pipeline_lemma;
public CoreNLP () {
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
/*Properties props_all = new Properties();
props_all.put("annotators", "tokenize, ssplit, pos, lemma, parse"); // full list: "tokenize, ssplit, pos, lemma, ner, parse, dcoref"
pipeline_all = new StanfordCoreNLP(props_all);*/

Properties props_lemma = new Properties();
props_lemma.put("annotators", "tokenize, ssplit, pos, lemma");
pipeline_lemma = new StanfordCoreNLP(props_lemma);

}
// For more efficient usage, refer to "http://www.jarvana.com/jarvana/view/edu/stanford/nlp/stanford-corenlp/1.2.0/stanford-corenlp-1.2.0-javadoc.jar!/edu/stanford/nlp/process/Morphology.html"
public String getBaseFormOfPattern (String text) {
String ret = new String("");
// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
// run all Annotators on this text
pipeline_lemma.annotate(document);


// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
int count = 0;
for(CoreMap sentence: sentences) {
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
// this is the base form (lemma) of the token
String lemma = token.getString(LemmaAnnotation.class);
ret += lemma;
ret += " ";
}
count ++;
if (count % 100 == 0) {
System.out.println(count);
}
}
return ret.substring(0, ret.length()-1);
}
public SemanticGraph getBasicDependencies (String s) {
// create an empty Annotation just with the given text
Annotation document = new Annotation(s);
// run all Annotators on this text
pipeline_lemma.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// this is the Stanford dependency graph of the current sentence
SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
return dependencies;
}
return null;
}

public Tree getParseTree (String text) {
// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
// run all Annotators on this text
pipeline_lemma.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// this is the parse tree of the current sentence
return sentence.get(TreeAnnotation.class);
}
return null;
}
/**
* How to use:
* for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
* // this is the text of the token
* String word = token.get(TextAnnotation.class);
* // this is the POS tag of the token
* String pos = token.get(PartOfSpeechAnnotation.class);
* }
* @param s
* @return
*/
public CoreMap getPOS (String s) {
// create an empty Annotation just with the given text
Annotation document = new Annotation(s);
// run all Annotators on this text
pipeline_lemma.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// this is the sentence with POS Tags
return sentence;
}
return null;
}
public Word[] getTaggedWords (String sentence) {
CoreMap taggedSentence = getPOS(sentence);
Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()];
int count = 0;
for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) {
// this is the text of the token
String word = token.get(TextAnnotation.class);
// this is the POS tag of the token
String pos = token.get(PartOfSpeechAnnotation.class);
//System.out.println(word+"["+pos+"]");
ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1);
count ++;
}
return ret;
}
/*public void demo () {
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
Properties props = new Properties();
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// read some text in the text variable
String text = ... // Add your text here!
// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
// run all Annotators on this text
pipeline.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
// this is the text of the token
String word = token.get(TextAnnotation.class);
// this is the POS tag of the token
String pos = token.get(PartOfSpeechAnnotation.class);
// this is the NER label of the token
String ne = token.get(NamedEntityTagAnnotation.class);
}

// this is the parse tree of the current sentence
Tree tree = sentence.get(TreeAnnotation.class);

// this is the Stanford dependency graph of the current sentence
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
}

// This is the coreference link graph
// Each chain stores a set of mentions that link to each other,
// along with a method for getting the most representative mention
// Both sentence and token offsets start at 1!
Map<Integer, CorefChain> graph =
document.get(CorefChainAnnotation.class);
}*/
}
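
A minimal sketch of the lemma pipeline in isolation; exact lemmas depend on the tagger, so the expected output is indicative only:

import nlp.ds.Word;
import nlp.tool.CoreNLP;

public class CoreNLPDemo {
    public static void main(String[] args) {
        CoreNLP nlp = new CoreNLP();
        System.out.println(nlp.getBaseFormOfPattern("directed movies"));  // roughly: "direct movie"
        for (Word w : nlp.getTaggedWords("Who directed Philadelphia?"))
            System.out.println(w + " " + w.posTag);
    }
}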

+ 42
- 0
src/nlp/tool/Main.java View File

@@ -0,0 +1,42 @@
package nlp.tool;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import nlp.ds.DependencyTree;
import nlp.ds.Sentence;
import qa.Globals;

public class Main {
public static void main (String[] args) {
Globals.init();
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
try {
while (true) {
System.out.println("Test maltparser.");
System.out.print("Please input the NL question: ");
String question = br.readLine();
if (question.length() <= 3)
break;
try {
long t1 = System.currentTimeMillis();
Sentence s = new Sentence(question);
DependencyTree dt = new DependencyTree(s, Globals.stanfordParser);
System.out.println("====StanfordDependencies====");
System.out.println(dt);
DependencyTree dt2 = new DependencyTree(s, Globals.maltParser);
System.out.println("====MaltDependencies====");
System.out.println(dt2);
long t2 = System.currentTimeMillis();
System.out.println("time=" + (t2-t1) + "ms");
} catch (Exception e) {
e.printStackTrace();
}
}
} catch (IOException e) {
e.printStackTrace();
}
}

}

+ 70
- 0
src/nlp/tool/MaltParser.java View File

@@ -0,0 +1,70 @@
package nlp.tool;


import nlp.ds.Sentence;
import nlp.ds.Word;

import org.maltparser.MaltParserService;
import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.syntaxgraph.DependencyStructure;

import qa.Globals;

public class MaltParser {
private MaltParserService service = null;
public MaltParser() {
try
{
System.out.print("Loading MaltParser ...");
service = new MaltParserService();
// Initialize the parser model 'engmalt.linear-1.7' in parse mode, set the working directory, and log to 'parser.log'
//service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w . -lfi parser.log");
service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w "+Globals.localPath+"lib/maltparser-1.9.1 -lfi parser.log");
firstParse();
System.out.println("ok!");
} catch (MaltChainedException e) {
e.printStackTrace();
System.err.println("MaltParser exception: " + e.getMessage());
}
}
private void firstParse() {
String[] tokens = new String[12];
tokens[0] = "1\tIn\t_\tIN\tIN\t_";
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_";
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_";
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_";
tokens[4] = "5\tby\t_\tIN\tIN\t_";
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_";
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_";
tokens[7] = "8\twas\t_\tVBD\tVBD\t_";
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_";
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_";
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_";
tokens[11] = "12\t?\t_\t.\t.\t_";
try {
service.parse(tokens);
} catch (MaltChainedException e) {
e.printStackTrace();
}
}
public DependencyStructure getDependencyStructure (Sentence sentence) {
try {
return service.parse(getTaggedTokens(sentence));
} catch (MaltChainedException e) {
e.printStackTrace();
}
return null;
}
private String[] getTaggedTokens (Sentence sentence) {
String[] ret = new String[sentence.words.length];
int count = 0;
for (Word w : sentence.words) {
ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_");
count ++;
}
return ret;
}
}
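
getTaggedTokens feeds MaltParser one CoNLL-style line per token: index, surface form, lemma placeholder, coarse POS, fine POS, and features, tab-separated; the warm-up question in firstParse is exactly this format. For reference, a hand-built array for a short input:

public class MaltTokensDemo {
    public static void main(String[] args) {
        // CoNLL-style lines: index \t form \t lemma \t POS \t POS \t features
        String[] tokens = {
            "1\tWho\t_\tWP\tWP\t_",
            "2\tdirected\t_\tVBD\tVBD\t_",
            "3\t?\t_\t.\t.\t_"
        };
        for (String t : tokens)
            System.out.println(t);
    }
}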

+ 73
- 0
src/nlp/tool/MaltParserCon.java View File

@@ -0,0 +1,73 @@
package nlp.tool;

import java.io.File;
import java.net.URL;

import nlp.ds.Sentence;
import nlp.ds.Word;

import org.maltparser.concurrent.ConcurrentMaltParserModel;
import org.maltparser.concurrent.ConcurrentMaltParserService;
import org.maltparser.concurrent.graph.ConcurrentDependencyGraph;
import org.maltparser.core.exception.MaltChainedException;
//import org.maltparser.core.syntaxgraph.DependencyStructure;


public class MaltParserCon {
private ConcurrentMaltParserModel model = null;
public ConcurrentDependencyGraph outputGraph = null;
public MaltParserCon(){
try{
System.out.println("Loading Maltparser...\n");
URL ModelURL = new File("output/engmalt.linear-1.7.mco").toURI().toURL();
model = ConcurrentMaltParserService.initializeParserModel(ModelURL);
firstTest();
System.out.println("ok!\n");
}catch(Exception e){
e.printStackTrace();
System.err.println("MaltParser exception: " + e.getMessage());
}
}
private void firstTest(){
String[] tokens = new String[12];
tokens[0] = "1\tIn\t_\tIN\tIN\t_";
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_";
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_";
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_";
tokens[4] = "5\tby\t_\tIN\tIN\t_";
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_";
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_";
tokens[7] = "8\twas\t_\tVBD\tVBD\t_";
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_";
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_";
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_";
tokens[11] = "12\t?\t_\t.\t.\t_";
try {
outputGraph = model.parse(tokens);
} catch (Exception e) {
e.printStackTrace();
}
System.out.println(outputGraph);
}
public ConcurrentDependencyGraph getDependencyStructure (Sentence sentence) {
try {
return model.parse(getTaggedTokens(sentence));
} catch (MaltChainedException e) {
e.printStackTrace();
}
return null;
}
private String[] getTaggedTokens (Sentence sentence) {
String[] ret = new String[sentence.words.length];
int count = 0;
for (Word w : sentence.words) {
ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_");
count ++;
}
return ret;
}
}

+ 53
- 0
src/nlp/tool/NERecognizer.java View File

@@ -0,0 +1,53 @@
package nlp.tool;

import java.util.List;

import qa.Globals;

import nlp.ds.Sentence;
import nlp.ds.Word;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation;
import edu.stanford.nlp.ling.CoreLabel;

public class NERecognizer {
static String serializedClassifier;
static AbstractSequenceClassifier<CoreLabel> classifier;
//public static String localPath="E:\\Hanshuo\\gAnswer\\";
public NERecognizer() {
serializedClassifier = Globals.localPath+"lib/stanford-ner-2012-11-11/classifiers/english.all.3class.distsim.crf.ser.gz";
classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
}
/*public NERecognizer(String basePath, boolean flag) {
serializedClassifier = "WEB-INF\\lib\\stanford-ner-2012-11-11\\stanford-ner-2012-11-11\\classifiers\\english.all.3class.distsim.crf.ser.gz";
}*/
public void recognize(Sentence sentence) {
List<CoreLabel> lcl = classifier.classify(sentence.plainText).get(0);
for (CoreLabel cl : lcl) {
int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1;
Word w = sentence.getWordByIndex(position);
String ner = cl.get(AnswerAnnotation.class);
if (ner.equals("O")) w.ner = null;
else w.ner = ner;
}
}
public static void main(String[] args) {
System.out.println("Test NER");
Globals.init();
Sentence s = new Sentence("I go to school at Stanford University, which is located in California.");//"Which states of Germany are governed by the Social Democratic Party?"
Globals.nerRecognizer.recognize(s);
for (Word word : s.words) {
System.out.print(word + " ");
System.out.println("ner=" + word.ner);
}
}
}

+ 51
- 0
src/nlp/tool/StanfordParser.java View File

@@ -0,0 +1,51 @@
package nlp.tool;

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class StanfordParser {
private LexicalizedParser lp;
private TokenizerFactory<CoreLabel> tokenizerFactory;
private TreebankLanguagePack tlp;
private GrammaticalStructureFactory gsf;
public StanfordParser() {
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
tlp = new PennTreebankLanguagePack();
gsf = tlp.grammaticalStructureFactory();
}
public GrammaticalStructure getGrammaticalStructure (String sentence) {
List<CoreLabel> rawWords2 =
tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
// Converts a Sentence/List/String into a Tree.
// In all circumstances, the input will be treated as a single sentence to be parsed.
Tree parse = lp.apply(rawWords2);

return gsf.newGrammaticalStructure(parse);
/*List<TypedDependency> tdl = gs.typedDependencies(false);
for (TypedDependency td : tdl) {
System.out.println(td.reln().getShortName()+"("+td.gov()+","+td.dep()+")");
System.out.println("gov="+td.gov()
+"\tgov.index="
+td.gov().index()
+"\tgov.value="
+td.gov().value()
+"\tgov.pos="
+((TreeGraphNode)td.gov().parent()).value());
}*/
//System.out.println(tdl);
}
}
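
The commented-out block above shows the intended consumption pattern; the same iteration, written out as a runnable sketch:

import java.util.List;

import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;
import nlp.tool.StanfordParser;

public class StanfordParserDemo {
    public static void main(String[] args) {
        StanfordParser parser = new StanfordParser();
        GrammaticalStructure gs = parser.getGrammaticalStructure("Who directed Philadelphia?");
        List<TypedDependency> tdl = gs.typedDependencies(false);
        for (TypedDependency td : tdl)
            System.out.println(td.reln().getShortName() + "(" + td.gov() + "," + td.dep() + ")");
    }
}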

+ 614
- 0
src/nlp/tool/StopWordsList.java View File

@@ -0,0 +1,614 @@
package nlp.tool;

import java.util.HashSet;
import java.util.Arrays;

public class StopWordsList {
public static HashSet<String> sw_list = new HashSet<String>();
public StopWordsList() {
initiate();
}
public void initiate() {
sw_list.addAll(Arrays.asList(sw_array));
// some commas
/*sw_list.add(".");
sw_list.add(",");
sw_list.add(";");
sw_list.add("?");
sw_list.add("!");
sw_list.add(":");
sw_list.add("(");
sw_list.add(")");
sw_list.add("-");*/
}
/**
* To judge whether a word is a stop-word
* @param word_lowercase: the word, should be in lower-case
* @return if the word is a stop-word, then true; otherwise, false.
*/
public boolean isStopWord(String word_lowercase) {
return sw_list.contains(word_lowercase);
}
private static final String sw_array[] = new String[]{
"a",
"able",
"about",
"across",
"after",
"all",
"almost",
"also",
"am",
"among",
"an",
"and",
"any",
"are",
"as",
"at",
//"be",
"because",
"been",
"but",
"by",
"can",
"cannot",
"could",
"dear",
"did",
"do",
"does",
"either",
"else",
"ever",
"every",
"for",
"from",
"get",
"got",
"had",
"has",
"have",
"he",
"her",
"hers",
"him",
"his",
"how",
"however",
"i",
"if",
"in",
"into",
"is",
"it",
"its",
"just",
"least",
"let",
"like",
"likely",
"may",
"me",
"might",
"most",
"must",
"my",
"neither",
"no",
"nor",
"not",
"of",
"off",
"often",
"on",
"only",
"or",
"other",
"our",
"own",
"rather",
"said",
"say",
"says",
"she",
"should",
"since",
"so",
"some",
"than",
"that",
"the",
"their",
"them",
"then",
"there",
"these",
"they",
"this",
"tis",
"to",
"too",
"twas",
"us",
"wants",
"was",
"we",
"were",
"what",
"when",
"where",
"which",
"while",
"who",
"whom",
"why",
"will",
"with",
"would",
"yet",
"you",
"your"
};
};

/*// stop word 308

// http://norm.al/2009/04/14/list-of-english-stop-words/

private static final String sw_array[] = new String[]{
"a",
"about",
"above",
"across",
"after",
"afterwards",
"again",
"against",
"all",
"almost",
"alone",
"along",
"already",
"also",
"although",
"always",
"am",
"among",
"amongst",
"amoungst",
"amount",
"an",
"and",
"another",
"any",
"anyhow",
"anyone",
"anything",
"anyway",
"anywhere",
"are",
"around",
"as",
"at",
"back",
"be",
"became",
"because",
"become",
"becomes",
"becoming",
"been",
"before",
"beforehand",
"behind",
"being",
"below",
"beside",
"besides",
"between",
"beyond",
"bill",
"both",
"bottom",
"but",
"by",
"call",
"can",
"cannot",
"cant",
"co",
"computer",
"con",
"could",
"couldnt",
"cry",
"de",
"describe",
"detail",
"do",
"did",
"done",
"down",
"due",
"during",
"each",
"eg",
"eight",
"either",
"eleven",
"else",
"elsewhere",
"empty",
"enough",
"etc",
"even",
"ever",
"every",
"everyone",
"everything",
"everywhere",
"except",
"few",
"fifteen",
"fify",
"fill",
"find",
"fire",
"first",
"five",
"for",
"former",
"formerly",
"forty",
"found",
"four",
"from",
"front",
"full",
"further",
"get",
"give",
"go",
"had",
"has",
"hasnt",
"have",
"he",
"hence",
"her",
"here",
"here",
"hereafter",
"hereby",
"herein",
"hereupon",
"hers",
"herself",
"him",
"himself",
"his",
"how",
"however",
"hundred",
"i",
"ie",
"if",
"in",
"inc",
"indeed",
"interest",
"into",
"is",
"it",
"its",
"itself",
"keep",
"last",
"latter",
"latterly",
"least",
"less",
"ltd",
"made",
"many",
"may",
"me",
"meanwhile",
"might",
"mill",
"mine",
"more",
"moreover",
"most",
"mostly",
"move",
"much",
"must",
"my",
"myself",
"name",
"namely",
"neither",
"never",
"nevertheless",
"next",
"nine",
"no",
"nobody",
"none",
"noone",
"nor",
"not",
"nothing",
"now",
"nowhere",
"of",
"off",
"often",
"on",
"once",
"one",
"only",
"onto",
"or",
"other",
"others",
"otherwise",
"our",
"ours",
"ourselves",
"out",
"over",
"own",
"part",
"per",
"perhaps",
"please",
"put",
"rather",
"re",
"same",
"see",
"seem",
"seemed",
"seeming",
"seems",
"serious",
"several",
"she",
"should",
"show",
"side",
"since",
"sincere",
"six",
"sixty",
"so",
"some",
"somehow",
"someone",
"something",
"sometime",
"sometimes",
"somewhere",
"still",
"such",
"system",
"take",
"ten",
"than",
"that",
"the",
"their",
"them",
"themselves",
"then",
"thence",
"there",
"thereafter",
"thereby",
"therefore",
"therein",
"thereupon",
"these",
"they",
"thick",
"thin",
"third",
"this",
"those",
"though",
"throughout",
"thru",
"thus",
"to",
"together",
"too",
"top",
"toward",
"towards",
"twelve",
"twenty",
"two",
"un",
"under",
"until",
"up",
"upon",
"us",
"very",
"via",
"was",
"we",
"we",
"well",
"were",
"what",
"whatever",
"when",
"whence",
"whenever",
"where",
"whereafter",
"whereas",
"whereby",
"wherein",
"whereupon",
"wherever",
"whether",
"which",
"while",
"whither",
"who",
"whoever",
"whole",
"whom",
"whose",
"why",
"will",
"with",
"within",
"without",
"would",
"yet",
"you",
"your",
"yours",
"yourself",
"yourselves"
};
*/


/* // stop words 119

// http://www.textfixer.com/resources/common-english-words.txt
private static final String sw_array[] = new String[]{
"a",
"able",
"about",
"across",
"after",
"all",
"almost",
"also",
"am",
"among",
"an",
"and",
"any",
"are",
"as",
"at",
"be",
"because",
"been",
"but",
"by",
"can",
"cannot",
"could",
"dear",
"did",
"do",
"does",
"either",
"else",
"ever",
"every",
"for",
"from",
"get",
"got",
"had",
"has",
"have",
"he",
"her",
"hers",
"him",
"his",
"how",
"however",
"i",
"if",
"in",
"into",
"is",
"it",
"its",
"just",
"least",
"let",
"like",
"likely",
"may",
"me",
"might",
"most",
"must",
"my",
"neither",
"no",
"nor",
"not",
"of",
"off",
"often",
"on",
"only",
"or",
"other",
"our",
"own",
"rather",
"said",
"say",
"says",
"she",
"should",
"since",
"so",
"some",
"than",
"that",
"the",
"their",
"them",
"then",
"there",
"these",
"they",
"this",
"tis",
"to",
"too",
"twas",
"us",
"wants",
"was",
"we",
"were",
"what",
"when",
"where",
"which",
"while",
"who",
"whom",
"why",
"will",
"with",
"would",
"yet",
"you",
"your"
};
*/
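
Usage of the active list above is straightforward; note the caller is expected to lower-case the word first, as the javadoc says:

import nlp.tool.StopWordsList;

public class StopWordsDemo {
    public static void main(String[] args) {
        StopWordsList sw = new StopWordsList();
        System.out.println(sw.isStopWord("the"));                  // true
        System.out.println(sw.isStopWord("Movie".toLowerCase()));  // false: not in the list
    }
}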

+ 441
- 0
src/paradict/ParaphraseDictionary.java View File

@@ -0,0 +1,441 @@
package paradict;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;




import nlp.tool.CoreNLP;
import qa.Globals;

public class ParaphraseDictionary {
public static String localDataPath;
public static String dbpedia_relation_paraphrases_baseform_withScore;
public static String dbpedia_relation_paraphrases_baseform_withScore_rerank;
public static String dbpedia_relation_paraphrases_handwrite;
public static String dbpedia_predicate_id;
public static String dbpedia_dbo_predicate;

public HashMap<String, Integer> predicate_2_id = null;
public HashMap<Integer, String> id_2_predicate = null;
public HashSet<Integer> dbo_predicate_id = null;
public HashMap<String, ArrayList<PredicateIDAndSupport>> nlPattern_2_predicateList = null;
public HashMap<String, ArrayList<String>> invertedIndex = null;
public HashSet<String> relns_subject;
public HashSet<String> relns_object;
public HashSet<String> prepositions;
public HashSet<String> bannedTypes;
//public final int typePredicateID = 1541; //dbpedia2015 <type>=1541
public final int typePredicateID = 5157; //DBpedia 2016 <type>
public int totalPredCount = 0;
public int paraphrasedPredCount = 0;
public int lineCount = 0;
/**
* Constructor: loads the predicate-ID mappings and DBO predicates, reads the paraphrase dictionary, and builds the inverted index.
*/
public ParaphraseDictionary () {
String fixedPath = Globals.localPath;

System.out.println(System.getProperty("user.dir"));
localDataPath = fixedPath + "data/DBpedia2016/parapharse/";
dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt";
dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt";
dbpedia_predicate_id = localDataPath + "16predicate_id.txt";
dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt";
bannedTypes = new HashSet<String>();
bannedTypes.add("Mayor");
relns_subject = new HashSet<String>();
relns_subject.add("subj");
relns_subject.add("csubjpass");
relns_subject.add("csubj");
relns_subject.add("xsubj");
relns_subject.add("nsubjpass");
relns_subject.add("nsubj");
relns_subject.add("poss"); // Obama's wife
relns_subject.add("dobj");
relns_object = new HashSet<String>();
relns_object.add("dobj");
relns_object.add("iobj");
relns_object.add("obj");
relns_object.add("pobj");
prepositions = new HashSet<String>();
prepositions.add("in");//in at on with to from before after of for
prepositions.add("at");
prepositions.add("on");
prepositions.add("with");
prepositions.add("to");
prepositions.add("from");
prepositions.add("before");
prepositions.add("after");
prepositions.add("of");
prepositions.add("for");
prepositions.add("as");

try {
loadPredicateId();
loadDboPredicate();
loadParaDict();
buildInvertedIndex();
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Load the mapping between predicates and their IDs.
* @throws IOException
*/
public void loadPredicateId () throws IOException {
predicate_2_id = new HashMap<String, Integer>();
id_2_predicate = new HashMap<Integer, String>();
String input_filename = dbpedia_predicate_id;
File file = new File(input_filename);
InputStreamReader in = null;
BufferedReader br = null;
try{
in = new InputStreamReader(new FileInputStream(file), "utf-8");
br = new BufferedReader(in);
String line = null;
while ((line = br.readLine())!= null) {
String[] lines = line.split("\t");
predicate_2_id.put(lines[0], Integer.parseInt(lines[1]));
id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]);
}
}catch(IOException e){
System.out.println("NLPatterns.loadPredicateId() : IOException!");
e.printStackTrace();
}finally{
if(br != null){
try{
br.close();
}catch(IOException e){
e.printStackTrace();
}
}
}
System.out.println("NLPatterns.loadPredicateId() : ok!");
}
public void loadDboPredicate() throws IOException
{
dbo_predicate_id = new HashSet<Integer>();
int cnt = 0;
String input_filename = dbpedia_dbo_predicate;
InputStreamReader in = null;
BufferedReader br = null;
try{
File file = new File(input_filename);
in = new InputStreamReader(new FileInputStream(file), "utf-8");
br = new BufferedReader(in);
String line = null;
while ((line = br.readLine())!= null)
{
if (!predicate_2_id.containsKey(line))
{
cnt++;
//System.out.println("error: not found "+line+" id.");
continue;
}
dbo_predicate_id.add(predicate_2_id.get(line));
}
}catch(IOException e){
System.out.println("NLPatterns.loadDboPredicate() : IOException!");
}finally{
if(br!=null){
try{
br.close();
}catch(IOException e){
e.printStackTrace();
}
}
}
System.out.println("Warning: DBO not found id count: "+cnt);
System.out.println("NLPatterns.loadDboPredicate() : ok!");
}
/**
* Get predicate by its id
* @param predicateID
* @return
*/
public String getPredicateById (int predicateID) {
return id_2_predicate.get(predicateID);
}
/**
 * Load the paraphrase dictionary. Each line: predicate \t NL-pattern \t support \t space-separated word-selectivity scores.
 */
public void loadParaDict () throws Exception {
nlPattern_2_predicateList = new HashMap<String, ArrayList<PredicateIDAndSupport>>();
HashSet<String> missInDBP2014 = new HashSet<String>();
InputStreamReader in = null;
BufferedReader br = null;
try{
String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank;
File file = new File(inputFileName);
in = new InputStreamReader(new FileInputStream(file), "utf-8");
br = new BufferedReader(in);
String line = null;
int lineCount = 0;
//line = br.readLine();//read the first line which indicates the format
while ((line = br.readLine()) != null)
{
if (line.startsWith("#")) continue;
lineCount ++;
String[] content = line.split("\t");
if(!predicate_2_id.containsKey(content[0]))
{
missInDBP2014.add(content[0]);
continue;
}
int predicateID = predicate_2_id.get(content[0]);
String nlPattern = content[1].toLowerCase();
int support = Integer.parseInt(content[2]);
//double score = Double.parseDouble(content[3]);
String []slctString = content[3].split(" ");
double[] slct = new double[slctString.length];
for (int i=0; i < slct.length; i++) {
slct[i] = Double.parseDouble(slctString[i]);
}
if (!nlPattern_2_predicateList.containsKey(nlPattern)) {
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>());
}
nlPattern_2_predicateList.get(nlPattern).add(new PredicateIDAndSupport(predicateID, support, slct));
}
System.out.println("Number of NL-Patterns-to-predicate mappings = " + lineCount);
System.out.println("NLPatterns.size = " + nlPattern_2_predicateList.size());
System.out.println("Predicate.size = " + predicate_2_id.size());
System.out.println("Warning: Predicates not in DBpedia 2014 count: "+missInDBP2014.size());

// Notice: the predicate itself and handwritten patterns have no wordSelectivity.
addPredicateAsNLPattern(); // This is very important.
addHandwriteAsNLPattern();
Iterator<String> it = nlPattern_2_predicateList.keySet().iterator();
while (it.hasNext()) {
Collections.sort(nlPattern_2_predicateList.get(it.next()));
}
}catch(IOException e){
System.out.println("NLPatterns.Paradict() : IOException!");
}finally{
if(br!=null){
try{
br.close();
}catch(IOException e){
e.printStackTrace();
}
}
}
System.out.println("NLPatterns.Paradict() : ok!");
}
/**
* A set of very important NL patterns: the predicates themselves!
*/
public void addPredicateAsNLPattern () {
final int support = 200;
int predicate_id;
for (String p : predicate_2_id.keySet())
{
// TODO: Omit some bad relations (should be discarded in the future)
if(p.equals("state") || p.equals("states"))
continue;
predicate_id = predicate_2_id.get(p);
StringBuilder pattern = new StringBuilder("");
// Some predicates have a prefix (DBpedia 2015), e.g. Work/runtime 11, SpaceStation/volume 68; discard the prefix when generating the pattern
if(p.contains("/"))
{
if(p.charAt(0)>='A' && p.charAt(0)<='Z')
p = p.substring(p.indexOf("/")+1);
//gameW/l 1974
else
p = p.replace("/", "");
}
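// Split the camelCase predicate at every non-lowercase character and lowercase the pieces,
// e.g. "birthPlace" -> "birth place" (illustrative).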
int last = 0, i = 0;
for(i = 0; i < p.length(); i ++) {
// if it is not a lowercase letter, split here.
if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) {
pattern.append(p.substring(last, i).toLowerCase());
pattern.append(" ");
last = i;
}
}
pattern.append(p.substring(last, i).toLowerCase());
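// Repair over-splitting in the generated pattern, e.g. "runtime 1 1" -> "runtime 11",
// "i d" -> "id" and "d b" -> "db" (illustrative examples).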
for (i = 3; i < pattern.length(); i ++) {
// the blank between two digits should be deleted.
if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9'
&& pattern.charAt(i-1)==' '
&& pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') {
pattern.deleteCharAt(i-1);
}
// the blank between I and D should be deleted.
else if (pattern.charAt(i)=='d'
&& pattern.charAt(i-1)==' '
&& pattern.charAt(i-2)=='i'
&& pattern.charAt(i-3)==' ') {
pattern.deleteCharAt(i-1);
}
// the blank between D and B should be deleted.
else if (pattern.charAt(i)=='b'
&& pattern.charAt(i-1)==' '
&& pattern.charAt(i-2)=='d'
&& pattern.charAt(i-3)==' ') {
pattern.deleteCharAt(i-1);
}
}
// pattern -> base form
/*String[] ptns = pattern.toString().split(" ");
pattern = new StringBuilder("");
for (String s : ptns) {
pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s));
pattern.append(" ");
}
pattern.deleteCharAt(pattern.length()-1);
String patternString = pattern.toString();*/
// Special cases cannot use the base form, eg, foundingYear //TODO: maybe Porter's algorithm
String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString());
//System.out.println(p + "-->" + patternString);
if (!nlPattern_2_predicateList.containsKey(patternString)) {
nlPattern_2_predicateList.put(patternString, new ArrayList<PredicateIDAndSupport>());
}
nlPattern_2_predicateList.get(patternString).add(
new PredicateIDAndSupport(predicate_id,
support,
PredicateIDAndSupport.genSlct(patternString.split(" ").length)));
}
System.out.println("NLPatterns.addPredicateAsNLPattern(): ok!");
}
public void addHandwriteAsNLPattern() throws IOException {
String inputFileName = dbpedia_relation_paraphrases_handwrite;
InputStreamReader in = null;
BufferedReader br = null;
try{
File file = new File(inputFileName);
in = new InputStreamReader(new FileInputStream(file), "utf-8");
br = new BufferedReader(in);
String line = null;
//int lineCount = 0;
//line = br.readLine();//read the first line which indicates the format
while ((line = br.readLine()) != null) {
if (line.startsWith("#") || line.isEmpty()) continue;
//lineCount ++;
String[] content = line.split("\t");
if(!predicate_2_id.containsKey(content[0]))
continue;
int predicateID = predicate_2_id.get(content[0]);
String nlPattern = content[1].toLowerCase();
int support = Integer.parseInt(content[2]);
if (!nlPattern_2_predicateList.containsKey(nlPattern)) {
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>());
}
nlPattern_2_predicateList.get(nlPattern).add(
new PredicateIDAndSupport(predicateID,
support,
PredicateIDAndSupport.genSlct(nlPattern.split(" ").length)));
}
}catch(IOException e){
System.out.println("NLPatterns.addHandwriteAsNLPattern(): IOException!");
}finally{
if(br!=null){
try{
br.close();
}catch(IOException e){
e.printStackTrace();
}
}
}
System.out.println("NLPatterns.addHandwriteAsNLPattern(): ok!");
}

/**
* Show the NLPatterns
*/
public void showNLPatterns () {
/*for (String s: syntacticMarker) {
System.out.println(s);
}
GlobalTools.systemPause();*/
System.out.println("predicate-->id");
for (String s : predicate_2_id.keySet()) {
System.out.println(s + "-->" + predicate_2_id.get(s));
}
Globals.systemPause();
int count = 1;
System.out.println("nlPattern-->predicate<support>");
for (String p : nlPattern_2_predicateList.keySet()) {
System.out.print("" + (count++) + ".\t" + p + "\t[" + nlPattern_2_predicateList.get(p).size() + "]\t");
for (PredicateIDAndSupport i : nlPattern_2_predicateList.get(p)) {
System.out.print(id_2_predicate.get(i.predicateID) + "<" + i.support + ">" + ", ");
}
System.out.println();
}
}
/**
* Build the inverted index, where each word is mapped to the patterns in which it occurs
*/
public void buildInvertedIndex () {
invertedIndex = new HashMap<String, ArrayList<String>>();
// traversing all patterns
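// e.g. the pattern "birth place" is indexed under both "birth" and "place" (illustrative).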
for (String p : nlPattern_2_predicateList.keySet()) {
String[] tokens = p.split(" ");
for (String token : tokens) {
if (token.length() < 1) continue;
if (!invertedIndex.containsKey(token)) {
invertedIndex.put(token, new ArrayList<String>());
}
invertedIndex.get(token).add(p);
}
}
System.out.println("NLPatterns.buildInvertedIndex(): ok!");
}
public static void main (String[] args) {
Globals.coreNLP = new CoreNLP();
Globals.pd = new ParaphraseDictionary();
//Globals.pd.showNLPatterns();
}
}

+ 24
- 0
src/paradict/PredicateIDAndSupport.java View File

@@ -0,0 +1,24 @@
package paradict;

public class PredicateIDAndSupport implements Comparable<PredicateIDAndSupport> {
public int predicateID;
public int support;
public double[] wordSelectivity = null; // wordSelectivity helps make the ranking of PATTY patterns more accurate.
public PredicateIDAndSupport(int _pid, int _support, double[] _slct) {
predicateID = _pid;
support = _support;
wordSelectivity = _slct;
}

public int compareTo(PredicateIDAndSupport o) {
return o.support - this.support;
}

// only used for the predicate itself and handwritten paraphrases
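// e.g. genSlct(3) returns {1.0, 1.0, 1.0}, i.e. uniform selectivity for a 3-word pattern.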
public static double[] genSlct(int size) {
double[] ret = new double[size];
for (int i=0;i<size;i++) ret[i] = 1.0;
return ret;
}
}

+ 105
- 0
src/qa/Answer.java View File

@@ -0,0 +1,105 @@
package qa;

import java.util.ArrayList;


public class Answer implements Comparable<Answer>{
public String questionFocusKey=null;
public String questionFocusValue=null;
public ArrayList<String> otherInformationKey = null;
public ArrayList<String> otherInformationValue = null;
public Answer(String qf, String[] ans) {
otherInformationKey = new ArrayList<String>();
otherInformationValue = new ArrayList<String>();
int p1, p2;
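// Each answer line is assumed to look like "?x:<http://...>" or a quoted literal (illustrative);
// the value is taken between '<' and '>' or between quotes, falling back to everything after ':'.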
for (String line : ans) {
System.out.println("line=" + line);
if (line.startsWith(qf)) {
questionFocusKey = qf;
p1 = line.indexOf('<');
p2 = line.lastIndexOf('>');
String value = null;
if (p1 != -1 && p2 != -1) {
value = line.substring(p1+1, p2);
}
else {
p1 = line.indexOf('\"');
p2 = line.lastIndexOf('\"');
if(p1 != -1 && p2 != -1)
value = line.substring(p1+1, p2);
else
{
p1 = line.indexOf(':');
value = line.substring(p1+1);
}
}
questionFocusValue = value;
}
else {
p1 = line.indexOf(':');
String key = line.substring(0, p1);

p1 = line.indexOf('<');
p2 = line.lastIndexOf('>');
String value = null;
if (p1 != -1 && p2 != -1) {
value = line.substring(p1+1, p2);
}
else {
p1 = line.indexOf('\"');
p2 = line.lastIndexOf('\"');
if(p1 != -1 && p2 != -1)
value = line.substring(p1+1, p2);
else
{
p1 = line.indexOf(':');
value = line.substring(p1+1);
}
}
otherInformationKey.add(key);
otherInformationValue.add(value);
}
}
// Solve BUG: GStore returns messy characters in questionFocusKey
if (questionFocusKey==null || questionFocusValue==null)
{
questionFocusKey = qf;
String line = ans[0];
p1 = line.indexOf('<');
p2 = line.lastIndexOf('>');
String value = null;
if (p1 != -1 && p2 != -1) {
value = line.substring(p1+1, p2);
}
else {
p1 = line.indexOf('\"');
p2 = line.lastIndexOf('\"');
if(p1 != -1 && p2 != -1)
value = line.substring(p1+1, p2);
else
{
p1 = line.indexOf(':');
value = line.substring(p1+1);
}
}
questionFocusValue = value;
otherInformationKey.clear();
otherInformationValue.clear();
}
/*System.out.println("otherInformationKey.size=" + otherInformationKey.size());
for (String k : otherInformationKey) {
System.out.println("otherInfoKey = " + k);
}*/
}
public int compareTo (Answer p)
{
return questionFocusValue.compareTo(p.questionFocusValue);
}
}

+ 376
- 0
src/qa/GAnswer.java View File

@@ -0,0 +1,376 @@
package qa;

import java.io.*;
import java.net.Socket;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;

import jgsc.GstoreConnector;
import log.QueryLogger;
import nlp.ds.Sentence;
import nlp.ds.Sentence.SentenceType;
import qa.parsing.QuestionParsing;
import qa.parsing.BuildQueryGraph;
import rdf.Sparql;
import utils.FileUtil;
import addition.AddtionalFix;
import qa.Globals;

public class GAnswer {
public static final int MAX_SPQ_NUM = 3;
public static void init() {
System.out.println("gAnswer2 init ...");
Globals.init();
System.out.println("gAnswer2 init ... ok!");
}
public QueryLogger getSparqlList(String input)
{
QueryLogger qlog = null;
try
{
if (input.length() <= 5)
return null;
System.out.println("[Input:] "+input);
// step 0: Node (entity & type & literal) Recognition
long t0 = System.currentTimeMillis(), t, NRtime;
Query query = new Query(input);
qlog = new QueryLogger(query);
ArrayList<Sparql> rankedSparqls = new ArrayList<Sparql>();
NRtime = (int)(System.currentTimeMillis()-t0);
System.out.println("step0 [Node Recognition] : "+ NRtime +"ms");
// Try to solve each NR plan, and combine the ranked SPARQLs.
// We only keep the LOG of the BEST NR plan for convenience.
for(int i=query.sList.size()-1; i>=0; i--)
{
Sentence possibleSentence = query.sList.get(i);
qlog.reloadSentence(possibleSentence);
// qlog.isMaltParserUsed = true;
// LOG
System.out.println("transQ: "+qlog.s.plainText);
qlog.NRlog = query.preLog;
qlog.SQGlog = "Id: "+query.queryId+"\nQuery: "+query.NLQuestion+"\n";
qlog.SQGlog += qlog.NRlog;
qlog.timeTable.put("step0", (int)NRtime);
// step 1: question parsing (dependency tree, sentence type)
t = System.currentTimeMillis();
QuestionParsing step1 = new QuestionParsing();
step1.process(qlog);
qlog.timeTable.put("step1", (int)(System.currentTimeMillis()-t));
// step 2: build query graph (structure construction, relation extraction, top-k join)
t = System.currentTimeMillis();
BuildQueryGraph step2 = new BuildQueryGraph();
step2.process(qlog);
// step2.processEXP(qlog);
qlog.timeTable.put("step2", (int)(System.currentTimeMillis()-t));
// step 3: some fix (such as "one-node" or "ask-one-triple") and aggregation
t = System.currentTimeMillis();
AddtionalFix step3 = new AddtionalFix();
step3.process(qlog);
// Collect SPARQLs.
rankedSparqls.addAll(qlog.rankedSparqls);
qlog.timeTable.put("step3", (int)(System.currentTimeMillis()-t));
}

// deduplicate in SPARQL
for(Sparql spq: rankedSparqls)
spq.deduplicate();
// Sort (descending order).
Collections.sort(rankedSparqls);
qlog.rankedSparqls = rankedSparqls;
System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size());
// Detect question focus.
for (int i=0; i<qlog.rankedSparqls.size(); i++)
{
// First detect by SPARQLs.
Sparql spq = qlog.rankedSparqls.get(i);
String questionFocus = QuestionParsing.detectQuestionFocus(spq);
// If failed, use TARGET directly.
if(questionFocus == null)
questionFocus = "?"+qlog.target.originalForm;
spq.questionFocus = questionFocus;
}
return qlog;
}
catch (Exception e) {
e.printStackTrace();
return qlog;
}
}
public String getStdSparqlWoPrefix(QueryLogger qlog, Sparql curSpq)
{
if(qlog == null || curSpq == null)
return null;
String res = "";
if (qlog.s.sentenceType==SentenceType.GeneralQuestion)
res += "ask where";
else
{
if(!curSpq.countTarget)
res += ("select DISTINCT " + curSpq.questionFocus + " where");
else
res += ("select COUNT(DISTINCT " + curSpq.questionFocus + ") where");
}
res += "\n";
res += curSpq.toStringForGStore();
if(curSpq.moreThanStr != null)
{
res += curSpq.moreThanStr+"\n";
}
if(curSpq.mostStr != null)
{
res += curSpq.mostStr+"\n";
}
return res;
}
// Notice, this will change the original SPARQL.
public Sparql getUntypedSparql (Sparql spq)
{
if(spq == null)
return null;
spq.removeAllTypeInfo();
if (spq.tripleList.size() == 0) return null;
return spq;
}

/**
* Get answers from Virtuoso + DBpedia; this function requires the OLD version of Virtuoso + the Virtuoso Handler.
* Virtuoso can handle "Aggregation".
**/
// public Matches getAnswerFromVirtuoso (QueryLogger qlog, Sparql spq)
// {
// Matches ret = new Matches();
// try
// {
// Socket socket = new Socket(Globals.QueryEngineIP, 1112);
// DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream()));
//
// //formatting SPARQL & evaluate
// String formatedSpq = spq.toStringForVirtuoso();
// dos.writeUTF(formatedSpq);
// dos.flush();
// System.out.println("STD SPARQL:\n"+formatedSpq+"\n");
//
// ArrayList<String> rawLines = new ArrayList<String>();
// DataInputStream dis = new DataInputStream(new BufferedInputStream(socket.getInputStream()));
// while (true)
// {
// String line = dis.readUTF();
// if (line.equals("[[finish]]")) break;
// rawLines.add(line);
// }
//
// // The ASK query was translated to a SELECT query, whose answers need translation.
// // There is no need to translate; use "ASK WHERE" directly! 2018-12-11
// if(qlog.s.sentenceType == SentenceType.GeneralQuestion)
// {
// ret.answersNum = 1;
// ret.answers = new String[1][1];
// if(rawLines.size() == 0)
// {
// ret.answers[0][0] = "general:false";
// }
// else
// {
// ret.answers[0][0] = "general:true";
// }
// System.out.println("general question answer:" + ret.answers[0][0]);
// dos.close();
// dis.close();
// socket.close();
// return ret;
// }
//
// //select but no results
// if (rawLines.size() == 0)
// {
// ret.answersNum = 0;
// dos.close();
// dis.close();
// socket.close();
// return ret;
// }
//
// int ansNum = rawLines.size();
// int varNum = variables.size();
// ArrayList<String> valist = new ArrayList<String>(variables);
// ret.answers = new String[ansNum][varNum];
//
// System.out.println("ansNum=" + ansNum);
// System.out.println("varNum=" + varNum);
// for (int i=0;i<rawLines.size();i++)
// {
// String[] ansLineContents = rawLines.get(i).split("\t");
// for (int j=0;j<varNum;j++)
// {
// ret.answers[i][j] = valist.get(j) + ":" + ansLineContents[j];
// }
// }
//
// dos.close();
// dis.close();
// socket.close();
// }
// catch (Exception e) {
// e.printStackTrace();
// }
//
// return ret;
// }
public Matches getAnswerFromGStore2 (Sparql spq)
{
// modified by Lin Yinnian using ghttp - 2018-9-28
GstoreConnector gc = new GstoreConnector("172.31.222.90", 9001);
String answer = gc.query("root", "123456", "dbpedia16", spq.toStringForGStore2());
System.out.println(answer);
String[] rawLines = answer.split("\n");
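// The parsing below assumes: line 0 holds the tab-separated variable names,
// and each following line holds one tab-separated answer row.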
Matches ret = new Matches();
if (rawLines.length == 0 || rawLines[0].equals("[empty result]"))
{
ret.answersNum = 0;
return ret;
}
int ansNum = rawLines.length-1;
String[] varLineContents = rawLines[0].split("\t");
int varNum = varLineContents.length;
ret.answers = new String[ansNum][varNum];
System.out.println("ansNum=" + ansNum);
System.out.println("varNum=" + varNum);
System.out.println("rawLines.length=" + rawLines.length);
for (int i=1;i<rawLines.length;i++)
{
// if an answer in rawAnswer contains '\n', it may lead to errors, so we just break.
if(i-1 >= ansNum)
break;
String[] ansLineContents = rawLines[i].split("\t");
for (int j=0;j<varNum;j++)
{
ret.answers[i-1][j] = varLineContents[j] + ":" + ansLineContents[j];
}
}
return ret;
}
public static void main (String[] args)
{
Globals.init();
GAnswer ga = new GAnswer();
int i =1;
// file input/output
List<String> inputList = FileUtil.readFile("E:/Linyinnian/qald6_special.txt");
for(String input: inputList)
{
ArrayList<String> outputs = new ArrayList<String>();
ArrayList<String> spqs = new ArrayList<String>();
spqs.add("id:"+String.valueOf(i));
i++;
long parsing_st_time = System.currentTimeMillis();
QueryLogger qlog = ga.getSparqlList(input);
if(qlog == null || qlog.rankedSparqls == null)
continue;
long parsing_ed_time = System.currentTimeMillis();
System.out.println("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms");
System.out.println("TripleCheck time: "+ qlog.timeTable.get("TripleCheck") + "ms");
System.out.println("SparqlCheck time: "+ qlog.timeTable.get("SparqlCheck") + "ms");
System.out.println("Ranked Sparqls: " + qlog.rankedSparqls.size());
outputs.add(qlog.SQGlog);
outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms");
outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms");
outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms");
long excuting_st_time = System.currentTimeMillis();
Matches m = null;
System.out.println("[RESULT]");
ArrayList<String> lastSpqList = new ArrayList<String>();
int idx;
// Consider top-5 SPARQLs
for(idx=1; idx<=Math.min(qlog.rankedSparqls.size(), 5); idx++)
{
Sparql curSpq = qlog.rankedSparqls.get(idx-1);
String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, curSpq);
lastSpqList.add(stdSPQwoPrefix);
System.out.println("[" + idx + "]" + "score=" + curSpq.score);
System.out.println(stdSPQwoPrefix);

// Print top-3 SPARQLs to file.
if(idx <= MAX_SPQ_NUM)
// spqs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix);
outputs.add("[" + idx + "]" + "score=" + curSpq.score + "\n" + stdSPQwoPrefix);
// Execute by Virtuoso or GStore when answers are not found
if(m == null || m.answers == null)
{
if (curSpq.tripleList.size()>0 && curSpq.questionFocus!=null)
{
// if(ga.isBGP(qlog, curSpq))
m = ga.getAnswerFromGStore2(curSpq);
// else
// m = ga.getAnswerFromVirtuoso(qlog, curSpq);
}
if (m != null && m.answers != null)
{
// Results found using the current SPQ; we can stop and print them.
qlog.sparql = curSpq;
qlog.match = m;
qlog.reviseAnswers();
System.out.println("Query Executing time: "+ (int)(System.currentTimeMillis() - excuting_st_time)+ "ms");
}
}
}
// Some TYPEs can be omitted (such as <type> <yago:Wife>)
if(!qlog.rankedSparqls.isEmpty())
{
Sparql untypedSparql = ga.getUntypedSparql(qlog.rankedSparqls.get(0));
if(untypedSparql != null)
{
String stdSPQwoPrefix = ga.getStdSparqlWoPrefix(qlog, untypedSparql);
if(!lastSpqList.contains(stdSPQwoPrefix))
// spqs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n");
outputs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n");
}
}
outputs.add(qlog.match.toString());
FileUtil.writeFile(outputs, "E:/Linyinnian/qald6_special_out.txt", true);
}
}
}

+ 118
- 0
src/qa/Globals.java View File

@@ -0,0 +1,118 @@
package qa;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import lcn.EntityFragmentFields;
import fgmt.RelationFragment;
import fgmt.TypeFragment;
import paradict.ParaphraseDictionary;
import qa.mapping.DBpediaLookup;
import nlp.tool.NERecognizer;
import nlp.tool.CoreNLP;
import nlp.tool.MaltParser;
import nlp.tool.StanfordParser;
import nlp.tool.StopWordsList;

public class Globals {
// nlp tools
public static CoreNLP coreNLP;
public static StanfordParser stanfordParser;
public static StopWordsList stopWordsList;
public static MaltParser maltParser;
public static NERecognizer nerRecognizer;
// relation paraphrase dictionary
public static ParaphraseDictionary pd;
// entity linking system
public static DBpediaLookup dblk;
public static int MaxAnswerNum = 100;
/*
* evaluationMethod:
* 1. baseline (SQG): does not allow CIRCLE or WRONG edges. The structure may differ depending on the chosen TARGET.
* 2. super SQG: allows CIRCLE and WRONG edges. The structure is decided by the dependency (DS) tree, and can be changed in the query evaluation (TOP-K match) stage.
* */
public static int evaluationMethod = 2;
public static boolean isRunAsWebServer = false; // Run Local: false; Run Server: true
public static String runningBenchmark = "QALD"; // WQ:WebQuestions; WQSP:WebQuestionsSP; CQ:ComplexQuestions
// using different method and Freebase Version (in Virtuoso.java)
public static boolean usingOperationCondition = false; // only for EXP: try state transition operations only when the conditions are satisfied.

public static String localPath = "/media/wip/husen/NBgAnswer/";
public static String QueryEngineIP = "127.0.0.1"; // Notice, PORT number is in the evaluation function.
public static void init ()
{
System.out.println("====== gAnswer2.0 over DBpedia ======");
if(isRunAsWebServer == false)
{
localPath = "D:/husen/gAnswer/";
QueryEngineIP = "172.31.222.72";
}

long t1, t2, t3, t4, t5, t6, t7, t8, t9;
t1 = System.currentTimeMillis();
coreNLP = new CoreNLP();
t2 = System.currentTimeMillis();
stanfordParser = new StanfordParser();
t3 = System.currentTimeMillis();
maltParser = new MaltParser();
t4 = System.currentTimeMillis();
nerRecognizer = new NERecognizer();
t5 = System.currentTimeMillis();
stopWordsList = new StopWordsList();
t6 = System.currentTimeMillis();
pd = new ParaphraseDictionary();
t7 = System.currentTimeMillis();
try
{
EntityFragmentFields.load();
RelationFragment.load();
TypeFragment.load();
}
catch (Exception e1) {
System.out.println("EntityIDs and RelationFragment and TypeFragment loading error!");
e1.printStackTrace();
}
t8 = System.currentTimeMillis();
dblk = new DBpediaLookup();
t9 = System.currentTimeMillis();
System.out.println("======Initialization======");
System.out.println("CoreNLP(Lemma): " + (t2-t1) + "ms.");
System.out.println("StanfordParser: " + (t3-t2) + "ms.");
System.out.println("MaltParser: " + (t4-t3) + "ms.");
System.out.println("NERecognizer: " + (t5-t4) + "ms.");
System.out.println("StopWordsList: " + (t6-t5) + "ms.");
System.out.println("ParaphraseDict & posTagPattern: " + (t7-t6) + "ms.");
System.out.println("GraphFragments: " + (t8-t7) + "ms.");
System.out.println("DBpediaLookup: " + (t9-t8) + "ms.");
System.out.println("* Total *: " + (t9-t1) + "ms.");
System.out.println("==========================");
}

/**
* Use as system("pause") in C
*/
public static void systemPause () {
System.out.println("System pause ...");
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
try {
br.readLine();
} catch (IOException e) {
e.printStackTrace();
}
}
}

+ 9
- 0
src/qa/Matches.java View File

@@ -0,0 +1,9 @@
package qa;

public class Matches {
public String[][] answers = null;
public int answersNum = 0;
public long time = 0;
public static final int pageNum = 3000;
}

+ 128
- 0
src/qa/Query.java View File

@@ -0,0 +1,128 @@
package qa;

import java.util.ArrayList;

import nlp.ds.Sentence;
import qa.extract.EntityRecognition;
import rdf.MergedWord;

/**
* 1. preprocessing of question
* 2. Node Recognition
* @author husen
*/
public class Query
{
public String NLQuestion = null;
public String TransferedQuestion = null;
public ArrayList<String> MergedQuestionList = null;
public ArrayList<Sentence> sList = null;
public String queryId = null;
public String preLog = "";
public ArrayList<MergedWord> mWordList = null;
public Query(){}
public Query(String _question)
{
NLQuestion = _question;
NLQuestion = removeQueryId(NLQuestion);
TransferedQuestion = getTransferedQuestion(NLQuestion);
// step1. NODE Recognition
MergedQuestionList = getMergedQuestionList(TransferedQuestion);
// build Sentence
sList = new ArrayList<Sentence>();
for(String mergedQuestion: MergedQuestionList)
{
Sentence sentence = new Sentence(this, mergedQuestion);
sList.add(sentence);
}
}
public boolean isDigit(char ch)
{
if(ch>='0' && ch<='9')
return true;
return false;
}
public boolean isUpperWord(char ch)
{
if(ch>='A' && ch<='Z')
return true;
return false;
}
/**
* Map some words to equivalent words.
* 1. The Stanford parser often parses certain words incorrectly.
* 2. Unify synonyms. eg, movie -> film
* @param question
* @return transferred question
*/
public String getTransferedQuestion(String question)
{
//rule 1: discard ".", because "." and "_" will be split apart by the parser. Also discard a word's trailing "'", which may pollute NER
question = question.replace("' ", " ");
String [] words = question.split(" ");
String ret = "";
for(String word: words)
{
String retWord = word;
//TODO: now just check NUM in head/tail
if(word.length()>=2 && !isDigit(word.charAt(0)) && !isDigit(word.charAt(word.length()-1)))
{
retWord = retWord.replace(".", "");
}
ret += retWord + " ";
}
if(ret.length()>1)
ret = ret.substring(0,ret.length()-1);
ret = ret.replace("-", " ");
ret = ret.replace("in america", "in United States");
//rule2: as well as -> and
ret = ret.replace("as well as", "and");
//rule3: movie -> film
ret = ret.replace(" movie", " film");
ret = ret.replace(" movies", " films");
return ret;
}
/**
* Recognize entities & types & literals in the KB and replace " " in phrases with "_"
* @param question
* @return merged question list
*/
public ArrayList<String> getMergedQuestionList(String question)
{
ArrayList<String> mergedQuestionList = null;
//entity & type recognize
EntityRecognition er = new EntityRecognition();
mergedQuestionList = er.process(question);
preLog = er.preLog;
mWordList = er.mWordList;

return mergedQuestionList;
}
public String removeQueryId(String question)
{
String ret = question;
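// e.g. "42\tWho created Wikipedia?" yields queryId = "42" and returns "Who created Wikipedia?" (illustrative).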
int st = question.indexOf("\t");
if(st!=-1 && question.length()>1 && question.charAt(0)>='0' && question.charAt(0)<='9')
{
queryId = question.substring(0,st);
ret = question.substring(st+1);
System.out.println("Extract QueryId :"+queryId);
}
return ret;
}
}

+ 153
- 0
src/qa/extract/CorefResolution.java View File

@@ -0,0 +1,153 @@
package qa.extract;

import java.util.ArrayList;
import java.util.HashSet;

import qa.Globals;

import log.QueryLogger;

import nlp.ds.DependencyTree;
import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;
import rdf.SimpleRelation;

public class CorefResolution {
/**
* 1. a very simple reference resolution
* 2. Coref resolution should be done after relation extraction and before item mapping
*/
public void process(ArrayList<SimpleRelation> simpleRelations, QueryLogger qlog) {
if (qlog.s.words.length <= 4) return; // if the sentence is too short, skip the coref step.
System.out.println("=====Co-reference resolution=======");
ArrayList<SimpleRelation> deleteList = new ArrayList<SimpleRelation>();
for(SimpleRelation sr : simpleRelations) {
Word w1=null, w2=null;
if (sr.extractingMethod == 'S') {
w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog);
w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeStanford, qlog);
}
else if (sr.extractingMethod == 'M') {
w1 = getRefWord(sr.arg1Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog);
w2 = getRefWord(sr.arg2Word.getNnHead(), qlog.s.dependencyTreeMalt, qlog);
}
else {
continue;
}
if (w1 != null) {
sr.arg1Word_beforeCRR = sr.arg1Word;
sr.arg1Word = w1;
}
if (w2 != null) {
sr.arg2Word_beforeCRR = sr.arg2Word;
sr.arg2Word = w2;
}
if (sr.arg1Word == sr.arg2Word)
deleteList.add(sr);
}
simpleRelations.removeAll(deleteList);
printCRR(qlog);
System.out.println("===================================");
}

// return the reference word of w
public Word getRefWord (Word w, DependencyTree dt, QueryLogger qlog) {
w = w.getNnHead();
if (w.crr != null) {
return w.crr;
}
/*
* method: (suitable for stanford parser (old version))
* (1) WDT --det--> [] eg: Which city is located in China?
* (2) WDT -------> V/J --rcmod--> [] eg: Who is married to someone that was born in Rome?
* "when is the sth" is conflict with this rule, so discarded. (3) W -------> be <------- [] eg: Who is the author of WikiLeaks?
* (4) WDT -------> V --ccomp--> [] eg: The actor that married the child of a politician.
* (5) DT(that, which) --dep--> V eg:The actors that married an athlete. // DS parser error.
* (6) W(position=1) ------> NN eg:What are the language used in China? // DS parser error, should eliminate "WRB":When was Carlo Giuliani shot?
* (7) where <--advmod-- V <--advcl-- V --prep/pobj--> [] eg: Who graduate from the school where Keqiang Li graduates?
*/

DependencyTreeNode dtn = dt.getNodeByIndex(w.position);
// no need for root
if (dtn.father == null) return null;
try {
if(dtn.word.posTag.equals("WDT") && dtn.dep_father2child.equals("det")) { // (1)
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead());
w.crr = dtn.father.word.getNnHead();
}
else if(dtn.word.posTag.startsWith("W") && !dtn.word.posTag.equals("WRB") && dtn.word.position == 1 && dtn.father.word.posTag.equals("NN")) { // (6)
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead());
w.crr = dtn.father.word.getNnHead();
}
else if(dtn.word.posTag.equals("DT")
&& dtn.dep_father2child.equals("dep")
&& (dtn.word.baseForm.equals("that")||dtn.word.baseForm.equals("which"))) { // (5)
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.word.getNnHead());
w.crr = dtn.father.word.getNnHead();
}
// else if(dtn.word.posTag.startsWith("W")
// && dtn.father.word.baseForm.equals("be")) { // (3) //&& dtn.dep_father2child.equals("attr")
// DependencyTreeNode target = dtn.father.containDependencyWithChildren("nsubj");
// if (target != null) {
// if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead());
// w.crr = target.word.getNnHead();
// }
// }
else if(dtn.word.posTag.equals("WDT")
&& (dtn.father.word.posTag.startsWith("V") || dtn.father.word.posTag.startsWith("J"))
&& dtn.father.dep_father2child.equals("rcmod")) { // (2)
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead());
w.crr = dtn.father.father.word.getNnHead();
}
else if(dtn.word.posTag.equals("WDT")
&& dtn.father.word.posTag.startsWith("V")
&& dtn.father.dep_father2child.equals("ccomp")) { // (4)
if(qlog.MODE_debug) System.out.println(w + "-->" + dtn.father.father.word.getNnHead());
w.crr = dtn.father.father.word.getNnHead();
}
else if (dtn.word.baseForm.equals("where")
&& dtn.dep_father2child.equals("advmod")
&& dtn.father.dep_father2child.equals("advcl")) { // (7)
DependencyTreeNode target = dtn.father.father.containDependencyWithChildren("prep");
if (target != null) {
target = target.containDependencyWithChildren("pobj");
}
else {
for (DependencyTreeNode n : dtn.father.father.childrenList) {
if (Globals.pd.relns_object.contains(n.dep_father2child)) {
target = n;
}
}
}
if (target != null) {
if(qlog.MODE_debug) System.out.println(w + "-->" + target.word.getNnHead());
w.crr = target.word.getNnHead();
}
}
} catch (Exception e) {}
return w.crr;
}
public void printCRR (QueryLogger qlog) {
HashSet<Word> printed = new HashSet<Word>();
for (Word w : qlog.s.words) {
w = w.getNnHead();
if (printed.contains(w))
continue;
if (w.crr != null)
System.out.println("\""+w.getFullEntityName() + "\" is resoluted to \"" + w.crr.getFullEntityName() + "\"");
printed.add(w);
}
}
}

+ 918
- 0
src/qa/extract/EntityRecognition.java View File

@@ -0,0 +1,918 @@
package qa.extract;

import java.io.BufferedReader;
//import java.io.File;
//import java.io.FileInputStream;
//import java.io.FileNotFoundException;
//import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
//import java.io.OutputStreamWriter;
//import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;

import lcn.EntityFragmentFields;
import fgmt.EntityFragment;
import nlp.ds.Word;
import qa.Globals;
import rdf.EntityMapping;
import rdf.NodeSelectedWithScore;
import rdf.TypeMapping;
import rdf.MergedWord;
import utils.FileUtil;
import addition.*;

/**
* Core class of Node Recognition
* @author husen
*/
public class EntityRecognition {
public String preLog = "";
public String stopEntFilePath = Globals.localPath + "data/DBpedia2016/parapharse/stopEntDict.txt";
double EntAcceptedScore = 26;
double TypeAcceptedScore = 0.5;
double AcceptedDiffScore = 1;
public HashMap<String, String> m2e = null;
public ArrayList<MergedWord> mWordList = null;
public ArrayList<String> stopEntList = null;
public ArrayList<String> badTagListForEntAndType = null;
ArrayList<ArrayList<Integer>> selectedList = null;
TypeRecognition tr = null;
AddtionalFix af = null;
public EntityRecognition()
{
// LOG
preLog = "";
loadStopEntityDict();
// Bad posTag for entity
badTagListForEntAndType = new ArrayList<String>();
badTagListForEntAndType.add("RBS");
badTagListForEntAndType.add("JJS");
badTagListForEntAndType.add("W");
badTagListForEntAndType.add(".");
badTagListForEntAndType.add("VBD");
badTagListForEntAndType.add("VBN");
badTagListForEntAndType.add("VBZ");
badTagListForEntAndType.add("VBP");
badTagListForEntAndType.add("POS");
// ! Handwritten entity linking (lower case)
m2e = new HashMap<String, String>();
m2e.put("bipolar_syndrome", "Bipolar_disorder");
m2e.put("battle_in_1836_in_san_antonio", "Battle_of_San_Jacinto");
m2e.put("federal_minister_of_finance_in_germany", "Federal_Ministry_of_Finance_(Germany)");
// Additional fix for CATEGORY (in DBpedia)
af = new AddtionalFix();
tr = new TypeRecognition();
System.out.println("EntityRecognizer Initial : ok!");
}
public void loadStopEntityDict()
{
stopEntList = new ArrayList<String>();
try
{
List<String> inputs = FileUtil.readFile(stopEntFilePath);
for(String line: inputs)
{
if(line.startsWith("#"))
continue;
stopEntList.add(line);
}
}
catch (Exception e) {
e.printStackTrace();
}
}
public ArrayList<String> process(String question)
{
ArrayList<String> fixedQuestionList = new ArrayList<String>();
ArrayList<Integer> literalList = new ArrayList<Integer>();
HashMap<Integer, Double> entityScores = new HashMap<Integer, Double>();
HashMap<Integer, Integer> entityMappings = new HashMap<Integer, Integer>();
HashMap<Integer, Double> typeScores = new HashMap<Integer, Double>();
HashMap<Integer, String> typeMappings = new HashMap<Integer, String>();
HashMap<Integer, Double> mappingScores = new HashMap<Integer, Double>();
ArrayList<Integer> mustSelectedList = new ArrayList<Integer>();
System.out.println("--------- entity/type recognition start ---------");
Word[] words = Globals.coreNLP.getTaggedWords(question);
mWordList = new ArrayList<MergedWord>();
long t1 = System.currentTimeMillis();
int checkEntCnt = 0, checkTypeCnt = 0, hitEntCnt = 0, hitTypeCnt = 0, allCnt = 0;
boolean needRemoveCommas = false;
// Check entity & type
// Notice, ascending order by length
StringBuilder tmpOW = new StringBuilder();
StringBuilder tmpBW = new StringBuilder();
for(int len=1; len<=words.length; len++)
{
for(int st=0,ed=st+len; ed<=words.length; st++,ed++)
{
String originalWord = "", baseWord = "", allUpperWord = "";
//String[] posTagArr = new String[len];
for(int j=st; j<ed; j++)
{
//posTagArr[j-st] = words[j].posTag;
//originalWord += words[j].originalForm;
//baseWord += words[j].baseForm;
tmpOW.append(words[j].originalForm);
tmpBW.append(words[j].baseForm);
String tmp = words[j].originalForm;
if(tmp.length()>0 && tmp.charAt(0) >='a' && tmp.charAt(0)<='z')
{
String pre = tmp.substring(0,1).toUpperCase();
tmp = pre + tmp.substring(1);
}
allUpperWord += tmp;
if(j < ed-1)
{
//originalWord += "_";
//baseWord += "_";
tmpOW.append("_");
tmpBW.append("_");
}
}
originalWord = tmpOW.toString();
baseWord=tmpBW.toString();
tmpOW.setLength(0);
tmpBW.setLength(0);
allCnt++;
/*
* Filters to save time and drop some bad cases.
*/
boolean entOmit = false, typeOmit = false;
int prep_cnt=0;
// Uppercase words can pass the filter. eg: "Melbourne , Florida"
int UpperWordCnt = 0;
for(int i=st;i<ed;i++)
if((words[i].originalForm.charAt(0)>='A' && words[i].originalForm.charAt(0)<='Z')
|| ((words[i].posTag.equals(",") || words[i].originalForm.equals("'")) && i>st && i<ed-1))
UpperWordCnt++;
// Filters
if(UpperWordCnt<len || st==0)
{
if(st==0)
{
if(!words[st].posTag.startsWith("DT") && !words[st].posTag.startsWith("N"))
{
entOmit = true;
typeOmit = true;
}
}
else if(st>0)
{
Word formerWord = words[st-1];
//as princess
if(formerWord.baseForm.equals("as"))
entOmit = true;
//how many dogs?
if(formerWord.baseForm.equals("many"))
entOmit = true;
//obama's daughter ; your height | len=1 to avoid: Asimov's Foundation series
if(len == 1 && (formerWord.posTag.startsWith("POS") || formerWord.posTag.startsWith("PRP")))
entOmit = true;
//the father of you
if(ed<words.length)
{
Word nextWord = words[ed];
if(formerWord.posTag.equals("DT") && nextWord.posTag.equals("IN"))
entOmit = true;
}
//the area code of ; the official language of
boolean flag1=false, flag2=false;
for(int i=0;i<=st;i++)
if(words[i].posTag.equals("DT"))
flag1 = true;
for(int i=ed-1;i<words.length;i++)
if(words[i].posTag.equals("IN"))
flag2 = true;
if(flag1 && flag2)
entOmit = true;
}
if(ed < words.length)
{
Word nextWord = words[ed];
// (lowerCase)+(UpperCase)
if(nextWord.originalForm.charAt(0)>='A' && nextWord.originalForm.charAt(0)<='Z')
entOmit = true;
}
for(int i=st;i<ed;i++)
{
if(words[i].posTag.startsWith("I"))
prep_cnt++;
for(String badTag: badTagListForEntAndType)
{
if(words[i].posTag.startsWith(badTag))
{
entOmit = true;
typeOmit = true;
break;
}
}
if(words[i].posTag.startsWith("P") && (i!=ed-1 || len==1)){
entOmit = true;
typeOmit = true;
}
// First word
if(i==st)
{
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("EX") || words[i].posTag.startsWith("TO"))
{
entOmit = true;
typeOmit = true;
}
if(words[i].posTag.startsWith("D") && len==2){
entOmit = true;
typeOmit = true;
}
if(words[i].baseForm.startsWith("list") || words[i].baseForm.startsWith("many"))
{
entOmit = true;
typeOmit = true;
}
if(words[i].baseForm.equals("and"))
{
entOmit = true;
typeOmit = true;
}
}
// Last word.
if(i==ed-1)
{
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("D") || words[i].posTag.startsWith("TO"))
{
entOmit = true;
typeOmit = true;
}
if(words[i].baseForm.equals("and"))
{
entOmit = true;
typeOmit = true;
}
}
// Single word.
if(len==1)
{
//TODO: Omit general noun. eg: father, book ...
if(!words[i].posTag.startsWith("N"))
{
entOmit = true;
typeOmit = true;
}
}
}
// Too many prepositions.
if(prep_cnt >= 3)
{
entOmit = true;
typeOmit = true;
}
}
/*
* Filter done.
*/
// Search category | highest priority
String category = null;
if(af.pattern2category.containsKey(baseWord))
{
typeOmit = true;
entOmit = true;
category = af.pattern2category.get(baseWord);
}
// Search type
int hitMethod = 0; // 1=dbo(baseWord), 2=dbo(originalWord), 3=yago|extend()
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>();
if(!typeOmit)
{
System.out.println("Type Check: "+originalWord);
//checkTypeCnt++;
//search standard type
tmList = tr.getTypeIDsAndNamesByStr(baseWord);
if(tmList == null || tmList.size() == 0)
{
tmList = tr.getTypeIDsAndNamesByStr(originalWord);
if(tmList != null && tmList.size()>0)
hitMethod = 2;
}
else
hitMethod = 1;
//Search extended types (YAGO types)
if(tmList == null || tmList.size() == 0)
{
tmList = tr.getExtendTypeByStr(allUpperWord);
if(tmList != null && tmList.size() > 0)
{
preLog += "++++ Extend Type detect: "+baseWord+": "+" prefferd relaiton:"+tmList.get(0).prefferdRelation+"\n";
hitMethod = 3;
}
}
}
// Search entity
ArrayList<EntityMapping> emList = new ArrayList<EntityMapping>();
if(!entOmit && !stopEntList.contains(baseWord))
{
System.out.println("Ent Check: "+originalWord);
checkEntCnt++;
// Notice, the second parameter is whether to use DBpedia Lookup.
emList = getEntityIDsAndNamesByStr(originalWord, (UpperWordCnt>=len-1 || len==1),len);
if(emList == null || emList.size() == 0)
{
emList = getEntityIDsAndNamesByStr(baseWord, (UpperWordCnt>=len-1 || len==1), len);
}
if(emList!=null && emList.size()>10)
{
ArrayList<EntityMapping> tmpList = new ArrayList<EntityMapping>();
for(int i=0;i<10;i++)
{
tmpList.add(emList.get(i));
}
emList = tmpList;
}
}
MergedWord mWord = new MergedWord(st,ed,originalWord);
// Add category
if(category != null)
{
mWord.mayCategory = true;
mWord.category = category;
int key = st*(words.length+1) + ed;
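// The span [st, ed) is packed into a single int key; elsewhere it is decoded as
// st = key/(words.length+1), ed = key%(words.length+1).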
mustSelectedList.add(key);
}
// Add literal
if(len==1 && checkLiteralWord(words[st]))
{
mWord.mayLiteral = true;
int key = st*(words.length+1) + ed;
literalList.add(key);
}
// Add type mappings
if(tmList!=null && tmList.size()>0)
{
// Drop by score threshold
if(tmList.get(0).score < TypeAcceptedScore)
typeOmit = true;

// Only allow EXACT MATCH when method=1|2
// TODO: consider approximate match and taxonomy. eg, actor->person
String likelyType = tmList.get(0).typeName.toLowerCase();
String candidateBase = baseWord.replace("_", ""), candidateOriginal = originalWord.replace("_", "").toLowerCase();
if(!candidateBase.equals(likelyType) && hitMethod == 1)
typeOmit = true;
if(!candidateOriginal.equals(likelyType) && hitMethod == 2)
typeOmit = true;
if(!typeOmit)
{
mWord.mayType = true;
mWord.tmList = tmList;
int key = st*(words.length+1) + ed;
typeMappings.put(key, tmList.get(0).typeName);
typeScores.put(key, tmList.get(0).score);
}
}
// Add entity mappings
if(emList!=null && emList.size()>0)
{
// Drop by score threshold
if(emList.get(0).score < EntAcceptedScore)
entOmit = true;
// Drop: the [German Shepherd] dog
else if(len > 2)
{
for(int key: entityMappings.keySet())
{
//int te=key%(words.length+1);
int ts=key/(words.length+1);
if(ts == st+1 && ts <= ed)
{
//DT in lowercase (allow uppercase, such as: [The Pillars of the Earth])
if(words[st].posTag.startsWith("DT") && !(words[st].originalForm.charAt(0)>='A'&&words[st].originalForm.charAt(0)<='Z'))
{
entOmit = true;
}
}
}
}
// Record info in merged word
if(!entOmit)
{
mWord.mayEnt = true;
mWord.emList = emList;
// used to remove duplicates and to select
int key = st*(words.length+1) + ed;
entityMappings.put(key, emList.get(0).entityID);
// fix entity score | conflict resolution
double score = emList.get(0).score;
String likelyEnt = emList.get(0).entityName.toLowerCase().replace(" ", "_");
String lowerOriginalWord = originalWord.toLowerCase();
// !Award: whole match
if(likelyEnt.equals(lowerOriginalWord))
score *= len;
// !Award: COVER (eg, Robert Kennedy: [Robert] [Kennedy] [Robert Kennedy])
//For phrases like Social_Democratic_Party, any combination of the three words is an entity, which yields too many plans; compared with "which one to pick on conflict", "to merge or not to merge" matters more (and most real errors are merge/no-merge errors), so the smaller covered entities are simply discarded here
//For phrases like Abraham_Lincoln, the "no-merge" plan recognizes them as two nodes whose final score beats that of the correct answer, so such phrases are marked as must-select
if(len>1)
{
boolean[] flag = new boolean[words.length+1];
ArrayList<Integer> needlessEntList = new ArrayList<Integer>();
double tmpScore=0;
for(int preKey: entityMappings.keySet())
{
if(preKey == key)
continue;
int te=preKey%(words.length+1),ts=preKey/(words.length+1);
for(int i=ts;i<te;i++)
flag[i] = true;
if(st<=ts && ed>= te)
{
needlessEntList.add(preKey);
tmpScore += entityScores.get(preKey);
}
}
int hitCnt = 0;
for(int i=st;i<ed;i++)
if(flag[i])
hitCnt++;
// WHOLE match || HIGH match & HIGH upper || WHOLE upper
if(hitCnt == len || ((double)hitCnt/(double)len > 0.6 && (double)UpperWordCnt/(double)len > 0.6) || UpperWordCnt == len || len>=4)
{
//If the phrase contains a comma, the words on both sides must appear in the mapped entity
//e.g. Melbourne_,_Florida: Melbourne, Florida must be selected, while California_,_USA: Malibu, California is not necessarily correct
boolean commaTotalRight = true;
if(originalWord.contains(","))
{
String candidateCompactString = originalWord.replace(",","").replace("_", "").toLowerCase();
String likelyCompactEnt = likelyEnt.replace(",","").replace("_", "");
if(!candidateCompactString.equals(likelyCompactEnt))
commaTotalRight = false;
else
{
mWord.name = mWord.name.replace("_,_","_");
needRemoveCommas = true;
}
}
if(commaTotalRight)
{
mustSelectedList.add(key);
if(tmpScore>score)
score = tmpScore+1;
for(int preKey: needlessEntList)
{
entityMappings.remove(preKey);
mustSelectedList.remove(Integer.valueOf(preKey));
}
}
}
}
//NOTICE: the score in mWord does not change; we only change the score in entityScores.
entityScores.put(key,score);
}
}
if(mWord.mayCategory || mWord.mayEnt || mWord.mayType || mWord.mayLiteral)
mWordList.add(mWord);
}
}
/* Print all candidates (use fixed score).*/
System.out.println("------- Result ------");
for(MergedWord mWord: mWordList)
{
int key = mWord.st * (words.length+1) + mWord.ed;
if(mWord.mayCategory)
{
System.out.println("Detect category mapping: "+mWord.name+": "+ mWord.category +" score: 100.0");
preLog += "++++ Category detect: "+mWord.name+": "+mWord.category+" score: 100.0\n";
}
if(mWord.mayEnt)
{
System.out.println("Detect entity mapping: "+mWord.name+": [");
for(EntityMapping em: mWord.emList)
System.out.print(em.entityName + ", ");
System.out.println("]");
preLog += "++++ Entity detect: "+mWord.name+": "+mWord.emList.get(0).entityName+" score:"+entityScores.get(key)+"\n";
hitEntCnt++;
}
if(mWord.mayType)
{
System.out.println("Detect type mapping: "+mWord.name+": [");
for(TypeMapping tm: mWord.tmList)
System.out.print(tm.typeName + ", ");
System.out.println("]");
preLog += "++++ Type detect: "+mWord.name+": "+mWord.tmList.get(0).typeName +" score:"+typeScores.get(key)+"\n";
hitTypeCnt++;
}
if(mWord.mayLiteral)
{
System.out.println("Detect literal: "+mWord.name);
preLog += "++++ Literal detect: "+mWord.name+"\n";
}
}
/*
* Sort by score and remove duplicates.
* eg, <"video_game" "ent:Video game" "50.0"> <"a_video_game" "ent:Video game" "45.0">.
* Notice, all information is preserved in mWordList.
*/
// If one entity maps to several merged words in the query, keep the one with the higher score.
ByValueComparator bvc = new ByValueComparator(entityScores,words.length+1);
List<Integer> keys = new ArrayList<Integer>(entityMappings.keySet());
Collections.sort(keys, bvc);
for(Integer key : keys)
{
if(!mappingScores.containsKey(entityMappings.get(key)))
mappingScores.put(entityMappings.get(key), entityScores.get(key));
else
entityMappings.remove(key);
}
selectedList = new ArrayList<ArrayList<Integer>>();
ArrayList<Integer> selected = new ArrayList<Integer>();
// Some phrases must be selected.
selected.addAll(mustSelectedList);
for(Integer key: typeMappings.keySet())
{
// !type(len>1) (Omit len=1 because: [Brooklyn Bridge] is an entity.)
int ed = key%(words.length+1), st = key/(words.length+1);
if(st+1 < ed)
{
boolean beCovered = false;
//Entity covers type, eg: [prime_minister of Spain]
for(int preKey: entityMappings.keySet())
{
int te=preKey%(words.length+1),ts=preKey/(words.length+1);
//Entity should be longer than type
if(ts <= st && te >= ed && ed-st < te-ts)
{
beCovered = true;
}
}
if(!beCovered)
selected.add(key);
}
}
// Conflict resolution
ArrayList<Integer> noConflictSelected = new ArrayList<Integer>();
//select the longer one when spans conflict
boolean[] flag = new boolean[words.length];
ByLenComparator blc = new ByLenComparator(words.length+1);
Collections.sort(selected,blc);
for(Integer key : selected)
{
int ed = key%(words.length+1), st = (key-ed)/(words.length+1);
boolean omit = false;
for(int i=st;i<ed;i++)
{
if(flag[i])
{
omit = true;
break;
}
}
if(omit)
continue;
for(int i=st;i<ed;i++)
flag[i]=true;
noConflictSelected.add(key);
}
// Scoring and ranking --> top-k decision
dfs(keys,0,noConflictSelected,words.length+1);
ArrayList<NodeSelectedWithScore> nodeSelectedWithScoreList = new ArrayList<NodeSelectedWithScore>();
for(ArrayList<Integer> select: selectedList)
{
double score = 0;
for(Integer key: select)
{
if(entityScores.containsKey(key))
score += entityScores.get(key);
if(typeScores.containsKey(key))
score += typeScores.get(key);
}
NodeSelectedWithScore tmp = new NodeSelectedWithScore(select, score);
nodeSelectedWithScoreList.add(tmp);
}
Collections.sort(nodeSelectedWithScoreList);
// Replace
int cnt = 0;
for(int k=0; k<nodeSelectedWithScoreList.size(); k++)
{
if(k >= nodeSelectedWithScoreList.size())
break;
selected = nodeSelectedWithScoreList.get(k).selected;
Collections.sort(selected);
int j = 0;
String res = question;
if(selected.size()>0)
{
res = words[0].originalForm;
int tmp = selected.get(j++), st = tmp/(words.length+1), ed = tmp%(words.length+1);
for(int i=1;i<words.length;i++)
{
if(i>st && i<ed)
{
res = res+"_"+words[i].originalForm;
}
else
{
res = res+" "+words[i].originalForm;
}
if(i >= ed && j<selected.size())
{
tmp = selected.get(j++);
st = tmp/(words.length+1);
ed = tmp%(words.length+1);
}
}
}
else
{
res = words[0].originalForm;
for(int i=1;i<words.length;i++)
{
res = res+" "+words[i].originalForm;
}
}
boolean ok = true;
for(String str: fixedQuestionList)
if(str.equals(res))
ok = false;
if(!ok)
continue;
if(needRemoveCommas)
res = res.replace("_,_","_");
System.out.println("Merged: "+res);
preLog += "plan "+cnt+": "+res+"\n";
fixedQuestionList.add(res);
cnt++;
if(cnt >= 3) // top-3
break;
}
long t2 = System.currentTimeMillis();
// preLog += "Total hit/check/all ent num: "+hitEntCnt+" / "+checkEntCnt+" / "+allCnt+"\n";
// preLog += "Total hit/check/all type num: "+hitTypeCnt+" / "+checkTypeCnt+" / "+allCnt+"\n";
preLog += "Node Recognition time: "+ (t2-t1) + "ms\n";
System.out.println("Total check time: "+ (t2-t1) + "ms");
System.out.println("--------- pre entity/type recognition end ---------");
return fixedQuestionList;
}
public void dfs(List<Integer> keys,int dep,ArrayList<Integer> selected,int size)
{
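// Enumerate all conflict-free subsets of the candidate spans by DFS: at each depth, either skip
// keys.get(dep) or take it if its span overlaps no already-selected span; every complete
// assignment is copied into selectedList. E.g. for overlapping spans A=[0,2) and B=[1,3),
// the enumerated subsets are {}, {A} and {B} (illustrative).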
if(dep == keys.size())
{
ArrayList<Integer> tmpList = (ArrayList<Integer>) selected.clone();
selectedList.add(tmpList);
}
else
{
//off: dep-th mWord
dfs(keys,dep+1,selected,size);
//on: no conflict
boolean conflict = false;
for(int preKey: selected)
{
int curKey = keys.get(dep);
int preEd = preKey%size, preSt = (preKey-preEd)/size;
int curEd = curKey%size, curSt = (curKey-curEd)/size;
if(!(preSt<preEd && preEd<=curSt && curSt<curEd) && !(curSt<curEd && curEd<=preSt && preSt<preEd))
conflict = true;
}
if(!conflict)
{
selected.add(keys.get(dep));
dfs(keys,dep+1,selected,size);
selected.remove(keys.get(dep));
}
}
}
public ArrayList<EntityMapping> getEntityIDsAndNamesByStr(String entity, boolean useDblk, int len)
{
String n = entity;
ArrayList<EntityMapping> ret= new ArrayList<EntityMapping>();
//1. Handwritten dictionary
if(m2e.containsKey(entity))
{
String eName = m2e.get(entity);
EntityMapping em = new EntityMapping(EntityFragmentFields.entityName2Id.get(eName), eName, 1000);
ret.add(em);
return ret; // handwritten mappings are always correct
}
//2. Lucene index
ret.addAll(EntityFragment.getEntityMappingList(n));
//3. DBpedia Lookup (some cases)
if (useDblk)
{
ret.addAll(Globals.dblk.getEntityMappings(n, null));
}
Collections.sort(ret);
if (ret.size() > 0) return ret;
else return null;
}
public int preferDBpediaLookupOrLucene(String entityName)
{
int cntUpperCase = 0;
int cntSpace = 0;
int cntPoint = 0;
int length = entityName.length();
for (int i=0; i<length; i++)
{
char c = entityName.charAt(i);
if (c==' ')
cntSpace++;
else if (c=='.')
cntPoint++;
else if (c>='A' && c<='Z')
cntUpperCase++;
}
if ((cntUpperCase>0 || cntPoint>0) && cntSpace<3)
return 1;
if (cntUpperCase == length)
return 1;
return 0;
}
static class ByValueComparator implements Comparator<Integer> {
HashMap<Integer, Double> base_map;
int base_size;
double eps = 1e-8;
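// Epsilon-tolerant double comparison: -1 if a < b, 1 if a > b, 0 when |a-b| <= eps.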
int dblcmp(double a,double b)
{
if(a+eps < b)
return -1;
return b+eps<a ? 1:0;
}
public ByValueComparator(HashMap<Integer, Double> base_map, Integer size) {
this.base_map = base_map;
this.base_size = size;
}
public int compare(Integer arg0, Integer arg1) {
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) {
return 0;
}
if (dblcmp(base_map.get(arg0),base_map.get(arg1))<0) {
return 1;
}
else if (dblcmp(base_map.get(arg0),base_map.get(arg1))==0)
{
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size;
if (len0 < len1) {
return 1;
} else if (len0 == len1) {
return 0;
} else {
return -1;
}
}
else {
return -1;
}
}
}
static class ByLenComparator implements Comparator<Integer> {
int base_size;
public ByLenComparator(int size) {
this.base_size = size;
}
public int compare(Integer arg0, Integer arg1) {
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size;
if (len0 < len1) {
return 1;
} else if (len0 == len1) {
return 0;
} else {
return -1;
}
}
}
public boolean isDigit(char ch)
{
if(ch>='0' && ch<='9')
return true;
return false;
}
//TODO: other literal words.
public boolean checkLiteralWord(Word word)
{
boolean ok = false;
if(word.posTag.equals("CD"))
ok = true;
return ok;
}
public static void main (String[] args)
{
Globals.init();
EntityRecognition er = new EntityRecognition();
try
{
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
while (true)
{
System.out.println("Please input the question: ");
String question = br.readLine();
er.process(question);
}
// File inputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_in.txt");
// File outputFile = new File("D:\\husen\\gAnswer\\data\\test\\test_out.txt");
// BufferedReader fr = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile),"utf-8"));
// OutputStreamWriter fw = new OutputStreamWriter(new FileOutputStream(outputFile,true),"utf-8");
//
// String input;
// while((input=fr.readLine())!=null)
// {
// String[] strArray = input.split("\t");
// String id = "";
// String question = strArray[0];
// if(strArray.length>1)
// {
// question = strArray[1];
// id = strArray[0];
// }
// //Notice "?" may leads lucene/dbpedia lookup error
// if(question.length()>1 && question.charAt(question.length()-1)=='.' || question.charAt(question.length()-1)=='?')
// question = question.substring(0,question.length()-1);
// if(question.isEmpty())
// continue;
// er.process(question);
// fw.write("Id: "+id+"\nQuery: "+question+"\n");
// fw.write(er.preLog+"\n");
// }
//
// fr.close();
// fw.close();
} catch (IOException e) {
e.printStackTrace();
}
}

}

+ 467
- 0
src/qa/extract/ExtractImplicitRelation.java View File

@@ -0,0 +1,467 @@
package qa.extract;

import java.io.BufferedReader;
//import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import paradict.ParaphraseDictionary;
import qa.Globals;
import rdf.Sparql;
import rdf.Triple;
import rdf.ImplicitRelation;
import lcn.EntityFragmentFields;
import log.QueryLogger;
import fgmt.EntityFragment;
import fgmt.TypeFragment;
import nlp.ds.Word;
import nlp.tool.CoreNLP;

public class ExtractImplicitRelation {
static final int SamplingNumber = 100; // the maximum sampling number in calculation
static final int k = 3; // select top-k when many suitable relations; select top-k entities for a word
public HashMap<String, Integer> implicitEntRel = new HashMap<String, Integer>();
/*
* Implicit Relations:
* eg, Which is the film directed by Obama and starred by a Chinese ?x
* 1. [What] is in a [chocolate_chip_cookie] ?var + ent
* 2. What [country] is [Sitecore] from ?type + ent = [?var p ent + ?var<-type]
* 3. Czech movies | Chinese actor ent + ?type
* 4. President Obama type + ent
* 5. Andy Liu's Hero(film) ent + ent
* */
public ExtractImplicitRelation()
{
//original word in lower case
implicitEntRel.put("american", Globals.pd.predicate_2_id.get("country"));
implicitEntRel.put("united_states", Globals.pd.predicate_2_id.get("country"));
}
// Notice, it is usually UNNECESSARY for two constants, so we left this function unimplemented.
// eg, "president Obama", "Andy Liu's Hero(film)".
public ArrayList<Integer> getPrefferdPidListBetweenTwoConstant(Word w1, Word w2)
{
ArrayList<Integer> res = new ArrayList<Integer>();
int w1Role = 0, w2Role = 0; // 0:var 1:ent 2:type
if(w1.mayEnt && w1.emList.size()>0)
w1Role = 1;
if(w1.mayType && w1.tmList.size()>0)
w1Role = 2;
if(w2.mayEnt && w2.emList.size()>0)
w2Role = 1;
if(w2.mayType && w2.tmList.size()>0)
w2Role = 2;
//Reject variables | two types
if(w1Role == 0 || w2Role == 0 || (w1Role == 2 && w2Role == 2))
return null;
//ent1 & ent2
//if(w1Role == 1 && w2Role == 1)
//{
//EntityFragment ef = null;
// TODO: implement.
//}
return res;
}
public ArrayList<Triple> supplementTriplesByModifyWord(QueryLogger qlog)
{
ArrayList<Triple> res = new ArrayList<Triple>();
ArrayList<Word> typeVariableList = new ArrayList<Word>();
// Modifier
for(Word word: qlog.s.words)
{
if(word.modifiedWord != null && word.modifiedWord != word)
{
ArrayList<ImplicitRelation> irList = null;
// ent -> typeVariable | eg, Chinese actor, Czech movies | TODO: consider more types of modifier
if(word.mayEnt && word.modifiedWord.mayType)
{
typeVariableList.add(word.modifiedWord);
int tId = word.modifiedWord.tmList.get(0).typeID; // select the top-1 type
String tName = word.modifiedWord.originalForm;
for(int i=0; i<k&&i<word.emList.size(); i++) // select the top-k entities
{
int eId = word.emList.get(i).entityID;
String eName = word.emList.get(i).entityName;
irList = getPrefferdPidListBetween_Entity_TypeVariable(eId, tId);
// !Hand-written implicit relations
if(irList != null && implicitEntRel.containsKey(word.originalForm.toLowerCase()))
{
int pId = implicitEntRel.get(word.originalForm.toLowerCase());
ImplicitRelation ir = new ImplicitRelation(tId, eId, pId, 1000);
irList.add(0, ir);
}
if(irList!=null && irList.size()>0)
{
ImplicitRelation ir = irList.get(0);
String subjName = null, objName = null;
Word subjWord = null, objWord = null;
if(ir.subjId == eId)
{
subjName = eName;
objName = "?"+tName;
subjWord = word;
objWord = word.modifiedWord;
}
else
{
subjName = "?"+tName;
objName = eName;
subjWord = word.modifiedWord;
objWord = word;
}
Triple triple = new Triple(ir.subjId, subjName, ir.pId, ir.objId, objName, null, ir.score, subjWord, objWord);
res.add(triple);
break;
}
}
}
}
}
if(qlog.rankedSparqls == null || qlog.rankedSparqls.size() == 0)
{
if(res != null && res.size() > 0)
{
Sparql spq = new Sparql();
for(Triple t: res)
spq.addTriple(t);
// Add type info
for(Word typeVar: typeVariableList)
{
Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+typeVar.originalForm, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeVar.tmList.get(0).typeName, null, 100);
spq.addTriple(triple);
}
qlog.rankedSparqls.add(spq);
}
}
else
{
// Supplement implicit relations (modified) for each SPARQL.
for(Sparql spq: qlog.rankedSparqls)
{
for(Triple t: res)
spq.addTriple(t);
}
}
return res;
}
/*
* eg:Czech|ent movies|?type Chinese|ent actor|?type
* type variable + entity -> entities belonging to the type + entity
* */
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_TypeVariable(Integer entId, Integer typeId)
{
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>();
TypeFragment tf = TypeFragment.typeFragments.get(typeId);
EntityFragment ef2 = EntityFragment.getEntityFragmentByEntityId(entId);
if(tf == null || ef2 == null)
{
System.out.println("Error in getPrefferdPidListBetween_TypeVariable_Entity :Type(" +
TypeFragment.typeId2ShortName.get(typeId) + ") or Entity(" + EntityFragmentFields.entityId2Name.get(entId) + ") no fragments.");
return null;
}
// select entities belonging to the type, count relations | TODO: random selection
int samplingCnt = 0;
HashMap<ImplicitRelation, Integer> irCount = new HashMap<ImplicitRelation, Integer>();
for(int candidateEid: tf.entSet)
{
EntityFragment ef1 = EntityFragment.getEntityFragmentByEntityId(candidateEid);
if(ef1 == null)
continue;
ArrayList<ImplicitRelation> tmp = getPrefferdPidListBetween_TwoEntities(ef1, ef2);
if(tmp == null || tmp.size() == 0)
continue;
if(samplingCnt++ > SamplingNumber)
break;
for(ImplicitRelation ir: tmp)
{
if(ir.subjId == candidateEid)
ir.setSubjectId(Triple.VAR_ROLE_ID);
else if(ir.objId == candidateEid)
ir.setObjectId(Triple.VAR_ROLE_ID);
if(irCount.containsKey(ir))
irCount.put(ir, irCount.get(ir)+1);
else
irCount.put(ir, 1);
}
}
//sort, get top-k
ByValueComparator bvc = new ByValueComparator(irCount);
List<ImplicitRelation> keys = new ArrayList<ImplicitRelation>(irCount.keySet());
Collections.sort(keys, bvc);
for(ImplicitRelation ir: keys)
{
res.add(ir);
if(res.size() >= k)
break;
}
return res;
}
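// Rough illustration of the sampling strategy above (entity/type names hypothetical): for
// "Czech movies", take the type variable's fragment (eg, Film), enumerate up to SamplingNumber
// member entities, collect every predicate linking such an entity to the constant entity
// (eg, Czech_Republic) via getPrefferdPidListBetween_TwoEntities, and keep the k most
// frequent ImplicitRelations as candidates.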
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_TypeVariable(String entName, String typeName)
{
if(!TypeFragment.typeShortName2IdList.containsKey(typeName) || !EntityFragmentFields.entityName2Id.containsKey(entName))
return null;
return getPrefferdPidListBetween_Entity_TypeVariable(EntityFragmentFields.entityName2Id.get(entName), TypeFragment.typeShortName2IdList.get(typeName).get(0));
}
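// Sorts ImplicitRelations by descending support count; eg, (illustrative) counts
// {irA=5, irB=2} yield the order [irA, irB], so the best-supported relation comes first.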
static class ByValueComparator implements Comparator<ImplicitRelation> {
HashMap<ImplicitRelation, Integer> base_map;
public ByValueComparator(HashMap<ImplicitRelation, Integer> base_map) {
this.base_map = base_map;
}
public int compare(ImplicitRelation arg0, ImplicitRelation arg1) {
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1))
return 0;
if (base_map.get(arg0) < base_map.get(arg1))
return 1;
else if (base_map.get(arg0).equals(base_map.get(arg1))) // use equals(): Integer == compares references
return 0;
else
return -1;
}
}
/*
* Notice: this function is in fact unused.
* eg:[What] is in a [chocolate_chip_cookie]
* Just guess from the single entity: select the most frequent edge.
* */
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(Integer entId, String var)
{
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>();
EntityFragment ef = null;
ef = EntityFragment.getEntityFragmentByEntityId(entId);
if(ef == null)
{
System.out.println("Error in getPrefferdPidListBetween_Entity_Variable: Entity No Fragments!");
return null;
}
// find most frequent inEdge
int pid = findMostFrequentEdge(ef.inEntMap, ef.inEdges);
if(pid != -1)
res.add(new ImplicitRelation(Triple.VAR_ROLE_ID, entId, pid, 100));
// find most frequent outEdge
pid = findMostFrequentEdge(ef.outEntMap, ef.outEdges);
if(pid != -1)
res.add(new ImplicitRelation(entId, Triple.VAR_ROLE_ID, pid, 100));
return res;
}
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_Entity_Variable(String entName, String var)
{
return getPrefferdPidListBetween_Entity_Variable(EntityFragmentFields.entityName2Id.get(entName), var);
}
public int findMostFrequentEdge(HashMap<Integer, ArrayList<Integer>> entMap, HashSet<Integer> edges)
{
int mfPredicateId = -1, maxCount = 0;
HashMap<Integer, Integer> edgeCount = new HashMap<Integer, Integer>();
for(int key: entMap.keySet())
{
for(int edge: entMap.get(key))
{
if(!edgeCount.containsKey(edge))
edgeCount.put(edge, 1);
else
edgeCount.put(edge, edgeCount.get(edge)+1);
if(maxCount < edgeCount.get(edge))
{
maxCount = edgeCount.get(edge);
mfPredicateId = edge;
}
}
}
return mfPredicateId;
}
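// Worked example (hypothetical IDs): for entMap = {7 -> [2, 3], 9 -> [2]} the edge counts
// are {2: 2, 3: 1}, so findMostFrequentEdge returns predicate id 2.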

// Unnecessary.
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TypeConstant_Entity(Integer typeId, Integer entId)
{
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>();
TypeFragment tf = TypeFragment.typeFragments.get(typeId);
if(tf == null)
{
System.out.println("Error in getPrefferdPidListBetween_TypeConstant_Entity: Type No Fragments!");
return null;
}
// subj : ent1
if(tf.entSet.contains(entId))
{
ImplicitRelation ir = new ImplicitRelation(entId, typeId, Globals.pd.typePredicateID, 100);
res.add(ir);
}
return res;
}
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(String eName1, String eName2)
{
return getPrefferdPidListBetween_TwoEntities(EntityFragmentFields.entityName2Id.get(eName1), EntityFragmentFields.entityName2Id.get(eName2));
}
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(Integer eId1, Integer eId2)
{
EntityFragment ef1 = null, ef2 = null;
ef1 = EntityFragment.getEntityFragmentByEntityId(eId1);
ef2 = EntityFragment.getEntityFragmentByEntityId(eId2);
if(ef1 == null || ef2 == null)
{
System.out.println("Error in GetPrefferdPidListBetweenTwoEntities: Entity No Fragments!");
return null;
}
return getPrefferdPidListBetween_TwoEntities(ef1,ef2);
}
public ArrayList<ImplicitRelation> getPrefferdPidListBetween_TwoEntities(EntityFragment ef1, EntityFragment ef2)
{
ArrayList<ImplicitRelation> res = new ArrayList<ImplicitRelation>();
if(ef1 == null || ef2 == null)
return null;
int eId1 = ef1.eId;
int eId2 = ef2.eId;
// subj : ent1
if(ef1.outEntMap.containsKey(eId2))
{
ArrayList<Integer> pidList = ef1.outEntMap.get(eId2);
for(int pid: pidList)
{
// TODO: other score strategy
ImplicitRelation ir = new ImplicitRelation(eId1, eId2, pid, 100);
res.add(ir);
}
}
// subj : ent2
else if(ef2.outEntMap.containsKey(eId1))
{
ArrayList<Integer> pidList = ef2.outEntMap.get(eId1);
for(int pid: pidList)
{
ImplicitRelation ir = new ImplicitRelation(eId2, eId1, pid, 100);
res.add(ir);
}
}
return res;
}
public static void main(String[] args) throws Exception {
Globals.coreNLP = new CoreNLP();
Globals.pd = new ParaphraseDictionary();
try
{
EntityFragmentFields.load();
TypeFragment.load();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
ExtractImplicitRelation eir = new ExtractImplicitRelation();
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
String name1,name2;
while(true)
{
System.out.println("Input two node to extract their implicit relations:");
name1 = br.readLine();
name2 = br.readLine();
ArrayList<ImplicitRelation> irList = null;
irList = eir.getPrefferdPidListBetween_TwoEntities(name1, name2);
if(irList == null || irList.size()==0)
System.out.println("Can't find!");
else
{
for(ImplicitRelation ir: irList)
{
int pId = ir.pId;
String p = Globals.pd.getPredicateById(pId);
System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId);
System.out.println(ir.subj+"\t"+p+"\t"+ir.obj);
}
}
// irList = eir.getPrefferdPidListBetween_TypeConstant_Entity(name1, name2);
// if(irList == null || irList.size()==0)
// System.out.println("Can't find!");
// else
// {
// for(ImplicitRelation ir: irList)
// {
// int pId = ir.pId;
// String p = Globals.pd.getPredicateById(pId);
// System.out.println(ir.subj+"\t"+p+"\t"+ir.obj);
// }
// }
// irList = eir.getPrefferdPidListBetween_Entity_Variable(name1, name2);
// if(irList == null || irList.size()==0)
// System.out.println("Can't find!");
// else
// {
// for(ImplicitRelation ir: irList)
// {
// int pId = ir.pId;
// String p = Globals.pd.getPredicateById(pId);
// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId);
// }
// }
// irList = eir.getPrefferdPidListBetween_Entity_TypeVariable(name1, name2);
// if(irList == null || irList.size()==0)
// System.out.println("Can't find!");
// else
// {
// for(ImplicitRelation ir: irList)
// {
// int pId = ir.pId;
// String p = Globals.pd.getPredicateById(pId);
// System.out.println(ir.subjId+"\t"+p+"\t"+ir.objId);
// }
// }
}
}
}

+ 472
- 0
src/qa/extract/ExtractRelation.java View File

@@ -0,0 +1,472 @@
package qa.extract;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;

import log.QueryLogger;
import nlp.ds.DependencyTree;
import nlp.ds.DependencyTreeNode;
//import nlp.ds.Word;
import paradict.ParaphraseDictionary;
import qa.Globals;
import rdf.SimpleRelation;
import rdf.PredicateMapping;
import rdf.SemanticRelation;
import rdf.SemanticUnit;

public class ExtractRelation {

public static final int notMatchedCountThreshold = 1; // the bigger, the looser (more relations can be extracted)
public static final int notCoverageCountThreshold = 2;
/*
* Find relations by dependency tree & paraphrases.
* */
public ArrayList<SimpleRelation> findRelationsBetweenTwoUnit(SemanticUnit su1, SemanticUnit su2, QueryLogger qlog)
{
DependencyTree T = qlog.s.dependencyTreeStanford;
if(qlog.isMaltParserUsed)
T = qlog.s.dependencyTreeMalt;
DependencyTreeNode n1 = T.getNodeByIndex(su1.centerWord.position), n2 = T.getNodeByIndex(su2.centerWord.position);
ArrayList<DependencyTreeNode> shortestPath = T.getShortestNodePathBetween(n1,n2);
ArrayList<SimpleRelation> ret = new ArrayList<SimpleRelation>();
HashSet<String> BoW_T = new HashSet<String>();
HashSet<String> SubBoW_T = new HashSet<String>();
// (Fix shortest path) Some cases need words outside the shortest path | eg: What [be] [ent] (famous) for?
// what-be-[ent]: the word [be] is useless but we need (famous)
if(shortestPath.size() == 3 && shortestPath.get(1).word.baseForm.equals("be") && T.nodesList.size() > shortestPath.get(2).word.position)
{
shortestPath.remove(1);
shortestPath.add(1, T.getNodeByIndex(shortestPath.get(1).word.position + 1));
}
// Shortest path -> SubBag of Words
for(DependencyTreeNode curNode: shortestPath)
{
String text = curNode.word.baseForm;
if(!curNode.word.isIgnored && !Globals.stopWordsList.isStopWord(text))
{
//!split words |eg, soccer club -> soccer_club(after node recognition) -> soccer club(used in matching paraphrase)
if(curNode.word.mayEnt || curNode.word.mayType)
{
String [] strArray = curNode.word.baseForm.split("_");
for(String str: strArray)
SubBoW_T.add(str);
}
else
{
SubBoW_T.add(text);
}
}
}
// DS tree -> Bag of Words
for (DependencyTreeNode curNode : T.getNodesList())
{
if (!curNode.word.isIgnored)
{
String text = curNode.word.baseForm;
if(curNode.word.mayEnt || curNode.word.mayType)
{
String [] strArray = curNode.word.baseForm.split("_");
for(String str: strArray)
BoW_T.add(str);
}
else
{
BoW_T.add(text);
}
}
}
// Find candidate patterns by SubBoW_T & invertedIndex
HashSet<String> candidatePatterns = new HashSet<String>();
for (String curWord : SubBoW_T)
{
ArrayList<String> postingList = Globals.pd.invertedIndex.get(curWord);
if (postingList != null)
{
candidatePatterns.addAll(postingList);
}
}
// Check patterns by BoW_P & subtree matching
int notMatchedCount = 0;
HashSet<String> validCandidatePatterns = new HashSet<String>();
for (String p : candidatePatterns)
{
String[] BoW_P = p.split(" ");
notMatchedCount = 0; // number of mismatched words between pattern & question
for (String s : BoW_P)
{
if (s.length() < 2)
continue;
if (s.startsWith("["))
continue;
if (Globals.stopWordsList.isStopWord(s))
continue;
if (!BoW_T.contains(s))
{
notMatchedCount ++;
if (notMatchedCount > notMatchedCountThreshold)
break;
}
}
if (notMatchedCount <= notMatchedCountThreshold)
{
validCandidatePatterns.add(p);
//TODO: to support matching like [soccer_club]
subTreeMatching(p, BoW_P, shortestPath, T, qlog, ret, 'S');
}
}
// Another chance for [soccer_club] (the relation embedded in nodes)
if(validCandidatePatterns.size() > 0)
{
if(n1.word.originalForm.contains("_") || n2.word.originalForm.contains("_"))
{
for (String p : validCandidatePatterns)
{
String[] BoW_P = p.split(" ");
notMatchedCount = 0;
int mappedCharacterCount = 0;
int matchedWordInArg = 0;

boolean[] matchedFlag = new boolean[BoW_P.length];
for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;}
int idx = 0;
for (String s : BoW_P)
{
if(n1.word.baseForm.contains(s) || n2.word.baseForm.contains(s)) // Hit nodes
matchedWordInArg++;
if(BoW_T.contains(s))
{
mappedCharacterCount += s.length();
matchedFlag[idx] = true;
}
idx++;
if (s.length() < 2)
continue;
if (s.startsWith("["))
continue;
if (Globals.stopWordsList.isStopWord(s))
continue;
if (!BoW_T.contains(s))
notMatchedCount ++;
}
// Succeed if there are 2 hits
if(matchedWordInArg >= 2)
{
double matched_score = ((double)(BoW_P.length-notMatchedCount))/((double)(BoW_P.length));
if (matched_score > 0.95)
matched_score *= 10; // award for WHOLE match
// TODO: this makes a LONGER pattern get a LARGER score, sometimes unsuitable | eg, be bear die in
matched_score = matched_score * Math.sqrt(mappedCharacterCount);
SimpleRelation sr = new SimpleRelation();
sr.arg1Word = n1.word;
sr.arg2Word = n2.word;
sr.relationParaphrase = p;
sr.matchingScore = matched_score;
sr.extractingMethod = 'X';
if (n1.dep_father2child.endsWith("subj"))
sr.preferredSubj = sr.arg1Word;
sr.arg1Word.setIsCovered();
sr.arg2Word.setIsCovered();
sr.setPasList(p, matched_score, matchedFlag);
sr.setPreferedSubjObjOrder(T);
ret.add(sr);
}
}
}
}
return ret;
}
// Core function of paraphrase matching
private void subTreeMatching (String pattern, String[] BoW_P,
ArrayList<DependencyTreeNode> shortestPath,
DependencyTree T, QueryLogger qlog,
ArrayList<SimpleRelation> ret, char extractingMethod)
{
DependencyTreeNode n1 = shortestPath.get(0);
DependencyTreeNode n2 = shortestPath.get(shortestPath.size()-1);
ParaphraseDictionary pd = Globals.pd;
Queue<DependencyTreeNode> queue = new LinkedList<DependencyTreeNode>();
queue.add(T.getRoot());
for(DependencyTreeNode curOuterNode: shortestPath)
{
outer:
for(String s: BoW_P)
{
if(s.equals(curOuterNode.word.baseForm))
{
// try to match all nodes
ArrayList<DependencyTreeNode> subTreeNodes = new ArrayList<DependencyTreeNode>();
Queue<DependencyTreeNode> queue2 = new LinkedList<DependencyTreeNode>();
queue2.add(curOuterNode);
int unMappedLeft = BoW_P.length;
int mappedCharacterCount = 0;
int hitPathCnt = 0; // words in the pattern that hit the shortest path
int hitPathBetweenTwoArgCnt = 0; // words in the pattern that hit the shortest path, excluding the two target nodes
double mappedCharacterCountPunishment = 0; // penalty when the pattern contains [[...]] (function words)
DependencyTreeNode curNode;
boolean[] matchedFlag = new boolean[BoW_P.length];
for(int idx = 0; idx < BoW_P.length; idx ++) {matchedFlag[idx] = false;}

while (unMappedLeft > 0 && (curNode=queue2.poll())!=null)
{
if (curNode.word.isIgnored) continue;
int idx = 0;
for (String ss : BoW_P)
{
// words in the pattern can only be matched once
if (!matchedFlag[idx])
{
// check word
if (ss.equals(curNode.word.baseForm))
{
unMappedLeft --;
subTreeNodes.add(curNode);
queue2.addAll(curNode.childrenList);
matchedFlag[idx] = true;
mappedCharacterCount += ss.length();
if(shortestPath.contains(curNode))
{
hitPathCnt++;
if(curNode!=n1 && curNode!=n2)
hitPathBetweenTwoArgCnt++;
}
break;
}
// check POS tag
else if (ss.startsWith("[") && posSame(curNode.word.posTag, ss))
{
unMappedLeft --;
subTreeNodes.add(curNode);
queue2.addAll(curNode.childrenList);
matchedFlag[idx] = true;
mappedCharacterCount += curNode.word.baseForm.length();
mappedCharacterCountPunishment += 0.01;
break;
}
}
idx ++;
}
}
int unMatchedNoneStopWordCount = 0;
int matchedNoneStopWordCount = 0;
for (int idx = 0; idx < BoW_P.length; idx ++) {
if (BoW_P[idx].startsWith("[")) continue;
if (!matchedFlag[idx]) {
if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // unmatched
unMatchedNoneStopWordCount ++;
}
else {
if (!Globals.stopWordsList.isStopWord(BoW_P[idx])) // matched
matchedNoneStopWordCount ++;
}
}

if (unMatchedNoneStopWordCount > notMatchedCountThreshold) {
if(qlog.MODE_debug) System.out.println("----But the pattern\"" + pattern + "\" is not a subtree.");
break outer;
}
// MUST match some content words (matched non-stop words > 0)
if (matchedNoneStopWordCount == 0){
if(qlog.MODE_debug) System.out.println("----But the matching for pattern \"" + pattern + "\" does not have content words.");
break outer;
}
// If this is a partial match covered by another pattern, give up the current pattern
if (unMappedLeft > 0) {
StringBuilder subpattern = new StringBuilder();
for (int idx = 0; idx < BoW_P.length; idx ++) {
if (matchedFlag[idx]) {
subpattern.append(BoW_P[idx]);
subpattern.append(' ');
}
}
subpattern.deleteCharAt(subpattern.length()-1);
if (pd.nlPattern_2_predicateList.containsKey(subpattern)) {
if(qlog.MODE_debug) System.out.println("----But the partially matched pattern \"" + pattern + "\" is another pattern.");
break outer;
}
}
// !Preposition | suppose there is only one preposition
// TODO: consider more prepositions | the first preposition may be wrong
DependencyTreeNode prep = null;
for (DependencyTreeNode dtn : subTreeNodes) {
outer2:
for (DependencyTreeNode dtn_child : dtn.childrenList) {
if(pd.prepositions.contains(dtn_child.word.baseForm)) {
prep = dtn_child;
break outer2;
}
}
}
boolean isContained = false;
for(DependencyTreeNode dtn_contain : subTreeNodes) {
if(dtn_contain == prep) isContained = true;
}
if(!isContained && prep != null) {
subTreeNodes.add(prep);
}
// Relation extracted, set COVER flags
for (DependencyTreeNode dtn : subTreeNodes)
{
dtn.word.isCovered = true;
}
int cnt = 0;
double matched_score = ((double)(BoW_P.length-unMappedLeft))/((double)(BoW_P.length));
if (matched_score > 0.95)
matched_score *= 10; // Award for WHOLE match
// The larger the match ratio between pattern and path, the higher the score; especially when the hits exclude the two target nodes
if(hitPathCnt != 0)
{
double hitScore = 1 + (double)hitPathCnt/(double)BoW_P.length;
if(hitPathBetweenTwoArgCnt == hitPathCnt)
hitScore += 1;
else if(shortestPath.size() >= 4) // If the path is long enough but the pattern still covers the target nodes, penalize
{
//hitScore = 0.5;
if(hitPathBetweenTwoArgCnt == 0) // If the path is long enough but the pattern only covers the target nodes, penalize heavily
hitScore = 0.25;
}
matched_score *= hitScore;
}
matched_score = matched_score * Math.sqrt(mappedCharacterCount) - mappedCharacterCountPunishment; // the longer, the better (unsuitable in some cases)
if (qlog.MODE_debug) System.out.println("☆" + pattern + ", score=" + matched_score);

DependencyTreeNode subject = n1;
DependencyTreeNode object = n2;
if (subject != object)
{
SimpleRelation sr = new SimpleRelation();
sr.arg1Word = subject.word;
sr.arg2Word = object.word;
sr.relationParaphrase = pattern;
sr.matchingScore = matched_score;
sr.extractingMethod = extractingMethod;
if (subject.dep_father2child.endsWith("subj"))
sr.preferredSubj = sr.arg1Word;
sr.arg1Word.setIsCovered();
sr.arg2Word.setIsCovered();
sr.setPasList(pattern, matched_score, matchedFlag);
sr.setPreferedSubjObjOrder(T);
ret.add(sr);
cnt ++;
//String binaryRelation = "<" + subjectString + "> <" + pattern + "> <" + objectString + ">";
}
if (cnt == 0) break outer;
}
}
}
}
// [[det]], [[num]], [[adj]], [[pro]], [[prp]], [[con]], [[mod]]
public boolean posSame(String tag, String posWithBracket) {
if ( (posWithBracket.charAt(2) == 'd' && tag.equals("DT"))
|| (posWithBracket.charAt(2) == 'n' && tag.equals("CD"))
|| (posWithBracket.charAt(2) == 'a' && (tag.startsWith("JJ") || tag.startsWith("RB")))
|| (posWithBracket.charAt(2) == 'c' && tag.startsWith("CC"))//TODO: how about "IN: subordinating conjunction"?
|| (posWithBracket.charAt(2) == 'm' && tag.equals("MD"))) {
return true;
}
else if (posWithBracket.charAt(2) == 'p') {
if ( (posWithBracket.charAt(4) == 'o' && tag.startsWith("PR"))
|| (posWithBracket.charAt(4) == 'p' && (tag.equals("IN") || tag.equals("TO")))) {
return true;
}
}
return false;
}
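// Examples (per the bracket tags above): posSame("DT", "[[det]]") and posSame("MD", "[[mod]]")
// return true; posSame("PRP", "[[pro]]") matches via the "PR" prefix; posSame("IN", "[[prp]]")
// matches prepositions; posSame("NN", "[[det]]") returns false.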
public HashMap<Integer, SemanticRelation> groupSimpleRelationsByArgsAndMapPredicate (ArrayList<SimpleRelation> simpleRelations) {
System.out.println("==========Group Simple Relations=========");
HashMap<Integer, SemanticRelation> ret = new HashMap<Integer, SemanticRelation>();
HashMap<Integer, HashMap<Integer, StringAndDouble>> key2pasMap = new HashMap<Integer, HashMap<Integer, StringAndDouble>>();
for(SimpleRelation simr : simpleRelations)
{
int key = simr.getHashCode();
if (!ret.keySet().contains(key))
{
ret.put(key, new SemanticRelation(simr));
key2pasMap.put(key, new HashMap<Integer, StringAndDouble>());
}
SemanticRelation semr = ret.get(key);
HashMap<Integer, StringAndDouble> pasMap = key2pasMap.get(key);
// Just used for display.
if (simr.matchingScore > semr.LongestMatchingScore)
{
semr.LongestMatchingScore = simr.matchingScore;
semr.relationParaphrase = simr.relationParaphrase;
}
// for pid=x, no matter which pattern it comes from, we only record the highest score and the related pattern.
for (int pid : simr.pasList.keySet()) {
double score = simr.pasList.get(pid);
if (!pasMap.containsKey(pid)) {
pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score));
}
else if (score > pasMap.get(pid).score) {
pasMap.put(pid, new StringAndDouble(simr.relationParaphrase, score));
}
}
}
for (Integer key : key2pasMap.keySet()) {
SemanticRelation semr = ret.get(key);
HashMap<Integer, StringAndDouble> pasMap = key2pasMap.get(key);
semr.predicateMappings = new ArrayList<PredicateMapping>();
//System.out.print("<"+semr.arg1Word.getFullEntityName() + "," + semr.arg2Word.getFullEntityName() + ">:");
for (Integer pid : pasMap.keySet())
{
semr.predicateMappings.add(new PredicateMapping(pid, pasMap.get(pid).score, pasMap.get(pid).str));
//System.out.print("[" + Globals.pd.getPredicateById(pid) + "," + pasMap.get(pid).str + "," + pasMap.get(pid).score + "]");
}
Collections.sort(semr.predicateMappings);
}
System.out.println("=========================================");
return ret;
}
}

class StringAndDouble {
public String str;
public double score;
public StringAndDouble (String str, double score) {
this.str = str;
this.score = score;
}
}

+ 358
- 0
src/qa/extract/TypeRecognition.java View File

@@ -0,0 +1,358 @@
package qa.extract;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;

import nlp.ds.Word;
import nlp.tool.StopWordsList;
//import fgmt.RelationFragment;
import fgmt.TypeFragment;
import lcn.SearchInTypeShortName;
import log.QueryLogger;
import qa.Globals;
import rdf.PredicateMapping;
import rdf.SemanticRelation;
import rdf.Triple;
import rdf.TypeMapping;

/*
* 2016-6-17
* 1. Recognize types (including YAGO types)
* 2. Add some type mappings manually, eg, "US State"-"yago:StatesOfTheUnitedStates"
* 3. Add some extended variables (generalization of [variable with inherited type] -> [variable with inherited triples]), eg, ?canadian <birthPlace> <Canada>
* */
public class TypeRecognition {
// dbpedia 2014
//public static final int[] type_Person = {180,279};
//public static final int[] type_Place = {49,228};
//public static final int[] type_Organisation = {419,53};
//dbpedia 2016
public static final int[] type_Person = {5828,15985};
public static final int[] type_Place = {11197,2188};
public static final int[] type_Organisation = {1335,4716};
public static HashMap<String, String> extendTypeMap = null;
public static HashMap<String, Triple> extendVariableMap = null;
SearchInTypeShortName st = new SearchInTypeShortName();
static
{
extendTypeMap = new HashMap<String, String>();
extendVariableMap = new HashMap<String, Triple>();
Triple triple = null;
//!Hand-written for convenience | TODO: approximate/semantic matching of types
extendTypeMap.put("NonprofitOrganizations", "dbo:Non-ProfitOrganisation");
extendTypeMap.put("GivenNames", "dbo:GivenName");
extendTypeMap.put("JamesBondMovies","yago:JamesBondFilms");
extendTypeMap.put("TVShows", "dbo:TelevisionShow");
extendTypeMap.put("USState", "yago:StatesOfTheUnitedStates");
extendTypeMap.put("USStates", "yago:StatesOfTheUnitedStates");
extendTypeMap.put("Europe", "yago:EuropeanCountries");
extendTypeMap.put("Africa", "yago:AfricanCountries");
//!The following IDs are based on DBpedia 2014.
//!extend variable (embedded triples) | eg, [?E|surfers]-?uri dbo:occupation res:Surfing | canadians -> <?canadian> <birthPlace> <Canada>
//1) <?canadians> <birthPlace> <Canada> | [country people] <birthPlace|1639> [country]
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 2112902, "Canada", null, 100);
extendVariableMap.put("canadian", triple);
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 1639, 883747, "Germany", null, 100);
extendVariableMap.put("german", triple);
//2) ?bandleader <occupation|6690> <Bandleader>
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436853, "Bandleader", null, 100);
extendVariableMap.put("bandleader", triple);
triple = new Triple(Triple.VAR_ROLE_ID, Triple.VAR_NAME, 6690, 5436854, "Surfing", null, 100);
extendVariableMap.put("surfer", triple);
}
public static void recognizeExtendVariable(Word w)
{
String key = w.baseForm;
if(extendVariableMap.containsKey(key))
{
w.mayExtendVariable = true;
Triple triple = extendVariableMap.get(key).copy();
if(triple.subjId == Triple.VAR_ROLE_ID && triple.subject.equals(Triple.VAR_NAME))
triple.subject = "?" + w.originalForm;
if(triple.objId == Triple.VAR_ROLE_ID && triple.object.equals(Triple.VAR_NAME))
triple.object = "?" + w.originalForm;
w.embbededTriple = triple;
}
}
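// Sketch of the effect (using the hand-written entry above): for a word with baseForm
// "canadian" and originalForm "Canadians", the word is flagged mayExtendVariable and gets
// the embedded triple ?Canadians <birthPlace|1639> <Canada>, ie, the template variable is
// renamed to "?" + originalForm on whichever side held Triple.VAR_NAME.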
public ArrayList<TypeMapping> getExtendTypeByStr(String allUpperFormWord)
{
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>();
//Do not consider SINGLE-word type (most are useless) | eg, Battle, War, Daughter
if(allUpperFormWord.length() > 1 && allUpperFormWord.substring(1).equals(allUpperFormWord.substring(1).toLowerCase()))
return null;
//search in YAGO type
if(TypeFragment.yagoTypeList.contains(allUpperFormWord))
{
//YAGO prefix
String typeName = "yago:"+allUpperFormWord;
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1);
tmList.add(tm);
}
else if(extendTypeMap.containsKey(allUpperFormWord))
{
String typeName = extendTypeMap.get(allUpperFormWord);
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1);
tmList.add(tm);
}
if(tmList.size()>0)
return tmList;
else
return null;
}
public ArrayList<TypeMapping> getTypeIDsAndNamesByStr (String baseform)
{
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>();
try
{
tmList = st.searchTypeScore(baseform, 0.4, 0.8, 10);
Collections.sort(tmList);
if (tmList.size()>0)
return tmList;
else
return null;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
public ArrayList<Integer> recognize (String baseform) {
char c = baseform.charAt(baseform.length()-1);
if (c >= '0' && c <= '9') {
baseform = baseform.substring(0, baseform.length()-2);
}
try {
ArrayList<String> ret = st.searchType(baseform, 0.4, 0.8, 10);
ArrayList<Integer> ret_in = new ArrayList<Integer>();
for (String s : ret) {
System.out.println("["+s+"]");
ret_in.addAll(TypeFragment.typeShortName2IdList.get(s));
}
if (ret_in.size()>0) return ret_in;
else return null;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}

public static void AddTypesOfWhwords (HashMap<Integer, SemanticRelation> semanticRelations) {
ArrayList<TypeMapping> ret = null;
for (Integer it : semanticRelations.keySet())
{
SemanticRelation sr = semanticRelations.get(it);
if(!sr.arg1Word.mayType)
{
ret = recognizeSpecial(sr.arg1Word.baseForm);
if (ret != null)
{
sr.arg1Word.tmList = ret;
}
}
if(!sr.arg2Word.mayType)
{
ret = recognizeSpecial(sr.arg2Word.baseForm);
if (ret != null)
{
sr.arg2Word.tmList = ret;
}
}
}
}
public static ArrayList<TypeMapping> recognizeSpecial (String wordSpecial)
{
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>();
if (wordSpecial.toLowerCase().equals("who"))
{
for (Integer i : type_Person)
{
tmList.add(new TypeMapping(i,"Person",1));
}
//"who" can also means organization
for (Integer i : type_Organisation)
{
tmList.add(new TypeMapping(i,"Organization",1));
}
return tmList;
}
else if (wordSpecial.toLowerCase().equals("where"))
{
for (Integer i : type_Place)
{
tmList.add(new TypeMapping(i,"Place",1));
}
for (Integer i : type_Organisation)
{
tmList.add(new TypeMapping(i,"Organization",1));
}
return tmList;
}
//TODO: When ...
return null;
}
/*
* 1. Priority: mayEnt(Uppercase)>mayType>mayEnt
* 2. mayEnt=1: Constant
* 3. mayType=1:
* (1) Variable: a triple will be added during evaluation. | eg, Which [books] by Kerouac were published by Viking Press?
* (2) Constant: it modifies other words. | eg, Are tree frogs a type of [amphibian]?
* 4. Extended variable (a variable with embedded triples)
* */
public static void constantVariableRecognition(HashMap<Integer, SemanticRelation> semanticRelations, QueryLogger qlog)
{
Word[] words = qlog.s.words;
//NOTICE: modifiers(implicit relation) have not been considered.
for (Integer it : semanticRelations.keySet())
{
SemanticRelation sr = semanticRelations.get(it);
int arg1WordPos = sr.arg1Word.position - 1;
int arg2WordPos = sr.arg2Word.position - 1;
// extend variable recognition
recognizeExtendVariable(sr.arg1Word);
recognizeExtendVariable(sr.arg2Word);
// constant or variable
if(sr.arg1Word.mayExtendVariable)
{
//eg, ?canadian <birthPlace> <Canada> (both extendVariable & type)
if(sr.arg1Word.mayType)
sr.arg1Word.mayType = false;
if(sr.arg1Word.mayEnt)
{
//rule: [extendVariable & ent] + noun -> ent |eg, Canadian movies -> ent:Canada
if(arg1WordPos+1 < words.length && words[arg1WordPos+1].posTag.startsWith("N"))
{
sr.arg1Word.mayExtendVariable = false;
sr.isArg1Constant = true;
}
else
{
sr.arg1Word.mayEnt = false;
}
}
}
// type
else if(sr.arg1Word.mayType)
{
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries
if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of"))
&& !words[arg1WordPos-2].posTag.startsWith("V"))
{
sr.isArg1Constant = true;
double largerScore = 1000;
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
largerScore = sr.predicateMappings.get(0).score * 2;
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
sr.predicateMappings.add(0,nPredicate);
//constant type should be object
sr.preferredSubj = sr.arg2Word;
}
}
//ent: constant
else if(sr.arg1Word.mayEnt)
{
sr.isArg1Constant = true;
}
// constant or variable
if(sr.arg2Word.mayExtendVariable)
{
if(sr.arg2Word.mayType)
sr.arg2Word.mayType = false;
if(sr.arg2Word.mayEnt)
{
if(arg2WordPos+1 < words.length && words[arg2WordPos+1].posTag.startsWith("N"))
{
sr.arg2Word.mayExtendVariable = false;
sr.isArg2Constant = true;
}
else
{
sr.arg2Word.mayEnt = false;
}
}
}
// type
else if(sr.arg2Word.mayType)
{
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries
if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of"))
&& !words[arg2WordPos-2].posTag.startsWith("V") )
{
sr.isArg2Constant = true;
double largerScore = 1000;
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
largerScore = sr.predicateMappings.get(0).score * 2;
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
sr.predicateMappings.add(0,nPredicate);
sr.preferredSubj = sr.arg1Word;
}
//rule: Be ... a type?
if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a"))
{
sr.isArg2Constant = true;
double largerScore = 1000;
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
largerScore = sr.predicateMappings.get(0).score * 2;
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
sr.predicateMappings.add(0,nPredicate);
sr.preferredSubj = sr.arg1Word;
}
}
else if(sr.arg2Word.mayEnt)
{
sr.isArg2Constant = true;
}
if(sr.arg1Word != sr.preferredSubj)
sr.swapArg1Arg2();
}
}
public static void main (String[] args)
{
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
String type = "space mission";
try
{
TypeFragment.load();
Globals.stopWordsList = new StopWordsList();
TypeRecognition tr = new TypeRecognition();
while(true)
{
System.out.print("Input query type: ");
type = br.readLine();
tr.recognize(type);
}
} catch (Exception e) {
e.printStackTrace();
}
}
}

+ 690
- 0
src/qa/mapping/CompatibilityChecker.java View File

@@ -0,0 +1,690 @@
package qa.mapping;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

import qa.Globals;
import rdf.Sparql;
import rdf.Triple;
import fgmt.EntityFragment;
import fgmt.RelationFragment;
import fgmt.TypeFragment;
import fgmt.VariableFragment;

/**
* Notice: one CompatibilityChecker can only be used once to check a SPARQL.
* @author husen
*/
public class CompatibilityChecker {
static int EnumerateThreshold = 1000;
public EntityFragmentDict efd = null;
public HashMap<String, VariableFragment> variable_fragment = null;
public CompatibilityChecker(EntityFragmentDict efd) {
this.efd = efd;
variable_fragment = new HashMap<String, VariableFragment>();
}
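// Minimal usage sketch (assumes `spq` is a candidate SPARQL produced upstream); note that a
// fresh checker must be created per query, as stated above:
//   CompatibilityChecker cc = new CompatibilityChecker(new EntityFragmentDict());
//   boolean compatible = cc.isSparqlCompatible3(spq);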

// Run this check function after passing the "single triple check" (recoded)
// Recoded: variables are bound to suitable entities, depending on the in-memory index. Note the case where a variable is a literal
public boolean isSparqlCompatible3 (Sparql spq)
{
boolean[] isFixed = new boolean[spq.tripleList.size()]; // records whether a triple's compatibility is already fixed (no re-check needed)
for (int i = 0; i < spq.tripleList.size(); i ++) {
isFixed[i] = false;
}
//System.out.println("tripleList size="+spq.tripleList.size());
Iterator<Triple> it;
boolean shouldContinue = true;
// shouldContinue when a triple with variables updates a variable fragment; the updated fragment is then used to re-check the previous triples
while (shouldContinue)
{
shouldContinue = false;
it = spq.tripleList.iterator();
int t_cnt = 0;
while (it.hasNext()) {
Triple t = it.next();
switch (getTripleType(t)) {
case 1: // (1) E1, P, E2
if (!isFixed[t_cnt])
{
int ret = hs_check1_E1PE2(t);
if (ret == 0)
isFixed[t_cnt] = true;
else if (ret == 5)
return false;
}
break;
case 2: // (2) E, P, V
if(!isFixed[t_cnt])
{
int ret = hs_check2_EPV(t);
if (ret == 5)
return false;
else
{
isFixed[t_cnt] = true; // Now V's entity set is fixed or V is a literal; note E/P->V may not be unique, eg, xx's starring
if (ret == 1)
shouldContinue = true;
}
}
break;
case 3: // (3) E, <type1>, T
if (!isFixed[t_cnt])
{
int ret = check3_Etype1T(t);
if (ret == -2) return false;
if (ret == 0) isFixed[t_cnt] = true;
}
break;
case 4: // (4) V, P, E
if(!isFixed[t_cnt])
{
int ret = hs_check4_VPE(t);
if (ret == 5)
return false;
else
{
isFixed[t_cnt] = true; // Now V's entity set is fixed or V is a literal; note E/P->V may not be unique, eg, xx's starring
if (ret == 1)
shouldContinue = true;
}
}
break;
case 5: // (5) V1, P, V2 (The most important and time consuming)
if(!isFixed[t_cnt])
{
int ret = hs_check5_V1PV2(t);
if (ret == 5)
return false;
else
{
isFixed[t_cnt] = true; // Just set once and no re-check
if (ret == 1)
shouldContinue = true;
}
}
break;
case 6: // (6) V, <type1>, T
if (!isFixed[t_cnt])
{
int ret = hs_check6_Vtype1T(t);
if (ret == -2) return false;
else
{
isFixed[t_cnt] = true;
if (ret == 1)
shouldContinue = true;
}
}
break;
case 7:
// do nothing
break;
case 8:
default:
return false;
}
t_cnt ++;
}
}
return true;
}
/**
* Get Triple's category
* (1) E1, P, E2
* (2) E, P, V
* (3) E, <type>, T
* (4) V, P, E
* (5) V1, P, V2
* (6) V, <type>, T
* (7) E, <type>, V
* (8) error
*
* E: Entity
* P: Predicate (exclude <type>)
* V: Variable
* T: Type
*
* @param t
* @return
*/
public int getTripleType (Triple t) {
if (t.predicateID == Globals.pd.typePredicateID) {
boolean s = t.subject.startsWith("?");
boolean o = t.object.startsWith("?");
if (s && !o) return 6;
else if (o && !s) return 7;
else if (!s && !o) return 3;
else return 8;
}
else if (t.subject.startsWith("?")) {
if (t.object.startsWith("?")) return 5;
else return 4;
}
else {
if (t.object.startsWith("?")) return 2;
else return 1;
}
}
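// Examples: (<Berlin>, <country>, <Germany>) -> 1; (<Berlin>, <country>, ?x) -> 2;
// (?x, <type>, Person) -> 6; (?x, <country>, ?y) -> 5. (Entity names illustrative only;
// the classification depends solely on typePredicateID and the "?" variable prefix.)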

public int hs_check1_E1PE2(Triple t)
{
int pid = t.predicateID;
EntityFragment E1 = efd.getEntityFragmentByEid(t.subjId);
EntityFragment E2 = efd.getEntityFragmentByEid(t.objId);

// E2 is a one-hop neighbor of E1, connected by predicate "p"
if(E1.outEntMap.containsKey(E2.eId))
{
ArrayList<Integer> pList = E1.outEntMap.get(E2.eId);
if(pList.contains(pid))
return 0;
}
return 5;
}
public int hs_check2_EPV(Triple t)
{
int pid = t.predicateID;
EntityFragment E = efd.getEntityFragmentByEid(t.subjId);
VariableFragment V = variable_fragment.get(t.object);
// P ∈ E.outEdges
if (!E.outEdges.contains(pid)) {
return 5;
}

// Set V; note it may be a literal
if(V == null)
{
variable_fragment.put(t.object, new VariableFragment());
V = variable_fragment.get(t.object);
for(int vid: E.outEntMap.keySet())
{
if(E.outEntMap.get(vid).contains(pid))
{
V.candEntities.add(vid);
}
}
// E's outEdges contain p but no neighbor entity is reachable via p, so V may be a literal
if(V.candEntities.size() == 0)
{
V.mayLiteral = true;
return 0;
}
}
else
{
// accept if V is a literal, because fragments do not store literal information
if(V.mayLiteral)
return 0;
// Update V's bindings using E's current neighbors
HashSet<Integer> newCandEntities = new HashSet<Integer>();
if(V.candEntities.size() > 0 && V.candEntities.size() < E.outEntMap.size())
{
for(int vid: V.candEntities)
{
if(E.outEntMap.containsKey(vid) && E.outEntMap.get(vid).contains(pid))
{
newCandEntities.add(vid);
}
}
}
else
{
for(int vid: E.outEntMap.keySet())
{
if(E.outEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid)))
{
newCandEntities.add(vid);
}
}
}
V.candEntities = newCandEntities;
}
if(V.candEntities.size() > 0)
return 0;
else
return 5;
}
public int check3_Etype1T(Triple t) {
String[] T = t.object.split("\\|"); // ע��"|"��Ҫת��
EntityFragment E = efd.getEntityFragmentByEid(t.subjId);

String newTypeString = "";
boolean contained = false;

// check whether each type in T is proper for E
if (T.length == 0) return -2;
for (String s : T) {
contained = false;
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) {
if (E.types.contains(i)) {
if (!contained) {
contained = true;
newTypeString += s;
newTypeString += "|";
}
}
}
}
if (newTypeString.length() > 1) {
t.object = newTypeString.substring(0, newTypeString.length()-1);
return 0;
}
else return -2;
}
public int hs_check4_VPE(Triple t)
{
int pid = t.predicateID;
EntityFragment E = efd.getEntityFragmentByEid(t.objId);
VariableFragment V = variable_fragment.get(t.subject);
TypeFragment subjTf = SemanticItemMapping.getTypeFragmentByWord(t.getSubjectWord());
// P ∈ E.inEdges
if (!E.inEdges.contains(pid)) {
return 5;
}

// Set V; note V cannot be a literal because it is the subject here
if(V == null)
{
variable_fragment.put(t.subject, new VariableFragment());
V = variable_fragment.get(t.subject);
for(int vid: E.inEntMap.keySet())
{
if(E.inEntMap.get(vid).contains(pid) && (subjTf == null || subjTf.entSet.contains(vid)))
{
V.candEntities.add(vid);
}
}
// E's inEdges contain p but no neighbor entity is reachable via p; V is the subject and cannot be a literal, so the match fails
if(V.candEntities.size() == 0)
{
return 5;
}
}
else
{
// if V is literal, fail because subject cannot be literal
if(V.mayLiteral)
return 5;
// update V's bindings using E's current neighbors
HashSet<Integer> newCandEntities = new HashSet<Integer>();
if(V.candEntities.size() > 0 && V.candEntities.size() < E.inEntMap.size())
{
for(int vid: V.candEntities)
{
if(E.inEntMap.containsKey(vid) && E.inEntMap.get(vid).contains(pid))
{
newCandEntities.add(vid);
}
}
}
else
{
for(int vid: E.inEntMap.keySet())
{
if(E.inEntMap.get(vid).contains(pid) && (V.candEntities.size() == 0 || V.candEntities.contains(vid)))
{
newCandEntities.add(vid);
}
}
}
V.candEntities = newCandEntities;
}
if(V.candEntities.size() > 0)
return 0;
else
return 5;
}
public int check5_V1PV2(Triple t) {
ArrayList<Integer> pidList = new ArrayList<Integer>();
pidList.add(t.predicateID);
VariableFragment V1 = variable_fragment.get(t.subject);
VariableFragment V2 = variable_fragment.get(t.object);
// V1 & V2's types must equal the types of one fragment of P
Iterator<Integer> it_int = pidList.iterator();
ArrayList<HashSet<Integer>> newCandTypes1 = new ArrayList<HashSet<Integer>>();
ArrayList<HashSet<Integer>> newCandTypes2 = new ArrayList<HashSet<Integer>>();
while (it_int.hasNext()) {
Integer i = it_int.next();
ArrayList<RelationFragment> flist = RelationFragment.relFragments.get(i);
Iterator<RelationFragment> it_rln = flist.iterator();
while (it_rln.hasNext()) {
RelationFragment rf = it_rln.next();
if (V1 == null && V2 == null) {
newCandTypes1.add(rf.inTypes);
newCandTypes2.add(rf.outTypes);
}
else if (V1 == null && V2 != null) {
if (V2.containsAll(rf.outTypes)) {
newCandTypes1.add(rf.inTypes);
newCandTypes2.add(rf.outTypes);
}
}
else if (V2 == null && V1 != null) {
if (V1.containsAll(rf.inTypes)) {
newCandTypes1.add(rf.inTypes);
newCandTypes2.add(rf.outTypes);
}
}
else {
if (V1.containsAll(rf.inTypes) && V2.containsAll(rf.outTypes))
{
newCandTypes1.add(rf.inTypes);
newCandTypes2.add(rf.outTypes);
}
}
}
}
if (newCandTypes1.size() > 0 && newCandTypes2.size() > 0) {
if (V1 == null && V2 == null) {
variable_fragment.put(t.subject, new VariableFragment());
variable_fragment.get(t.subject).candTypes = newCandTypes1;
variable_fragment.put(t.object, new VariableFragment());
variable_fragment.get(t.object).candTypes = newCandTypes2;
return 1;
}
else if (V1 == null && V2 != null) {
variable_fragment.put(t.subject, new VariableFragment());
variable_fragment.get(t.subject).candTypes = newCandTypes1;
if (V2.candTypes.size() > newCandTypes2.size()) {
V2.candTypes = newCandTypes2;
return 1;
}
else return 0;
}
else if (V2 == null && V1 != null) {
variable_fragment.put(t.object, new VariableFragment());
variable_fragment.get(t.object).candTypes = newCandTypes2;

if (V1.candTypes.size() > newCandTypes1.size()) {
V1.candTypes = newCandTypes1;
return 1;
}
else return 0;
}
else {
if (V1.candTypes.size() > newCandTypes1.size() || V2.candTypes.size() > newCandTypes2.size()) {
V1.candTypes = newCandTypes1;
V2.candTypes = newCandTypes2;
return 1;
}
else return 0;
}
}
else return 5;
}
public int hs_check5_V1PV2(Triple t)
{
int pid = t.predicateID;
VariableFragment V1 = variable_fragment.get(t.subject);
VariableFragment V2 = variable_fragment.get(t.object);
if(V1 == null && V2 == null) // The WORST case: the relation fragment has no records of the two target entities and we cannot check without types, so this triple should be put at the end
{
return 0; // in fact it should return 1; we just expect the unchecked triples to provide candidates for V1,V2 so they can be checked in the next round
}
else if(V2 == null)
{
if(V1.mayLiteral)
return 5;
variable_fragment.put(t.object, new VariableFragment());
V2 = variable_fragment.get(t.object);
HashSet<Integer> newV1cands = new HashSet<Integer>();
int cnt = 0;
for(int v1id: V1.candEntities)
{
cnt++;
if(cnt > EnumerateThreshold)
break;
EntityFragment E = efd.getEntityFragmentByEid(v1id);
if(E != null && E.outEdges.contains(pid))
{
newV1cands.add(v1id);
for(int v2id: E.outEntMap.keySet())
{
if(E.outEntMap.get(v2id).contains(pid))
V2.candEntities.add(v2id);
}
}
}
V1.candEntities = newV1cands;
}
else if(V1 == null)
{
if(V2.mayLiteral)
return 0;
variable_fragment.put(t.subject, new VariableFragment());
V1 = variable_fragment.get(t.subject);
HashSet<Integer> newV2cands = new HashSet<Integer>();
int cnt = 0;
for(int v2id: V2.candEntities)
{
cnt++;
if(cnt > EnumerateThreshold)
break;
EntityFragment E = efd.getEntityFragmentByEid(v2id);
if(E != null && E.inEdges.contains(pid))
{
newV2cands.add(v2id);
for(int v1id: E.inEntMap.keySet())
{
if(E.inEntMap.get(v1id).contains(pid))
V1.candEntities.add(v1id);
}
}
}
V2.candEntities = newV2cands;
}
else
{
if(V1.mayLiteral)
return 5;
if(V2.mayLiteral)
return 0;
HashSet<Integer> newV1cands = new HashSet<Integer>();
HashSet<Integer> newV2cands = new HashSet<Integer>();
for(int v1id: V1.candEntities)
{
EntityFragment E1 = efd.getEntityFragmentByEid(v1id);
if(E1 != null && E1.outEdges.contains(pid))
newV1cands.add(v1id);
}
V1.candEntities = newV1cands;
for(int v2id: V2.candEntities)
{
EntityFragment E2 = efd.getEntityFragmentByEid(v2id);
if(E2 != null && E2.inEdges.contains(pid))
newV2cands.add(v2id);
}
V2.candEntities = newV2cands;
newV1cands = new HashSet<Integer>();
newV2cands = new HashSet<Integer>();
for(int v1id: V1.candEntities)
{
EntityFragment E1 = efd.getEntityFragmentByEid(v1id);
for(int v2id: V2.candEntities)
{
if(E1.outEntMap.containsKey(v2id) && E1.outEntMap.get(v2id).contains(pid))
{
newV1cands.add(v1id);
newV2cands.add(v2id);
}
}
}
V1.candEntities = newV1cands;
V2.candEntities = newV2cands;
}
if(V1.candEntities.size() == 0 || (V2.candEntities.size() == 0 && !RelationFragment.isLiteral(pid)))
return 5;
else
return 0;
}
public int check6_Vtype1T(Triple t) {
String[] T = t.object.split("\\|"); // notice "|" need "\\|"
VariableFragment V = variable_fragment.get(t.subject);

String newTypeString = "";
boolean contained = false;

// check whether each type in T is proper for V
if (T.length == 0) return -2;
ArrayList<HashSet<Integer>> newCandTypes = new ArrayList<HashSet<Integer>>();
for (String s : T)
{
contained = false;
//YAGO types are uncoded; just return because we have no index to check them
if(!TypeFragment.typeShortName2IdList.containsKey(s))
return 0;
for (Integer i : TypeFragment.typeShortName2IdList.get(s))
{
if (V == null) {
// constrain V by the user-given types; flag it because the type info may be incomplete
HashSet<Integer> set = new HashSet<Integer>();
set.add(i);
set.add(VariableFragment.magic_number);
newCandTypes.add(set);
if (!contained) {
contained = true;
newTypeString += s;
newTypeString += "|";
}
}
else if (V.contains(i)) {
if (!contained) {
contained = true;
newTypeString += s;
newTypeString += "|";
}
}
}
}
// check whether each fragment in V is proper for T
// if not, delete the fragment (that means we can narrow the scope)
ArrayList<HashSet<Integer>> deleteCandTypes = new ArrayList<HashSet<Integer>>();
if (V != null)
{
Iterator<HashSet<Integer>> it = V.candTypes.iterator();
while(it.hasNext()) {
HashSet<Integer> set = it.next();
boolean isCandTypeOkay = false;
//V got [constraint types] through other triples; at least one type must survive, otherwise delete the [constraint types]
for (String s : T)
{
for (Integer i : TypeFragment.typeShortName2IdList.get(s)) {
if (set.contains(i)) {
isCandTypeOkay = true;
break;
}
}
}
if (!isCandTypeOkay) {
deleteCandTypes.add(set);
}
}
V.candTypes.removeAll(deleteCandTypes);
}
if (V == null) {
variable_fragment.put(t.subject, new VariableFragment());
variable_fragment.get(t.subject).candTypes = newCandTypes;
}
if (newTypeString.length() > 1) {
t.object = newTypeString.substring(0, newTypeString.length()-1);
if (deleteCandTypes.size() > 0) {
return 1;
}
else {
return 0;
}
}
else return -2;
}

public int hs_check6_Vtype1T(Triple t)
{
String[] tList = t.object.split("\\|"); // ע��"|"��Ҫת��
VariableFragment V = variable_fragment.get(t.subject);

if (tList.length == 0) return -2;
// Simplify, only consider the first one
if(!TypeFragment.typeShortName2IdList.containsKey(tList[0]))
return 0;
int tid = TypeFragment.typeShortName2IdList.get(tList[0]).get(0);
TypeFragment T = TypeFragment.typeFragments.get(tid);
if(V == null)
{
variable_fragment.put(t.subject, new VariableFragment());
V = variable_fragment.get(t.subject);
V.candEntities = T.entSet;
}
else
{
if(V.mayLiteral) //literal cannot be subject
return -2;
HashSet<Integer> newVcands = new HashSet<Integer>();
for(int vid: V.candEntities)
{
EntityFragment E = efd.getEntityFragmentByEid(vid);
if(E.types.contains(tid))
newVcands.add(vid);
}
V.candEntities = newVcands;
}
if(V.candEntities.size() == 0)
return -2;
else
return 0;
}

public void swapTriple (Triple t) {
String temp = t.subject;
t.subject = t.object;
t.object = temp;
}
};

+ 164
- 0
src/qa/mapping/DBpediaLookup.java View File

@@ -0,0 +1,164 @@
package qa.mapping;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;

import lcn.EntityFragmentFields;
import log.QueryLogger;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;

import fgmt.EntityFragment;
import rdf.EntityMapping;

public class DBpediaLookup {
//There are two endpoints for the DBpediaLookup online service.
//public static final String baseURL = "http://en.wikipedia.org/w/api.php?action=opensearch&format=xml&limit=10&search=";
//public static final String baseURL = "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString=";
public static final String baseURL = "http://172.31.222.72:1234/api/search/KeywordSearch?MaxHits=5&QueryString=";
public HttpClient ctripHttpClient = null;
//public static final String begin = "<Text xml:space=\"preserve\">";
//public static final String begin = "<Result>\n <Label>";
public static final String begin = "<Result>\n <Label>";
public static final int begin_length = begin.length();
//public static final String end = "</Text>";
public static final String end = "</Label>";
public static final int end_length = end.length();
public static HashMap<String, String> entMentionDict = null; // TODO: build the mention2ent dictionary from redirect data & wikipedia click data; for now it is manual
public DBpediaLookup()
{
ctripHttpClient = new HttpClient();
ctripHttpClient.setTimeout(3000);
entMentionDict = new HashMap<String, String>();
entMentionDict.put("Prince_Charles", "Charles,_Prince_of_Wales");
}
public ArrayList<EntityMapping> getEntityMappings(String searchString, QueryLogger qlog)
{
ArrayList<String> slist = new ArrayList<String>();
if(entMentionDict.containsKey(searchString))
slist.add(entMentionDict.get(searchString));
else
slist = lookForEntityNames(searchString, qlog);
if (slist.size() == 0 && searchString.contains(". "))
slist.addAll(lookForEntityNames(searchString.replaceAll("\\. ", "."), qlog)); // escape "." (replaceAll takes a regex)
ArrayList<EntityMapping> emlist = new ArrayList<EntityMapping>();
// The string now uses "_" as the delimiter (original form)
String[] sa = searchString.split("_");
int UpperCnt = 0;
for(String str: sa)
{
if( (str.charAt(0)>='A'&&str.charAt(0)<='Z') || (str.charAt(0)>='0'&&str.charAt(0)<='9') )
UpperCnt ++;
}
System.out.print("DBpediaLookup find: " + slist + ", ");
int count = 40;
for (String s : slist)
{
//consider abbreviations only when all words are uppercase; drop candidates whose edit distance is too large
if(UpperCnt < sa.length && EntityFragment.calEditDistance(s, searchString.replace("_", ""))>searchString.length()/2)
continue;
int eid = -1;
s = s.replace(" ", "_");
if(EntityFragmentFields.entityName2Id.containsKey(s))
{
eid = EntityFragmentFields.entityName2Id.get(s);
emlist.add(new EntityMapping(eid, s, count));
count -= 2;
}
else
{
System.out.print("Drop "+s+" because it not in Entity Dictionary. ");
}
}
System.out.println("DBpediaLookup select: " + emlist);
return emlist;
}
public ArrayList<String> lookForEntityNames (String searchString, QueryLogger qlog) {
// URL transition: " " -> %20
GetMethod getMethod = new GetMethod((baseURL+searchString).replaceAll(" ", "%20"));
ArrayList<String> ret = new ArrayList<String>();
int statusCode;
try {
statusCode = ctripHttpClient.executeMethod(getMethod);
} catch (HttpException e) {
e.printStackTrace();
return ret;
} catch (IOException e) {
e.printStackTrace();
return ret;
}
if (statusCode != 200) return ret; // return the empty list; returning null would NPE in getEntityMappings
String response = getMethod.getResponseBodyAsString();
if (qlog != null && qlog.MODE_debug) {
System.out.println("searchString=" + searchString);
System.out.println("statusCode=" + statusCode);
System.out.println("response=" + getMethod.getResponseBodyAsString());
}
getMethod.releaseConnection();
//System.out.println(response);
if (response == null || response.isEmpty())
return ret;
int idx1 = response.indexOf(begin);
while (idx1 != -1) {
int idx2 = response.indexOf(end, idx1+begin_length);
String ss = response.substring(idx1+begin_length, idx2);
ret.add(ss);
//System.out.println(ss);
idx1 = response.indexOf(begin, idx2 + end_length);
}

return ret;
}
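// Parsing sketch: for a (hypothetical) response fragment
//   <Result>\n <Label>Berlin</Label>...<Result>\n <Label>Berlin_Wall</Label>...
// the substring scan above yields ["Berlin", "Berlin_Wall"]; no XML parser is involved.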
public static void main(String argv[]){
DBpediaLookup dbplook = new DBpediaLookup();
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
try {
while (true) {
System.out.println("Test DBpediaLookup.");
System.out.print("Please input the search string: ");
String searchString = br.readLine();
try {
long t1 = System.currentTimeMillis();
ArrayList<String> res = dbplook.lookForEntityNames(searchString, null);
long t2 = System.currentTimeMillis();
System.out.println(res);
System.out.println("time=" + (t2-t1) + "ms");
} catch (Exception e) {
e.printStackTrace();
}
}
} catch (IOException e) {
e.printStackTrace();
}

return;
}
}

+ 44
- 0
src/qa/mapping/EntityFragmentDict.java View File

@@ -0,0 +1,44 @@
package qa.mapping;

import java.util.HashMap;

//import lcn.EntityFragmentFields;
//import qa.Globals;
import fgmt.EntityFragment;

public class EntityFragmentDict {
//public HashMap<String, EntityFragment> entityFragmentDictionary = new HashMap<String, EntityFragment>();
public HashMap<Integer, EntityFragment> entityFragmentDictionary = new HashMap<Integer, EntityFragment>();
public EntityFragment getEntityFragmentByEid (Integer eid)
{
if (!entityFragmentDictionary.containsKey(eid))
{
entityFragmentDictionary.put(eid, EntityFragment.getEntityFragmentByEntityId(eid));
}
return entityFragmentDictionary.get(eid);

}
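// Usage sketch: fragments are loaded lazily and memoized, so repeated lookups of the same
// eid within one mapping pass hit the in-memory map instead of the index, eg:
//   EntityFragmentDict efd = new EntityFragmentDict();
//   EntityFragment ef = efd.getEntityFragmentByEid(eid); // first call loads, later calls reuse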
/*
* Old version, search by name
* */
// public EntityFragment getEntityFragmentByName (String name) {
// if (name.startsWith("?")) {
// return null;
// }
// if (!entityFragmentDictionary.containsKey(name)) {
// String fgmt = EntityFragment.getEntityFgmtStringByName(name);
// if (fgmt != null)
// {
// int eid = EntityFragmentFields.entityName2Id.get(name);
// entityFragmentDictionary.put(name, new EntityFragment(eid, fgmt));
// }
// else {
// entityFragmentDictionary.put(name, null);
// }
// }
// return entityFragmentDictionary.get(name);
//
// }
}

+ 811
- 0
src/qa/mapping/SemanticItemMapping.java View File

@@ -0,0 +1,811 @@
package qa.mapping;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;

import nlp.ds.Word;
import nlp.ds.Sentence.SentenceType;
import fgmt.EntityFragment;
import fgmt.RelationFragment;
import fgmt.TypeFragment;
import log.QueryLogger;
import qa.Globals;
import rdf.EntityMapping;
import rdf.PredicateMapping;
import rdf.SemanticRelation;
import rdf.Sparql;
import rdf.Triple;
import rdf.TypeMapping;

public class SemanticItemMapping {
public HashMap<Word, ArrayList<EntityMapping>> entityDictionary = new HashMap<Word, ArrayList<EntityMapping>>();
public static int k = 10; // unused now
public static int t = 10; // Number of candidates enumerated per node/edge; overall complexity O(t^n).
ArrayList<Sparql> rankedSparqls = new ArrayList<Sparql>();
HashSet<String> checkedSparqlStrs = new HashSet<String>();
public ArrayList<ArrayList<EntityMapping>> entityPhrasesList = new ArrayList<ArrayList<EntityMapping>>();
public ArrayList<Word> entityWordList = new ArrayList<Word>();
public HashMap<Integer, EntityMapping> currentEntityMappings = new HashMap<Integer, EntityMapping>();
public ArrayList<ArrayList<PredicateMapping>> predicatePhraseList = new ArrayList<ArrayList<PredicateMapping>>();
public ArrayList<SemanticRelation> predicateSrList = new ArrayList<SemanticRelation>();
public HashMap<Integer, PredicateMapping> currentPredicateMappings = new HashMap<Integer, PredicateMapping>();
public HashMap<Integer, SemanticRelation> semanticRelations = null;
public QueryLogger qlog = null;
public EntityFragmentDict efd = new EntityFragmentDict();
public boolean isAnswerFound = false;
public int tripleCheckCallCnt = 0;
public int sparqlCheckCallCnt = 0;
public int sparqlCheckId = 0;
SemanticRelation firstFalseSr = null;
long tripleCheckTime = 0;
long sparqlCheckTime = 0;
/*
* A best-first, top-down method: enumerate all possible query graphs and sort them.
* Notice, we use fragment checking to simulate graph matching and generate the TOP-k SPARQL queries, which can then be executed via GStore or Virtuoso.
* */
public void process(QueryLogger qlog, HashMap<Integer, SemanticRelation> semRltn)
{
semanticRelations = semRltn;
this.qlog = qlog;
long t1;
t = 10; // Notice, t is adjustable.

entityPhrasesList.clear();
entityWordList.clear();
currentEntityMappings.clear();
predicatePhraseList.clear();
predicateSrList.clear();
currentPredicateMappings.clear();
// 1. collect info of constant nodes (entities)
Iterator<Map.Entry<Integer, SemanticRelation>> it = semanticRelations.entrySet().iterator();
while(it.hasNext())
{
Map.Entry<Integer, SemanticRelation> entry = it.next();
SemanticRelation sr = entry.getValue();
//We currently only handle constants that are entities or types. TODO: consider literals.
if((sr.isArg1Constant && !sr.arg1Word.mayType && !sr.arg1Word.mayEnt) || (sr.isArg2Constant && !sr.arg2Word.mayType && !sr.arg2Word.mayEnt))
{
it.remove();
continue;
}
//Type constants are handled in the scoringAndRanking function.
if(sr.isArg1Constant && sr.arg1Word.mayEnt)
{
if(!entityDictionary.containsKey(sr.arg1Word))
entityDictionary.put(sr.arg1Word, sr.arg1Word.emList);
entityPhrasesList.add(sr.arg1Word.emList);
entityWordList.add(sr.arg1Word);
}
if(sr.isArg2Constant && !sr.arg2Word.mayType)
{
if (!entityDictionary.containsKey(sr.arg2Word))
entityDictionary.put(sr.arg2Word, sr.arg2Word.emList);
entityPhrasesList.add(sr.arg2Word.emList);
entityWordList.add(sr.arg2Word);
}
}
// 2. collect info of edges(relations).
for (Integer key : semanticRelations.keySet())
{
SemanticRelation sr = semanticRelations.get(key);
predicatePhraseList.add(sr.predicateMappings);
predicateSrList.add(sr);
// Reduce t when structure enumeration is needed.
if(Globals.evaluationMethod > 1 && !sr.isSteadyEdge)
t = 5;
}
// 3. top-k join
t1 = System.currentTimeMillis();
if(semanticRelations.size()>0)
topkJoin(semanticRelations);
else
System.out.println("No Valid SemanticRelations.");
qlog.timeTable.put("TopkJoin", (int)(System.currentTimeMillis()-t1));
qlog.timeTable.put("TripleCheck", (int)tripleCheckTime);
qlog.timeTable.put("SparqlCheck", (int)sparqlCheckTime);

Collections.sort(rankedSparqls);
// Notice, use addAll because we may have more than one node recognition decision.
qlog.rankedSparqls.addAll(rankedSparqls);
qlog.entityDictionary = entityDictionary;
System.out.println("Check query graph count: " + tripleCheckCallCnt + "\nPass single check: " + sparqlCheckCallCnt + "\nPass final check: " + rankedSparqls.size());
System.out.println("TopkJoin time=" + qlog.timeTable.get("TopkJoin"));
}
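// Illustrative call sequence (a sketch; assumes qlog and the semantic relations
// were produced by the upstream parsing and extraction stages):
// SemanticItemMapping sim = new SemanticItemMapping();
// sim.process(qlog, extractedSemanticRelations); // hypothetical variable name
// // qlog.rankedSparqls now also holds these candidate queries, best first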

public void topkJoin (HashMap<Integer, SemanticRelation> semanticRelations)
{
dfs_entityName(0);
}
// Each recursion level fixes the entity candidate for one word
public void dfs_entityName (int level_i)
{
// All entities ready.
if (level_i == entityPhrasesList.size())
{
dfs_predicate(0);
return;
}
ArrayList<EntityMapping> list = entityPhrasesList.get(level_i);
Word w = entityWordList.get(level_i);
int tcount = 0;
for(EntityMapping em : list)
{
if (tcount == t || isAnswerFound) break;
currentEntityMappings.put(w.hashCode(), em);
dfs_entityName(level_i+1);
currentEntityMappings.remove(w.hashCode());
tcount ++;
}
}
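// Size estimate: with 2 entity words and t = 10, dfs_entityName explores at most
// 10 * 10 = 100 entity combinations before each predicate enumeration, matching
// the O(t^n) bound noted above.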
public void dfs_predicate(int level_i)
{
// All entities & predicates are ready; start generating SPARQL.
if (level_i == predicatePhraseList.size())
{
scoringAndRanking();
return;
}
ArrayList<PredicateMapping> list = predicatePhraseList.get(level_i);
SemanticRelation sr = predicateSrList.get(level_i);
if (sr.dependOnSemanticRelation != null)
{
dfs_predicate(level_i+1);
}
else
{
int tcount=0;
for (PredicateMapping pm : list)
{
if (tcount==t || isAnswerFound) break;
currentPredicateMappings.put(sr.hashCode(), pm);
dfs_predicate(level_i+1);
currentPredicateMappings.remove(sr.hashCode());
tcount++;
// Pruning (if we do not change the predicate of firstFalseSr, it will still be false, so just return)
if(firstFalseSr != null)
{
if(firstFalseSr != sr) return;
else firstFalseSr = null;
}
}
// "null" means we drop this edge, this is how we enumerate structure.
if(Globals.evaluationMethod == 2 && sr.isSteadyEdge == false)
{
currentPredicateMappings.put(sr.hashCode(), null);
dfs_predicate(level_i+1);
currentPredicateMappings.remove(sr.hashCode());
tcount++;
}
}
}

/*
* Run this function when all nodes/edges have been assigned values (through currentEntityMappings / currentPredicateMappings).
* Generate SPARQL according to the current entity and relation mappings, then perform fragment checking.
* Notice: add embedded type information:
* eg, ?who <height> ?how --add--> ?who <type1> <Person> | ?book <author> <Tom> --add--> ?book <type1> <Book>
* Notice: add constant type information:
* eg, ask: <YaoMing> <type1> <BasketballPlayer>
* Notice: add embedded triple information:
* eg, ?Canadians <residence> <United_States> --add--> ?Canadians <birthPlace> <Canada>
* */
public void scoringAndRanking()
{
firstFalseSr = null;
Sparql sparql = new Sparql(semanticRelations);

// A simple way to judge connectivity (may be incorrect when the number of nodes >= 6)
//TODO: a standard method to judge CONNECTIVITY
HashMap<Integer, Integer> count = new HashMap<Integer, Integer>();
int edgeCnt = 0;
for (Integer key : semanticRelations.keySet())
{
SemanticRelation sr = semanticRelations.get(key);
if(currentPredicateMappings.get(sr.hashCode()) == null)
continue;
edgeCnt++;
int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode();
if(!count.containsKey(v1))
count.put(v1, 1);
else
count.put(v1, count.get(v1)+1);
if(!count.containsKey(v2))
count.put(v2, 1);
else
count.put(v2, count.get(v2)+1);
}
if(count.size() < qlog.semanticUnitList.size())
return;
if(edgeCnt == 0)
return;
if(edgeCnt > 1)
{
for (Integer key : semanticRelations.keySet())
{
SemanticRelation sr = semanticRelations.get(key);
if(currentPredicateMappings.get(sr.hashCode()) == null)
continue;
int v1 = sr.arg1Word.hashCode(), v2 = sr.arg2Word.hashCode();
if(count.get(v1) == 1 && count.get(v2) == 1)
return;
}
}
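// Worked example of the degree vote above (hypothetical variables): triples
// ?a-p1-?b and ?b-p2-?c give degrees a:1, b:2, c:1; no edge has degree 1 at both
// endpoints, so the graph is treated as connected. A detached edge ?d-p3-?e
// (degree 1 at both ends) would be rejected here when edgeCnt > 1.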
// Now the graph is connected, start to generate SPARQL.
HashSet<String> typeSetFlag = new HashSet<String>();
for (Integer key : semanticRelations.keySet())
{
SemanticRelation sr = semanticRelations.get(key);
String sub, obj;
int subjId = -1, objId = -1;
int pid;
double score = 1;
boolean isSubjObjOrderSameWithSemRltn = true;
// argument1
if(sr.isArg1Constant && (sr.arg1Word.mayEnt || sr.arg1Word.mayType) ) // Constant
{
// For subject, entity has higher priority.
if(sr.arg1Word.mayEnt)
{
EntityMapping em = currentEntityMappings.get(sr.arg1Word.hashCode());
subjId = em.entityID;
sub = em.entityName;
score *= em.score;
}
else
{
TypeMapping tm = sr.arg1Word.tmList.get(0);
subjId = Triple.TYPE_ROLE_ID;
sub = tm.typeName;
score *= (tm.score*100); // Rescale: type scores lie in [0,1], entity scores in [0,100].
}
}
else // Variable
{
subjId = Triple.VAR_ROLE_ID;
sub = "?" + sr.arg1Word.originalForm;
}
// Embedded Type info of argument1 (variable type) | eg, ?book <type> <Book>
// Notice, mayType & mayExtendVariable are mutually exclusive. (see constantVariableRecognition)
// Notice, we do NOT consider types of [?who,?where...] now.
Triple subt = null;
if (!sr.isArg1Constant && sr.arg1Word.mayType && sr.arg1Word.tmList != null && sr.arg1Word.tmList.size() > 0 && !typeSetFlag.contains(sub))
{
StringBuilder type = new StringBuilder("");
for (TypeMapping tm: sr.arg1Word.tmList)
{
Integer tt = tm.typeID;
if(tt != -1)
type.append(TypeFragment.typeId2ShortName.get(tt));
else
type.append(tm.typeName);
type.append('|');
}
String ttt = type.substring(0, type.length()-1);
subt = new Triple(subjId, sub, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10);
subt.typeSubjectWord = sr.arg1Word;
if(sr.arg1Word.tmList.get(0).prefferdRelation == -1)
subt = null;
}
// predicate
SemanticRelation dep = sr.dependOnSemanticRelation;
PredicateMapping pm = null;
if (dep == null)
pm = currentPredicateMappings.get(sr.hashCode());
else
pm = currentPredicateMappings.get(dep.hashCode());
if(pm == null)
continue;
pid = pm.pid;
score *= pm.score;
// argument2
if(sr.isArg2Constant && (sr.arg2Word.mayEnt || sr.arg2Word.mayType) )
{
if(!sr.arg2Word.mayType)
{
EntityMapping em = currentEntityMappings.get(sr.arg2Word.hashCode());
objId = em.entityID;
obj = em.entityName;
score *= em.score;
}
else
{
TypeMapping tm = sr.arg2Word.tmList.get(0);
objId = Triple.TYPE_ROLE_ID;
obj = tm.typeName;
score *= (tm.score*100);
}
}
else
{
objId = Triple.VAR_ROLE_ID;
obj = "?" + sr.arg2Word.getFullEntityName();
}
// Type info of argument2
Triple objt = null;
if (sr.arg2Word.tmList != null && sr.arg2Word.tmList.size() > 0 && !typeSetFlag.contains(obj) && !sr.isArg2Constant)
{
StringBuilder type = new StringBuilder("");
for (TypeMapping tm : sr.arg2Word.tmList)
{
Integer tt = tm.typeID;
if(tt != -1)
type.append(TypeFragment.typeId2ShortName.get(tt));
else
type.append(tm.typeName);
type.append('|');
}
String ttt = type.substring(0, type.length()-1);
objt = new Triple(objId, obj, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, ttt, null, 10);
objt.typeSubjectWord = sr.arg2Word;
if(sr.arg2Word.tmList.get(0).prefferdRelation == -1)
objt = null;
}
// Prune.
if(objId == Triple.TYPE_ROLE_ID && pid != Globals.pd.typePredicateID)
return;
// Consider the ordering of LITERAL relations | at least one argument has TYPE info
if (RelationFragment.isLiteral(pid) && (subt != null || objt != null))
{
if (sub.startsWith("?") && obj.startsWith("?")) // two variables
{
// both variables could serve as the literal object
if (subt != null) {
subt.object += ("|" + "literal_HRZ");
}
if (objt != null) {
objt.object += ("|" + "literal_HRZ");
}
if (subt==null && objt!=null)
{
// if the object has a type but the subject does not, swapping subj/obj is more plausible because literals generally have no type [though some may have a yago:type]
String temp = sub;
int tmpId = subjId;
sub = obj;
subjId = objId;
obj = temp;
objId = tmpId;
isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn;
}
}
else if (sub.startsWith("?") && !obj.startsWith("?")) {
// need to change subj/obj order
if (subt != null) {
subt.object += ("|" + "literal_HRZ");
}
String temp = sub;
int tmpId = subjId;
sub = obj;
subjId = objId;
obj = temp;
objId = tmpId;
isSubjObjOrderSameWithSemRltn=!isSubjObjOrderSameWithSemRltn;
//System.out.println("here: "+sub+obj);
}
else if (obj.startsWith("?") && !sub.startsWith("?")) {
if (objt != null) {
objt.object += ("|" + "literal_HRZ");
}
}
}
Triple t = new Triple(subjId, sub, pid, objId, obj, sr, score,isSubjObjOrderSameWithSemRltn);
//System.out.println("triple: "+t+" "+isTripleCompatibleCanSwap(t));
sparql.addTriple(t);
// the score of the subject/object's type should correlate with the score of the triple itself
if (subt != null)
{
subt.score += t.score*0.2;
sparql.addTriple(subt);
typeSetFlag.add(subt.subject); // careful NOT to use sub here: the subj/obj order may have been swapped
}
if (objt != null)
{
objt.score += t.score*0.2;
sparql.addTriple(objt);
typeSetFlag.add(objt.subject);
}
// add the argument's embedded triple, eg, ?canadian <birthPlace> <Canada>
if(!sr.isArg1Constant && sr.arg1Word.mayExtendVariable && sr.arg1Word.embbededTriple != null)
{
sparql.addTriple(sr.arg1Word.embbededTriple);
}
if(!sr.isArg2Constant && sr.arg2Word.mayExtendVariable && sr.arg2Word.embbededTriple != null)
{
sparql.addTriple(sr.arg2Word.embbededTriple);
}
sparql.adjustTriplesOrder();
}
// deduplicate
sparql.deduplicate();
if(checkedSparqlStrs.contains(sparql.toStringForGStore2()))
return;
checkedSparqlStrs.add(sparql.toStringForGStore2());
if (!qlog.MODE_fragment) {
// Method 1: do NOT check compatibility
rankedSparqls.add(sparql);
isAnswerFound = true;
}
else {
// Method 2: check compatibility by FRAGMENT (offline index)
//1. single-triple check (a quick prune); subject and object may be swapped. Try to adjust to the best order.
tripleCheckCallCnt++;
long t1 = System.currentTimeMillis();
for (Triple t : sparql.tripleList)
if(t.predicateID!=Globals.pd.typePredicateID && !isTripleCompatibleCanSwap(t))
{
firstFalseSr = t.semRltn;
return;
}
tripleCheckTime += (System.currentTimeMillis()-t1);
//2. SPARQL check (considers the interaction between all triples); subject and object may be swapped.
t1 = System.currentTimeMillis();
sparqlCheckCallCnt++;
enumerateSubjObjOrders(sparql, new Sparql(sparql.semanticRelations), 0);
sparqlCheckTime += (System.currentTimeMillis()-t1);
}
}
/*
* Notice:
* typeId == -1 means the type has no data fragment
* */
public static TypeFragment getTypeFragmentByWord(Word word)
{
TypeFragment tf = null;
if(word!=null && word.tmList!=null && word.tmList.size()>0)
{
int typeId = word.tmList.get(0).typeID;
if(typeId != -1)
tf = TypeFragment.typeFragments.get(typeId);
}
return tf;
}
/*
* (Only a PRE-CHECK [single-triple check] happens in this function; the final check is in enumerateSubjObjOrders, which utilizes more of the index.)
* Notice: triples with predicate = type must not enter this function.
* */
public boolean isTripleCompatibleCanSwap (Triple t) {
if (qlog.s.sentenceType==SentenceType.GeneralQuestion)
{
if (fragmentCompatible2(t.subjId, t.predicateID, t.objId) >
fragmentCompatible2(t.objId, t.predicateID, t.subjId))
t.swapSubjObjOrder();
if (fragmentCompatible(t.subjId, t.predicateID, t.objId))
return true;
return false;
}
else
{
//var & var
if(t.subject.startsWith("?") && t.object.startsWith("?"))
{
Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord();
TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord);
//based on whether the two variables' type fragments contain the predicate among their in/out edges, decide whether the order needs to be changed
//decided by a simple vote
int nowOrderCnt = 0, reverseOrderCnt = 0;
if(subjTf == null || subjTf.outEdges.contains(t.predicateID))
nowOrderCnt ++;
if(objTf == null || objTf.inEdges.contains(t.predicateID))
nowOrderCnt ++;
if(subjTf == null || subjTf.inEdges.contains(t.predicateID))
reverseOrderCnt ++;
if(objTf == null || objTf.outEdges.contains(t.predicateID))
reverseOrderCnt ++;
if(nowOrderCnt<2 && reverseOrderCnt<2)
return false;
else if(nowOrderCnt > reverseOrderCnt)
{
// do nothing
}
else if(reverseOrderCnt > nowOrderCnt)
{
t.swapSubjObjOrder();
}
else //both the current and the reversed order passed type-fragment checking, so we must SELECT one
{
//rule1: ?inventor <occupation> ?occupation || ... <name> ?name -> the string more similar to the predicate is put later (as the object)
String p = Globals.pd.getPredicateById(t.predicateID);
int ed1 = EntityFragment.calEditDistance(subjWord.baseForm, p);
int ed2 = EntityFragment.calEditDistance(objWord.baseForm, p);
if(ed1 < ed2)
{
t.swapSubjObjOrder();
}
}
return true;
}
// ent & ent || var & ent
else
{
boolean flag = false;
if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) {
flag = true;
}
else if (fragmentCompatible(t.objId, t.predicateID, t.subjId)) {
t.swapSubjObjOrder();
flag = true;
}
// Var & Ent | ?city <type1> <City> & <Chile_Route_68> <country> ?city : <country> is invalid for City | Notice: the data often dirty and can not prune correctly.
if(flag == true && (t.subject.startsWith("?") || t.object.startsWith("?")))
{
Word subjWord = t.getSubjectWord(), objWord = t.getObjectWord();
TypeFragment subjTf = getTypeFragmentByWord(subjWord), objTf = getTypeFragmentByWord(objWord);
if(subjTf != null)
{
if(subjTf.outEdges.contains(t.predicateID))
flag = true;
else if(subjTf.inEdges.contains(t.predicateID))
{
t.swapSubjObjOrder();
flag = true;
}
else
flag = false;
}
else if(objTf != null)
{
if(objTf.inEdges.contains(t.predicateID))
flag = true;
else if(objTf.outEdges.contains(t.predicateID))
{
t.swapSubjObjOrder();
flag = true;
}
else
flag = false;
}
}
return flag;
}
}
}
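// Illustrative swap decision (hypothetical predicate and type data): for
// ?city <country> <Chile>, the order is kept if the type fragment of ?city lists
// <country> among its out-edges; if <country> only appears among its in-edges,
// subject and object are swapped before the final SPARQL check.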
public boolean isTripleCompatibleNotSwap (Triple t) {
if (t.predicateID == Globals.pd.typePredicateID) {
return true;
}
else if (fragmentCompatible(t.subjId, t.predicateID, t.objId)) {
return true;
}
else {
return false;
}
}

public boolean fragmentCompatible (int id1, int pid, int id2) {
EntityFragment ef1 = efd.getEntityFragmentByEid(id1);
EntityFragment ef2 = efd.getEntityFragmentByEid(id2);
// a valid entity MUST have a fragment
if (id1!=Triple.TYPE_ROLE_ID && id1!=Triple.VAR_ROLE_ID && ef1 == null) return false;
if (id2!=Triple.TYPE_ROLE_ID && id2!=Triple.VAR_ROLE_ID && ef2 == null) return false;
boolean ef1_constant = (ef1 != null);
boolean ef2_constant = (ef2 != null);
int entityCnt=0,compatibleCnt=0;
if(ef1_constant) {
entityCnt++;
if (ef1.outEdges.contains(pid))
compatibleCnt++;
// else // <e1,p> Ϊ false pair
// {
// falseEntPres.add(new Pair(id1,pid));
// }
}
if (ef2_constant) {
entityCnt++;
if (ef2.inEdges.contains(pid))
compatibleCnt++;
// else // <p,e2> Ϊfalse pair
// {
// falsePreEnts.add(new Pair(pid,id2));
// }
}
// for SELECT sparql, an EXACT match between the predicate and both subject and object is required; ASK sparql can be relaxed
if (qlog.s.sentenceType==SentenceType.GeneralQuestion)
return entityCnt-compatibleCnt<=1;
else
return entityCnt==compatibleCnt;
}
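// Worked example (hypothetical fragments): for <Yao_Ming> <birthPlace> ?city,
// entityCnt = 1, and compatibleCnt = 1 only if <birthPlace> occurs among
// Yao_Ming's out-edges; a SELECT question then requires entityCnt == compatibleCnt,
// while an ASK question tolerates one mismatch.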
public int fragmentCompatible2 (int id1, int pid, int id2) {
EntityFragment ef1 = efd.getEntityFragmentByEid(id1);
EntityFragment ef2 = efd.getEntityFragmentByEid(id2);

int entityCnt=0,compatibleCnt=0;
if(id1 != Triple.VAR_ROLE_ID && id1 != Triple.TYPE_ROLE_ID) {
entityCnt++;
if (ef1!=null && ef1.outEdges.contains(pid))
compatibleCnt++;
}
if (id2 != Triple.VAR_ROLE_ID && id2 != Triple.TYPE_ROLE_ID) {
entityCnt++;
if (ef2!=null && ef2.inEdges.contains(pid))
compatibleCnt++;
}

return entityCnt-compatibleCnt;
}
public boolean checkConstantConsistency (Sparql spql) {
HashMap<String, String> constants = new HashMap<String, String>();
for (Triple t : spql.tripleList) {
if (!t.subject.startsWith("?")) {
String e = t.getSubjectWord().getFullEntityName();
if (!constants.containsKey(e))
constants.put(e, t.subject);
else {
if (!constants.get(e).equals(t.subject))
return false;
}
}
if (!t.object.startsWith("?")) {
String e = t.getObjectWord().getFullEntityName();
if (!constants.containsKey(e))
constants.put(e, t.object);
else {
if (!constants.get(e).equals(t.object))
return false;
}
}
}
return true;
}
public void reviseScoreByTripleOrders(Sparql spq)
{
Triple shouldDel = null;
for(Triple triple: spq.tripleList)
{
// eg, ?who <president> <United_States_Navy> needs to be punished (or dropped).
if(triple.subject.toLowerCase().equals("?who"))
{
String rel = Globals.pd.id_2_predicate.get(triple.predicateID);
if(rel.equals("president") || rel.equals("starring") || rel.equals("producer"))
{
spq.score -= triple.score;
triple.score /= 10;
spq.score += triple.score;
if(triple.semRltn!=null && triple.semRltn.isSteadyEdge == false)
shouldDel = triple;
}
}
}
if(shouldDel != null)
spq.delTriple(shouldDel);
}
// enumerate subject/object orders, with fragment checking
// Also adjusts triple scores for general (ASK) questions
public boolean enumerateSubjObjOrders (Sparql originalSpq, Sparql currentSpq, int level)
{
if (level == originalSpq.tripleList.size())
{
if(currentSpq.tripleList.size() == 0)
return false;
CompatibilityChecker cc = new CompatibilityChecker(efd);
if (qlog.s.sentenceType==SentenceType.GeneralQuestion) // ASK sparql: always kept; the fragment check only rewards its score
{
if(cc.isSparqlCompatible3(currentSpq)) //reward score for "TRUE"
{
for(Triple triple: currentSpq.tripleList)
triple.addScore(triple.getScore());
}
rankedSparqls.add(currentSpq.copy());
return true;
}
try
{
sparqlCheckId++;
if (cc.isSparqlCompatible3(currentSpq))
{
//eg, ?who <president> <United_States_Navy>
//When the query graph contains a cycle, we just prune this edge
Sparql sparql = currentSpq.copy();
reviseScoreByTripleOrders(sparql);
if(!rankedSparqls.contains(sparql))
rankedSparqls.add(sparql);
return true;
}
}
catch (Exception e) {
System.out.println("[CompatibilityChecker ERROR]"+currentSpq);
e.printStackTrace();
}
return false;
}
Triple cur_t = originalSpq.tripleList.get(level);
// first try default order
currentSpq.addTriple(cur_t);
boolean flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1);
currentSpq.removeLastTriple();
// deprecated: do not change triple order for [literal relation]
// if (RelationFragment.isLiteral(cur_t.predicateID)) return false;
// Enumerate keeping/dropping the type info
if (cur_t.predicateID == Globals.pd.typePredicateID)
{
flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1);
return flag;
}
else
{
// single triple check after swap
Triple swapped_t = cur_t.copySwap();
swapped_t.score = swapped_t.score*0.8;
if (isTripleCompatibleNotSwap(swapped_t))
{
currentSpq.addTriple(swapped_t);
flag = enumerateSubjObjOrders(originalSpq, currentSpq, level+1);
currentSpq.removeLastTriple();
}
return flag;
}
}
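// Size note: every non-type triple may also be tried in swapped order (scored at
// 0.8 of the original), so a query with n such triples yields up to 2^n candidate
// orderings; type triples are instead tried with and without the type information.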

}

+ 1201
- 0
src/qa/parsing/BuildQueryGraph.java
File diff suppressed because it is too large
View File


+ 208
- 0
src/qa/parsing/QuestionParsing.java View File

@@ -0,0 +1,208 @@
package qa.parsing;

import org.maltparser.core.exception.MaltChainedException;

import log.QueryLogger;
import nlp.ds.DependencyTree;
import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;
import nlp.ds.Sentence.SentenceType;
import qa.Globals;
import rdf.Sparql;
import rdf.Triple;

public class QuestionParsing {
public void process(QueryLogger qlog) {
getDependenciesAndNER(qlog);
recognizeSentenceType(qlog);
}
public void getDependenciesAndNER (QueryLogger qlog) {
long t1 = System.currentTimeMillis();
try {
qlog.s.dependencyTreeStanford = new DependencyTree(qlog.s, Globals.stanfordParser);
}catch(Exception e){
e.printStackTrace();
}
long t2 = System.currentTimeMillis();
try{
qlog.s.dependencyTreeMalt = new DependencyTree(qlog.s, Globals.maltParser);
}catch(Exception e){
//if errors occur, abandon malt tree
qlog.s.dependencyTreeMalt = qlog.s.dependencyTreeStanford;
System.err.println("MALT parser error! Use stanford parser instead.");
}
try {
long t3 = System.currentTimeMillis();
Globals.nerRecognizer.recognize(qlog.s);
long t4 = System.currentTimeMillis();
System.out.println("====StanfordDependencies("+(t2-t1)+"ms)====");
System.out.println(qlog.s.dependencyTreeStanford);
System.out.println("====MaltDependencies("+(t3-t2)+"ms)====");
System.out.println(qlog.s.dependencyTreeMalt);
System.out.println("====NameEntityRecognition("+(t4-t3)+"ms)====");
qlog.s.printNERResult();
qlog.timeTable.put("StanfordParser", (int)(t2-t1));
qlog.timeTable.put("MaltParser", (int)(t3-t2));
qlog.timeTable.put("NER", (int)(t4-t3));
} catch (Exception e) {
e.printStackTrace();
}
}
public void recognizeSentenceType(QueryLogger qlog)
{
boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford)||
recognizeImperativeSentence(qlog.s.dependencyTreeMalt);
if (IsImperativeSentence)
{
qlog.s.sentenceType = SentenceType.ImperativeSentence;
//the ignored words of the two dependency trees should be kept consistent
for (DependencyTreeNode sNode : qlog.s.dependencyTreeStanford.nodesList)
for (DependencyTreeNode mNode : qlog.s.dependencyTreeMalt.nodesList)
if (sNode.equals(mNode) && (sNode.word.isIgnored||mNode.word.isIgnored))
sNode.word.isIgnored = mNode.word.isIgnored = true;
return;
}
boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford)||
recognizeSpecialQuestion(qlog.s.dependencyTreeMalt);
if (IsSpecialQuestion)
{
qlog.s.sentenceType = SentenceType.SpecialQuestion;
return;
}
boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford)||
recognizeGeneralQuestion(qlog.s.dependencyTreeMalt);
if (IsGeneralQuestion)
{
qlog.s.sentenceType = SentenceType.GeneralQuestion;
return;
}
//default is special question
qlog.s.sentenceType = SentenceType.SpecialQuestion;
}
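// Illustrative classification (hypothetical questions):
// "Give me all actors of Titanic." -> ImperativeSentence
// "Which city is the capital of Canada?" -> SpecialQuestion
// "Is Berlin the capital of Germany?" -> GeneralQuestion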
//if imperative, omit the polite words
private boolean recognizeImperativeSentence(DependencyTree tree) {
if(tree.getRoot().word.posTag.startsWith("V") || tree.getRoot().word.posTag.startsWith("NN")) {
DependencyTreeNode dobj = null;
DependencyTreeNode iobj = null;
for (DependencyTreeNode n : tree.getRoot().childrenList) {
if (n.dep_father2child.equals("dobj")) {
dobj = n;
}
else if (n.dep_father2child.equals("iobj")) {
iobj = n;
}
}
if (dobj != null && iobj != null) {
tree.getRoot().word.isIgnored = true;
iobj.word.isIgnored = true;
// give me a list of ..
if (dobj.word.baseForm.equals("list"))
{
dobj.word.isIgnored = true;
}
return true;
}
//start with "List": List all games by GMT.
if (dobj != null && tree.getRoot().word.baseForm.equals("list"))
{
//System.out.println("isListSentence!");
tree.getRoot().word.isIgnored = true;
return true;
}
}
return false;
}
private boolean recognizeSpecialQuestion(DependencyTree tree)
{
DependencyTreeNode firstNode = null;
for (DependencyTreeNode dtn : tree.nodesList)
if (dtn.word.position == 1)
{
firstNode = dtn;
break;
}
//eg. In which city...
if (firstNode!=null &&
(firstNode.word.posTag.equals("IN")||firstNode.word.posTag.equals("TO"))&&
firstNode.dep_father2child.startsWith("prep"))
{
firstNode = null;
for (DependencyTreeNode dtn : tree.nodesList)
if (dtn.word.position == 2)
{
firstNode = dtn;
break;
}
}

if (firstNode != null)
{
if (firstNode.word.posTag.startsWith("W"))
return true;
}
return false;
}
private boolean recognizeGeneralQuestion(DependencyTree tree)
{
DependencyTreeNode firstNode = null;
for (DependencyTreeNode dtn : tree.nodesList)
if (dtn.word.position == 1)
{
firstNode = dtn;
break;
}
if (firstNode != null)
{
String dep = firstNode.dep_father2child;
String pos = firstNode.word.posTag;
String baseform = firstNode.word.baseForm;
if ((baseform.equals("be")||baseform.equals("do")) &&
pos.startsWith("VB") &&
(dep.equals("root")||dep.equals("cop")||dep.startsWith("aux")))
return true;
}
return false;
}
public static String detectQuestionFocus(Sparql spq) {
String ret = null;
int posi = Integer.MAX_VALUE;
for (Triple t : spq.tripleList) {
if (!t.isSubjConstant()) {
Word subj = t.getSubjectWord();
if (subj!=null && subj.position < posi) {
posi = subj.position;
ret = t.subject;
}
}
if (!t.isObjConstant()) {
Word obj = t.getObjectWord();
if (obj!=null && obj.position < posi) {
posi = obj.position;
ret = t.object;
}
}
}
if (ret != null) return ret.replace(' ', '_');
else return null;
}
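// Illustrative focus detection (hypothetical triples): given
// ?city <country> <Canada> and ?city <population> ?population, the variable whose
// word occurs earliest in the question (here ?city) becomes the question focus.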
}

+ 40
- 0
src/rdf/EntityMapping.java View File

@@ -0,0 +1,40 @@
package rdf;

import fgmt.EntityFragment;

public class EntityMapping implements Comparable<EntityMapping> {
public int entityID = -1;
public String entityName = null;
public double score = 0;
public EntityFragment entityFragment = null;
public EntityMapping(int eid, String en, double sco) {
entityID = eid;
entityName = en;
score = sco;
//penalize entities whose name starts with "?"
if (entityName.startsWith("?"))
score *=0.5;
}
// In descending order: big --> small
public int compareTo(EntityMapping o) {
double diff = this.score - o.score;
if (diff > 0) return -1;
else if (diff < 0) return 1;
else return 0;
}
public int hashCode()
{
return new Integer(entityID).hashCode();
}
public String toString()
{
StringBuilder res = new StringBuilder(entityName+"("+score+")");
return res.toString();
}
}

+ 77
- 0
src/rdf/ImplicitRelation.java View File

@@ -0,0 +1,77 @@
package rdf;

import fgmt.TypeFragment;
import qa.Globals;
import lcn.EntityFragmentFields;

public class ImplicitRelation {

public String subj = null;
public String obj = null;
public int pId = -1;
public double score = 0;
//Role: entity | type constant | type variable | variable
public enum roleEnum {ENTITY, TYPE_CONSTANT, TYPE_VARIABLE, VARIABLE};
public int subjRole = -1;
public int objRole = -1;
public int subjId = -1;
public int objId = -1;
public ImplicitRelation(String s, String o, int pid, double sc)
{
pId = pid;
subj = s;
obj = o;
score = sc;
subjId = EntityFragmentFields.entityName2Id.get(s);
if(pId != Globals.pd.typePredicateID)
objId = EntityFragmentFields.entityName2Id.get(o);
else
objId = TypeFragment.typeShortName2IdList.get(o).get(0);
}
public ImplicitRelation(Integer sId, Integer oId, int pid, double sc)
{
pId = pid;
subjId = sId;
objId = oId;
score = sc;
}
public void setSubjectId(Integer s)
{
subjId = s;
}
public void setObjectId(Integer o)
{
objId = o;
}
public void setSubject(String s)
{
subj = s;
}
public void setObject(String o)
{
obj = o;
}
public int hashCode()
{
return new Integer(pId).hashCode() ^ new Integer(subjId).hashCode() ^ new Integer(objId).hashCode();
}
@Override
public boolean equals(Object ir)
{
ImplicitRelation tmpIr = (ImplicitRelation) ir;
if (pId == tmpIr.pId && subjId == tmpIr.subjId && objId == tmpIr.objId)
return true;
else return false;
}
}

+ 41
- 0
src/rdf/MergedWord.java View File

@@ -0,0 +1,41 @@
package rdf;

import java.util.ArrayList;

import rdf.EntityMapping;
import rdf.TypeMapping;

public class MergedWord implements Comparable<MergedWord>
{
//original position
public int st,ed;
//position after merge (unselected is -1)
public int mergedPos = -1;
public String name;
public boolean mayCategory = false;
public boolean mayLiteral = false;
public boolean mayEnt = false;
public boolean mayType = false;
public ArrayList<EntityMapping> emList = null;
public ArrayList<TypeMapping> tmList = null;
public String category = null;
public MergedWord(int s,int e,String n)
{
st = s;
ed = e;
name = n;
}
@Override
//sorts from long to short
public int compareTo(MergedWord o)
{
int lenDiff = (this.ed-this.st) - (o.ed-o.st);
if (lenDiff > 0) return -1;
else if (lenDiff < 0) return 1;
return 0;
}
}

+ 24
- 0
src/rdf/NodeSelectedWithScore.java View File

@@ -0,0 +1,24 @@
package rdf;

import java.util.ArrayList;

public class NodeSelectedWithScore implements Comparable<NodeSelectedWithScore>
{
public ArrayList<Integer> selected;
int size; //split key to st and ed
public double score = 0;
public NodeSelectedWithScore(ArrayList<Integer> a, double b)
{
selected = a;
score = b;
}
// In descending order: big --> small
public int compareTo(NodeSelectedWithScore o) {
double diff = this.score - o.score;
if (diff > 0) return -1;
else if (diff < 0) return 1;
else return 0;
}
}

+ 28
- 0
src/rdf/PredicateMapping.java View File

@@ -0,0 +1,28 @@
package rdf;

public class PredicateMapping implements Comparable<PredicateMapping> {
public int pid = -1;
public double score = 0;
public String parapharase = null;
public PredicateMapping (int pid, double sco, String para) {
this.pid = pid;
score = sco;
parapharase = para;
}
// In descending order: big --> small
public int compareTo(PredicateMapping o) {
double diff = this.score - o.score;
if (diff > 0) return -1;
else if (diff < 0) return 1;
else return 0;
}
@Override
public String toString() {
String ret = "";
ret = "<"+pid+" : "+parapharase+" : "+score+">";
return ret;
}
}

+ 180
- 0
src/rdf/SemanticQueryGraph.java View File

@@ -0,0 +1,180 @@
package rdf;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import qa.Globals;
import nlp.ds.Word;

public class SemanticQueryGraph implements Comparable<SemanticQueryGraph>
{
public ArrayList<SemanticUnit> semanticUnitList = null;
public HashMap<Integer, SemanticRelation> semanticRelations = new HashMap<>();
public double score = 0;
public SemanticQueryGraph(ArrayList<SemanticUnit> suList)
{
semanticUnitList = suList; //TODO: need copy?
// Calculate Score by a reward function (TODO: using SVM-Rank)
}

public SemanticQueryGraph(SemanticQueryGraph head)
{
semanticUnitList = new ArrayList<>();
for(SemanticUnit su: head.semanticUnitList)
semanticUnitList.add(su.copy());
score = head.score;
}
public void connect(SemanticUnit u, SemanticUnit v)
{
if(u.equals(v))
return;
SemanticUnit su1 = null, su2 = null;
for(SemanticUnit su: this.semanticUnitList)
if(su.equals(u))
su1 = su;
else if(su.equals(v))
su2 = su;
if(su1 != null && su2 != null)
if(!su1.neighborUnitList.contains(su2) && !su2.neighborUnitList.contains(su1))
{
su1.neighborUnitList.add(su2);
su2.neighborUnitList.add(su1);
}
}
public void merge(SemanticUnit u, SemanticUnit v)
{
SemanticUnit su1 = null, su2 = null;
for(SemanticUnit su: this.semanticUnitList)
if(su.equals(u))
su1 = su;
else if(su.equals(v))
su2 = su;
if(su1 != null && su2 != null)
{
for(SemanticUnit su: this.semanticUnitList)
if(su != su2 && su.neighborUnitList.contains(su1) && !su.neighborUnitList.contains(su2)) //TODO: Notice, now REJECT multi-edges; The hash function of SR should be modified to allow multi-edges.
su.neighborUnitList.add(su2);
this.semanticUnitList.remove(su1);
su2.neighborUnitList.remove(su1);
}
}
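// Illustrative merge (hypothetical units u, v): every unit adjacent to u gains v
// as a neighbor (multi-edges are rejected, see the TODO above), then u is dropped
// from the unit list and from v's neighbors, shrinking the graph by one node.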

@Override
public int hashCode() {
int code = 0;
for(SemanticUnit su: this.semanticUnitList)
code ^= su.hashCode();
return code;
}
@Override
public boolean equals(Object o)
{
if (o instanceof SemanticQueryGraph)
{
int matchCnt = 0;
for(SemanticUnit su1: ((SemanticQueryGraph) o).semanticUnitList)
for(SemanticUnit su2: this.semanticUnitList)
{
if(su1.equals(su2))
{
if(su1.neighborUnitList.containsAll(su2.neighborUnitList) && su2.neighborUnitList.containsAll(su1.neighborUnitList))
matchCnt++;
}
}
if(matchCnt == ((SemanticQueryGraph) o).semanticUnitList.size() && matchCnt == this.semanticUnitList.size())
return true;
}
return false;
}
@Override
public int compareTo(SemanticQueryGraph o)
{
double diff = this.score - o.score;
if (diff > 0) return -1;
else if (diff < 0) return 1;
else return 0;
}
public boolean isFinalState()
{
if(semanticUnitList == null || semanticUnitList.isEmpty())
return false;
// Basic assumption: a final Semantic Query Graph should be Connected.
HashSet<SemanticUnit> visited = new HashSet<>();
SemanticUnit start = semanticUnitList.get(0);
visited.add(start);
dfs(start, visited);
if(visited.size() == semanticUnitList.size())
return true;
return false;
}
private void dfs(SemanticUnit headNode, HashSet<SemanticUnit> visited)
{
for(SemanticUnit curNode: headNode.neighborUnitList)
if(!visited.contains(curNode))
{
visited.add(curNode);
dfs(curNode, visited);
}
for(SemanticUnit curNode: semanticUnitList)
{
if(curNode.neighborUnitList.contains(headNode) || headNode.neighborUnitList.contains(curNode))
{
if(!visited.contains(curNode))
{
visited.add(curNode);
dfs(curNode, visited);
}
}
}
}

public void calculateScore(HashMap<Integer, SemanticRelation> potentialSemanticRelations)
{
// 1. entity/type score
double entSco = 0;
for(SemanticUnit su: this.semanticUnitList)
{
Word w = su.centerWord;
if(w.mayEnt && w.emList.size()>0)
entSco += w.emList.get(0).score * 100;
if(w.mayType && w.tmList.size()>0)
entSco += w.tmList.get(0).score;
}
// 2. relation score
double relSco = 0;
int relCnt = 0;
for(SemanticUnit su1: this.semanticUnitList)
for(SemanticUnit su2: su1.neighborUnitList)
{
//Deduplicate
if(su1.centerWord.position > su2.centerWord.position)
continue;
relCnt++;
int key = su1.centerWord.getNnHead().hashCode() ^ su2.centerWord.getNnHead().hashCode();
SemanticRelation sr = potentialSemanticRelations.get(key);
if(sr == null)
System.err.println("No semantic relation for: " + su1 + " & " + su2);
else
{
relSco += sr.predicateMappings.get(0).score;
semanticRelations.put(key, sr);
}
}
relSco/=relCnt; //average
this.score = entSco + relSco;
}
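// Worked score (hypothetical mappings, both units recognized as entities): best
// entity mappings scoring 0.8 and 0.6 give entSco = 0.8*100 + 0.6*100 = 140; two
// relations scoring 0.9 and 0.5 average to relSco = 0.7, so the graph scores 140.7.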
}

+ 171
- 0
src/rdf/SemanticRelation.java View File

@@ -0,0 +1,171 @@
package rdf;

import java.util.ArrayList;

import rdf.SimpleRelation;

import nlp.ds.Word;

public class SemanticRelation {
public Word arg1Word = null;
public Word arg2Word = null;
public String relationParaphrase = null; // longest match
public double LongestMatchingScore = 0; // longest match score
//used to distinguish arguments when a semantic relation is copied from a special pattern
public int arg1SuffixId = 0;
public int arg2SuffixId = 0;
public Word arg1Word_beforeCRR = null;
public Word arg2Word_beforeCRR = null;
public ArrayList<PredicateMapping> predicateMappings = null;

public boolean isArg1Constant = false;
public boolean isArg2Constant = false;
public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules
public SemanticRelation dependOnSemanticRelation = null;
public Word preferredSubj = null;
public boolean isSteadyEdge = true;
public SemanticRelation(SemanticRelation r2) {
arg1Word = r2.arg1Word;
arg2Word = r2.arg2Word;
relationParaphrase = r2.relationParaphrase;
LongestMatchingScore = r2.LongestMatchingScore;
arg1SuffixId = r2.arg1SuffixId;
arg2SuffixId = r2.arg2SuffixId;
arg1Word_beforeCRR = r2.arg1Word_beforeCRR;
arg2Word_beforeCRR = r2.arg2Word_beforeCRR;
arg1Word.emList = r2.arg1Word.emList;
arg2Word.emList = r2.arg2Word.emList;
predicateMappings = r2.predicateMappings;
// arg1Types = r2.arg1Types;
// arg2Types = r2.arg2Types;
isArg1Constant = r2.isArg1Constant;
isArg2Constant = r2.isArg2Constant;
extractingMethod = r2.extractingMethod;
dependOnSemanticRelation = r2.dependOnSemanticRelation;
preferredSubj = r2.preferredSubj;
}
public void swapArg1Arg2()
{
Word tmpWord = arg1Word;
arg1Word = arg2Word;
arg2Word = tmpWord;
int tmpSuffixId = arg1SuffixId;
arg1SuffixId = arg2SuffixId;
arg2SuffixId = tmpSuffixId;
tmpWord = arg1Word_beforeCRR;
arg1Word_beforeCRR = arg2Word_beforeCRR;
arg2Word_beforeCRR = tmpWord;
boolean tmpBool = isArg1Constant;
isArg1Constant = isArg2Constant;
isArg2Constant = tmpBool;
}
public SemanticRelation (SimpleRelation simr) {
if (simr.preferredSubj == null) {
if (simr.arg1Word.compareTo(simr.arg2Word) < 0) {
this.arg1Word = simr.arg1Word;
this.arg2Word = simr.arg2Word;
this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR;
this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR;
}
else {
this.arg1Word = simr.arg2Word;
this.arg2Word = simr.arg1Word;
this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR;
this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR;
}
this.extractingMethod = simr.extractingMethod;
}
else {
if (simr.arg1Word == simr.preferredSubj) {
this.arg1Word = simr.arg1Word;
this.arg2Word = simr.arg2Word;
this.arg1Word_beforeCRR = simr.arg1Word_beforeCRR;
this.arg2Word_beforeCRR = simr.arg2Word_beforeCRR;
this.preferredSubj = simr.preferredSubj;
}
else {
this.arg1Word = simr.arg2Word;
this.arg2Word = simr.arg1Word;
this.arg1Word_beforeCRR = simr.arg2Word_beforeCRR;
this.arg2Word_beforeCRR = simr.arg1Word_beforeCRR;
this.preferredSubj = simr.preferredSubj;
}
this.extractingMethod = simr.extractingMethod;
}
}
@Override
public int hashCode() {
return arg1Word.hashCode() ^ (arg2Word.hashCode() + arg1SuffixId + arg2SuffixId); // parentheses make the precedence explicit ('+' binds tighter than '^')
}
@Override
public boolean equals(Object o) {
if (o instanceof SemanticRelation) {
SemanticRelation sr2 = (SemanticRelation) o;
if (this.arg1Word.equals(sr2.arg1Word)
&& this.arg2Word.equals(sr2.arg2Word)
&& this.arg1SuffixId == sr2.arg1SuffixId
&& this.arg2SuffixId == sr2.arg2SuffixId
&& this.relationParaphrase.equals(sr2.relationParaphrase)
&& this.LongestMatchingScore == sr2.LongestMatchingScore) {
return true;
}
}
return false;
}
@Override
public String toString() {
return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]";
// return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + LongestMatchingScore + "["+extractingMethod+"]";
}
public void normalizeScore()
{
double maxScore;
if (arg1Word.emList!=null && !arg1Word.emList.isEmpty())
{
maxScore=0.0;
for (EntityMapping em : arg1Word.emList)
maxScore = Math.max(maxScore, em.score);
for (EntityMapping em : arg1Word.emList)
em.score = em.score/maxScore;
}

if (arg2Word.emList!=null && !arg2Word.emList.isEmpty())
{
maxScore=0.0;
for (EntityMapping em : arg2Word.emList)
maxScore = Math.max(maxScore, em.score);
for (EntityMapping em : arg2Word.emList)
em.score = em.score/maxScore;
}
if (predicateMappings!=null && !predicateMappings.isEmpty())
{
maxScore=0.0;
for (PredicateMapping pm : predicateMappings)
maxScore = Math.max(maxScore, pm.score);
for (PredicateMapping pm : predicateMappings)
pm.score = pm.score/maxScore;
}
}
}

+ 61
- 0
src/rdf/SemanticUnit.java View File

@@ -0,0 +1,61 @@
package rdf;

import java.util.ArrayList;
import java.util.HashMap;

import rdf.SemanticRelation;
import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;

public class SemanticUnit
{
public Word centerWord = null;
public ArrayList<DependencyTreeNode> describeNodeList = new ArrayList<DependencyTreeNode>();
public ArrayList<SemanticUnit> neighborUnitList = new ArrayList<SemanticUnit>();
public HashMap<Word, SemanticRelation> RelationList = new HashMap<Word, SemanticRelation>();
public boolean isSubj = true;
public Integer prefferdType = null;
public SemanticUnit(Word center, boolean isSubJ)
{
centerWord = center;
isSubj = isSubJ;
}
public SemanticUnit copy()
{
SemanticUnit su = new SemanticUnit(this.centerWord, this.isSubj);
su.describeNodeList = (ArrayList<DependencyTreeNode>) this.describeNodeList.clone();
su.neighborUnitList = (ArrayList<SemanticUnit>) this.neighborUnitList.clone();
su.RelationList = (HashMap<Word, SemanticRelation>) this.RelationList.clone();
return su;
}
@Override
public int hashCode() {
return centerWord.hashCode();
}
@Override
public boolean equals(Object o) {
if (o instanceof SemanticUnit) {
SemanticUnit su2 = (SemanticUnit) o;
if(this.centerWord.equals(su2.centerWord))
return true;
}
return false;
}
@Override
public String toString()
{
String ret = "<" + centerWord + ", {";
for(SemanticUnit su: neighborUnitList)
ret += su.centerWord + ", ";
ret += "}>";
return ret;
}
}

+ 88
- 0
src/rdf/SimpleRelation.java View File

@@ -0,0 +1,88 @@
package rdf;

import java.util.ArrayList;
import java.util.HashMap;

import paradict.PredicateIDAndSupport;
import qa.Globals;

import nlp.ds.DependencyTree;
import nlp.ds.DependencyTreeNode;
import nlp.ds.Word;

// allow repetition
public class SimpleRelation {
public Word arg1Word = null;
public Word arg2Word = null;
public String relationParaphrase = null;
public double matchingScore = 0;
public Word arg1Word_beforeCRR = null;
public Word arg2Word_beforeCRR = null;
public HashMap<Integer, Double> pasList = new HashMap<Integer, Double>();
public Word preferredSubj = null;
public char extractingMethod = ' '; // S: StanfordParser; M: MaltParser; N: N-gram; R: rules
public SimpleRelation()
{
}
public SimpleRelation(SimpleRelation sr)
{
arg1Word = sr.arg1Word;
arg2Word = sr.arg2Word;
relationParaphrase = sr.relationParaphrase;
matchingScore = sr.matchingScore;
arg1Word_beforeCRR = sr.arg1Word_beforeCRR;
arg2Word_beforeCRR = sr.arg2Word_beforeCRR;
pasList = sr.pasList;
preferredSubj = sr.preferredSubj;
extractingMethod = 'R';
}

@Override
public String toString() {
return arg1Word.originalForm + "," + arg2Word.originalForm + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]";
//return arg1Word.getFullEntityName() + "," + arg2Word.getFullEntityName() + "," + relationParaphrase + "," + matchingScore + "["+extractingMethod+"]";
}
public int getHashCode() {
return arg1Word.hashCode() ^ arg2Word.hashCode();
}
public void setPasList (String pattern, double matchingScore, boolean[] matchedFlag) {
ArrayList<PredicateIDAndSupport> list = Globals.pd.nlPattern_2_predicateList.get(pattern);
for (PredicateIDAndSupport pidsup : list) {
double sumSelectivity = 0;
for (int i = 0; i < matchedFlag.length; i ++) {
if (matchedFlag[i]) {
sumSelectivity += pidsup.wordSelectivity[i];
}
}
sumSelectivity = matchingScore*sumSelectivity*pidsup.support;
int pid = pidsup.predicateID;
if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; // boost predicates in the dbo namespace (pid may not be in dbo)
if (!pasList.containsKey(pid))
pasList.put(pid, sumSelectivity);
else if (sumSelectivity > pasList.get(pid))
pasList.put(pid, sumSelectivity);
}
}
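// Worked example (hypothetical pattern data): a pattern matched with
// matchingScore 0.5 whose matched words carry selectivity 0.4 + 0.2, for a
// predicate with support 10, yields sumSelectivity = 0.5 * 0.6 * 10 = 3.0,
// boosted to 4.5 when the predicate belongs to the dbo namespace.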
public void setPreferedSubjObjOrder(DependencyTree tree) {
DependencyTreeNode n1 = tree.getNodeByIndex(this.arg1Word.position).getNNTopTreeNode(tree);
DependencyTreeNode n2 = tree.getNodeByIndex(this.arg2Word.position).getNNTopTreeNode(tree);
if (n1.father != null && n1.father.word.baseForm.equals("of") && n1.dep_father2child.equals("pobj")) {
this.preferredSubj = this.arg1Word;
}
else if (n2.father != null && n2.father.word.baseForm.equals("of") && n2.dep_father2child.equals("pobj")) {
this.preferredSubj = this.arg2Word;
}
}

}

+ 305
- 0
src/rdf/Sparql.java View File

@@ -0,0 +1,305 @@
package rdf;

import java.util.ArrayList;
import java.util.Collections;
//import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;

import log.QueryLogger;
import nlp.ds.Sentence;
import nlp.ds.Sentence.SentenceType;
import qa.Globals;

public class Sparql implements Comparable<Sparql>
{
public ArrayList<Triple> tripleList = new ArrayList<Triple>();
public boolean countTarget = false;
public String mostStr = null;
public String moreThanStr = null;
public double score = 0;
public String questionFocus = null; // The answer variable
public HashSet<String> variables = new HashSet<String>();
public enum QueryType {Select,Ask}
public QueryType queryType = QueryType.Select;
public HashMap<Integer, SemanticRelation> semanticRelations = null;

public void addTriple(Triple t)
{
if(!tripleList.contains(t))
{
tripleList.add(t);
score += t.score;
}
}
public void delTriple(Triple t)
{
if(tripleList.contains(t))
{
tripleList.remove(t);
score -= t.score;
}
}

@Override
public String toString()
{
String ret = "";
for (Triple t : tripleList) {
ret += t.toString();
ret += '\n';
}
return ret;
}
public void deduplicate()
{
HashSet<String> set = new HashSet<String>();
ArrayList<Triple> list = new ArrayList<Triple>();
for(Triple t: tripleList)
{
String st = t.toStringWithoutScore();
if(set.contains(st))
list.add(t);
set.add(st);
}
for(Triple t: list)
this.delTriple(t);
}
// Is it a Basic Graph Pattern without filter and aggregation?
public boolean isBGP()
{
if(moreThanStr != null || mostStr != null || countTarget)
return false;
return true;
}
//Used for display (cannot be executed)
public String toStringForGStore()
{
String ret = "";
for (Triple t : tripleList)
{
// !Omit obvious LITERAL
if(t.object.equals("literal_HRZ"))
continue;
// !Omit some bad TYPEs
if(t.predicateID==Globals.pd.typePredicateID && Globals.pd.bannedTypes.contains(t.object))
continue;
ret += t.toStringForGStore();
ret += '\n';
}
return ret;
}
/**
* @description:
* 1. Select all variables for BGP queries to display specific information.
* 2. DO NOT select all variables when aggregations like "HAVING", "COUNT" ... are present
* (it may involve too many results, e.g., "which countries have more than 1000 caves?", where the caves need not be displayed)
* @param: NULL.
* @return: A SPARQL query can be executed by GStore (NO prefix of entities/predicates).
*/
public String toStringForGStore2()
{
String ret = "";
variables.clear();
for(Triple t: tripleList)
{
if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_"));
if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_"));
}
if(variables.size() == 0)
queryType = QueryType.Ask;
// part1: select / ask ...
if (queryType==QueryType.Ask)
ret += "ask";
else if(countTarget)
ret += ("select COUNT(DISTINCT " + questionFocus + ")");
else
{
if(!isBGP()) // AGG: select question focus
ret += ("select DISTINCT " + questionFocus);
else // BGP: select all variables
{
ret += "select DISTINCT ";
for (String v : variables)
ret += v + " ";
}
}
// part2: triples
ret += " where\n{\n";
for(Triple t : tripleList)
{
if (!t.object.equals("literal_HRZ")) { // need not display literal
ret += t.toStringForGStore();
ret += " .\n";
}
}
ret += "}\n";
// part3: order by / group by ...
if(moreThanStr != null)
ret += moreThanStr+"\n";
if(mostStr != null)
ret += mostStr+"\n";
// part4: limit
if(queryType != QueryType.Ask && (mostStr == null || !mostStr.contains("LIMIT")))
ret += "LIMIT " + Globals.MaxAnswerNum;
return ret;
}
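// Example output for a single-triple BGP (illustrative; the entity and predicate
// are hypothetical, and the limit comes from Globals.MaxAnswerNum):
// select DISTINCT ?city where
// {
// <Chile_Route_68> <country> ?city .
// }
// LIMIT 100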
//Used for execution (selects all variables; formats 'aggregation' and 'ask')
public String toStringForVirtuoso()
{
String ret = "";
HashSet<String> variables = new HashSet<String>();
// prefix
if (queryType==QueryType.Ask)
ret += "ask where";
else if(countTarget)
ret += ("select COUNT(DISTINCT " + questionFocus + ") where");
else
{
// AGG: select question focus
if(moreThanStr != null || mostStr != null)
ret += ("select DISTINCT " + questionFocus + " where");
// BGP: select all variables
else
{
for (Triple t: tripleList)
{
if (!t.isSubjConstant()) variables.add(t.subject.replaceAll(" ", "_"));
if (!t.isObjConstant()) variables.add(t.object.replaceAll(" ", "_"));
}
ret += "select ";
for (String v : variables)
ret += v + " ";
ret += "where";
}
}
ret += "\n{\n";
if(variables.size() == 0)
variables.add(questionFocus);
// triples
for (Triple t : tripleList)
{
if (!t.object.equals("literal_HRZ")) {
ret += t.toStringForGStore();
ret += " .\n";
}
}
ret += "}\n";
// suffix
if(moreThanStr != null)
{
ret += moreThanStr+"\n";
}
if(mostStr != null)
{
ret += mostStr+"\n";
}
return ret;
}
public int getVariableNumber()
{
int res = 0;
for (Triple t: tripleList)
{
if (!t.isSubjConstant()) res++;
if (!t.isObjConstant()) res++;
}
return res;
}

public void adjustTriplesOrder()
{
Collections.sort(this.tripleList);
}

public int compareTo(Sparql o)
{
double diff = this.score - o.score;
if (diff > 0)
return -1;
else if (diff < 0)
return 1;
else
return 0;
}
@Override
public int hashCode()
{
int key = 0;
for(Triple t: this.tripleList)
key ^= t.hashCode();
return key;
}
@Override
public boolean equals(Object spq)
{
Sparql tempSparql = (Sparql) spq;
return this.toStringForGStore2().equals(tempSparql.toStringForGStore2()); // compare the generated queries directly; the dead locals s1/s2 are gone
}
public Sparql(){}
public Sparql(HashMap<Integer, SemanticRelation> semanticRelations)
{
this.semanticRelations = semanticRelations;
}
public Sparql copy()
{
Sparql spq = new Sparql(this.semanticRelations);
for (Triple t : this.tripleList)
spq.addTriple(t);
return spq;
}
public void removeLastTriple()
{
int idx = tripleList.size()-1;
score -= tripleList.get(idx).score;
tripleList.remove(idx);
}
public Sparql removeAllTypeInfo ()
{
score = 0;
ArrayList<Triple> newTripleList = new ArrayList<Triple>();
for (Triple t : tripleList)
{
if (t.predicateID != Globals.pd.typePredicateID)
{
newTripleList.add(t);
score += t.score;
}
}
tripleList = newTripleList;
return this;
}

};

+ 257
- 0
src/rdf/Triple.java View File

@@ -0,0 +1,257 @@
package rdf;

import nlp.ds.Word;
import qa.Globals;

public class Triple implements Comparable<Triple>{
public String subject = null; // subject/object after disambiguation.
public String object = null;
static public int TYPE_ROLE_ID = -5;
static public int VAR_ROLE_ID = -2;
static public int CAT_ROLE_ID = -8; // Category
static public String VAR_NAME = "?xxx";
// subjId/objId: entity id | TYPE_ROLE_ID | VAR_ROLE_ID
public int subjId = -1;
public int objId = -1;
public int predicateID = -1;
public Word subjWord = null; // only be used when semRltn == null
public Word objWord = null;
public SemanticRelation semRltn = null;
public double score = 0;
public boolean isSubjObjOrderSameWithSemRltn = true;
public boolean isSubjObjOrderPrefered = false;
public Word typeSubjectWord = null; // for "type" triples only
public Triple (Triple t) {
subject = t.subject;
object = t.object;
subjId = t.subjId;
objId = t.objId;
predicateID = t.predicateID;
semRltn = t.semRltn;
score = t.score;
isSubjObjOrderSameWithSemRltn = t.isSubjObjOrderSameWithSemRltn;
isSubjObjOrderPrefered = t.isSubjObjOrderPrefered;
}
// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); from one word (type variable | embedded info)
public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco) {
subjId = sId;
objId = oId;
subject = s;
predicateID = p;
object = o;
semRltn = sr;
score = sco;
}

// A triple translated from a semantic relation (subject/object order can be changed later)
public Triple (int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, boolean isSwap) {
subjId = sId;
objId = oId;
subject = s;
predicateID = p;
object = o;
semRltn = sr;
score = sco;
isSubjObjOrderSameWithSemRltn = isSwap;
}
// A final triple (subject/object order will not be changed); does not rely on a semantic relation (sr == null); from two words (implicit relations of modifiers)
public Triple(int sId, String s, int p, int oId, String o, SemanticRelation sr, double sco, Word subj, Word obj) {
subjId = sId;
objId = oId;
subject = s;
predicateID = p;
object = o;
semRltn = sr;
score = sco;
subjWord = subj;
objWord = obj;
}

public Triple copy() {
Triple t = new Triple(this);
return t;
}
public Triple copySwap() {
Triple t = new Triple(this);
String temp;
int tmpId;

tmpId = t.subjId;
t.subjId = t.objId;
t.objId = tmpId;
temp = t.subject;
t.subject = t.object;
t.object = temp;
t.isSubjObjOrderSameWithSemRltn = !this.isSubjObjOrderSameWithSemRltn;
t.isSubjObjOrderPrefered = !this.isSubjObjOrderPrefered;
return t;
}
public void addScore(double s) {
score += s;
}
public double getScore() {
return score;
}
@Override
public int hashCode()
{
return new Integer(subjId).hashCode() ^ new Integer(objId).hashCode() ^ new Integer(predicateID).hashCode();
}
@Override
public String toString() {
return subjId+":<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> "+objId+":<" + object + ">" + " : " + score;
}

public String toStringForGStore() {
StringBuilder sb = new StringBuilder("");
String _subject = subject;
if(_subject.startsWith("?"))
sb.append(_subject+"\t");
else
sb.append("<" + _subject + ">\t");
sb.append("<" + Globals.pd.getPredicateById(predicateID) + ">\t");
String _object;
if(predicateID == Globals.pd.typePredicateID && object.contains("|"))
_object = object.substring(0, object.indexOf('|'));
else
_object = object;
if(_object.startsWith("?"))
sb.append(_object);
else
sb.append("<" + _object + ">");
return sb.toString().replace(' ', '_');
}
public String toStringWithoutScore() {
return "<" + subject + "> <" + Globals.pd.getPredicateById(predicateID) + "> <" + object + ">";
}
public Word getSubjectWord () {
if (predicateID == Globals.pd.typePredicateID) {
return typeSubjectWord;
}
else if(semRltn == null)
{
return subjWord;
}
else {
if (isSubjObjOrderSameWithSemRltn) return semRltn.arg1Word;
else return semRltn.arg2Word;
}
}
public Word getObjectWord () {
if (predicateID == Globals.pd.typePredicateID) {
return typeSubjectWord;
}
else if(semRltn == null)
{
return objWord;
}
else {
if (isSubjObjOrderSameWithSemRltn) return semRltn.arg2Word;
else return semRltn.arg1Word;
}
}
public boolean isSubjConstant () {
if (predicateID == Globals.pd.typePredicateID) {
return !subject.startsWith("?");
}
else {
// Triple from semantic (obvious) relation
if(semRltn != null)
{
if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg1Constant;
else return semRltn.isArg2Constant;
}
// Triple from implicit relation (no semantic relation), it is final triple
else
{
if(subjId != Triple.VAR_ROLE_ID && subjId != Triple.TYPE_ROLE_ID)
return true;
else
return false;
}
}
}
public boolean isObjConstant () {
if (predicateID == Globals.pd.typePredicateID) {
return !object.startsWith("?");
}
else {
if(semRltn != null)
{
if (isSubjObjOrderSameWithSemRltn) return semRltn.isArg2Constant;
else return semRltn.isArg1Constant;
}
else
{
if(objId != Triple.VAR_ROLE_ID && objId != Triple.TYPE_ROLE_ID)
return true;
else
return false;
}
}
}
public int compareTo(Triple o)
{
//Order: Type, Ent&Ent, Ent&Var, Var&Var
if(this.predicateID == Globals.pd.typePredicateID)
{
if(o.predicateID == Globals.pd.typePredicateID)
return 0;
else
return -1;
}
if(o.predicateID == Globals.pd.typePredicateID) // keep the comparator symmetric: type triples also sort first when only o is a type triple
return 1;
int cnt1 = 0, cnt2 = 0;
if(!this.subject.startsWith("?"))
cnt1++;
if(!this.object.startsWith("?"))
cnt1++;
if(!o.subject.startsWith("?"))
cnt2++;
if(!o.object.startsWith("?"))
cnt2++;
if(cnt1 == cnt2)
return 0;
else if(cnt1 > cnt2)
return -1;
else
return 1;
}
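// Ordering example (hypothetical triples): after adjustTriplesOrder(), a type
// triple such as ?book <type1> <Book> sorts first, then <Tom> <author> ?book
// (one constant), then ?book <publisher> ?press (no constants).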
public void swapSubjObjOrder() {
String temp = subject;
int tmpId = subjId;
subject = object;
subjId = objId;
object = temp;
objId = tmpId;
isSubjObjOrderSameWithSemRltn = !isSubjObjOrderSameWithSemRltn;
}
};

+ 53
- 0
src/rdf/TypeMapping.java View File

@@ -0,0 +1,53 @@
package rdf;

import qa.Globals;

public class TypeMapping implements Comparable<TypeMapping>
{
public Integer typeID = null;
public String typeName = null;
public double score = 0;
/*
 * 1. For a standard type (a DBO type in DBpedia), relation = typePredicateID (rdf:type).
 * 2. For a nonstandard type, typeID = -1.
 * 3. If the type must be added to the triples, a relation is required.
 *    E.g., "Which professional surfers were born in Australia?" yields
 *    (?uri dbo:occupation res:Surfing), so relation = dbo:occupation.
 * 4. If no type triple is needed, relation = -1.
 *    E.g., "Who was the father of [Queen] Elizabeth II?"
 * */
public int prefferdRelation = Globals.pd.typePredicateID; // the relation to use when a type triple is added
public TypeMapping(Integer tid, String type, double sco)
{
typeID = tid;
typeName = type;
score = sco;
}
public TypeMapping(Integer tid, String type, Integer relation, double sco)
{
typeID = tid;
typeName = type.replace("_", "");
score = sco;
prefferdRelation = relation;
}
// Descending order by score: larger scores sort first
public int compareTo(TypeMapping o)
{
return Double.compare(o.score, this.score);
}
public int hashCode()
{
return typeID.hashCode();
}
public String toString()
{
return typeName + "(" + score + ")";
}
}
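A hypothetical demo of the descending comparator: after sorting, the highest-scored mapping comes first. It assumes Globals.pd has already been initialized by the project's startup code, since TypeMapping's field initializer dereferences it; the class and values below are illustrative only.

package rdf;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Hypothetical demo, not part of the commit.
public class TypeMappingDemo {
public static void main(String[] args) {
List<TypeMapping> candidates = new ArrayList<TypeMapping>();
candidates.add(new TypeMapping(1, "Surfer", 0.4));
candidates.add(new TypeMapping(2, "Person", 0.9));
Collections.sort(candidates);
System.out.println(candidates.get(0)); // prints Person(0.9)
}
}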

+ 91
- 0
src/utils/FileUtil.java View File

@@ -0,0 +1,91 @@
package utils;

import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;


public class FileUtil {
public static List<String> readFile(String filePath){
List<String> lines = new ArrayList<String>();
// try-with-resources guarantees the reader is closed even when reading fails;
// returning from a finally block would silently swallow exceptions
try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
String line = null;
while( (line = br.readLine()) != null ){
lines.add(line);
}
}catch(Exception e){
e.printStackTrace();
}
return lines;
}

public static Set<String> readFileAsSet(String filePath){
Set<String> lines = new HashSet<String>();
try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
String line = null;
while( (line = br.readLine()) != null ){
lines.add(line);
}
}catch(Exception e){
e.printStackTrace();
}
return lines;
}

public static List<String> readFile(InputStream is){
List<String> lines = new ArrayList<String>();
try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
String line = null;
while( (line = br.readLine()) != null ){
lines.add(line);
}
}catch(Exception e){
e.printStackTrace();
}
return lines;
}

public static String readFileAsALine(InputStream is){
List<String> lines = readFile(is);
StringBuilder buffer = new StringBuilder();
for(String line : lines){
buffer.append(line);
}
return buffer.toString();
}

public static void writeFile(List<String> lines, String filePath){
try (BufferedWriter bw = new BufferedWriter(new FileWriter(filePath))) {
for(String line : lines){
bw.write(line+"\n");
}
}catch(Exception e){
e.printStackTrace();
}
}

// ifContinueWrite == true appends to the existing file instead of overwriting it
public static void writeFile(List<String> lines, String filePath, boolean ifContinueWrite){
try (BufferedWriter bw = new BufferedWriter(new FileWriter(filePath, ifContinueWrite))) {
for(String line : lines){
bw.write(line+"\n");
}
}catch(Exception e){
e.printStackTrace();
}
}
}
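A small round-trip sketch of the two halves of FileUtil; the file name demo.txt is hypothetical and a writable working directory is assumed.

package utils;

import java.util.Arrays;
import java.util.List;

// Hypothetical demo, not part of the commit.
public class FileUtilDemo {
public static void main(String[] args) {
FileUtil.writeFile(Arrays.asList("first line", "second line"), "demo.txt");
List<String> back = FileUtil.readFile("demo.txt");
System.out.println(back); // [first line, second line]
}
}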

+ 114
- 0
src/utils/HttpRequest.java View File

@@ -0,0 +1,114 @@
package utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;

public class HttpRequest
{
public static String sendGet(String url, String param) {
StringBuilder result = new StringBuilder();
BufferedReader in = null;
try {
String urlNameString = url + "?" + param;
URL realUrl = new URL(urlNameString);
URLConnection connection = realUrl.openConnection();
connection.setRequestProperty("accept", "*/*");
connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("user-agent",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
connection.connect();
// Print the response headers (debug output)
Map<String, List<String>> map = connection.getHeaderFields();
for (String key : map.keySet()) {
System.out.println(key + "--->" + map.get(key));
}
in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
result.append(line);
}
} catch (Exception e) {
System.out.println("Error when sending GET request: " + e);
e.printStackTrace();
}
finally {
try {
if (in != null) {
in.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
}
return result.toString();
}

public static String sendPost(String url, String param) {
PrintWriter out = null;
BufferedReader in = null;
StringBuilder result = new StringBuilder();
try {
URL realUrl = new URL(url);
URLConnection conn = realUrl.openConnection();
conn.setRequestProperty("accept", "*/*");
conn.setRequestProperty("connection", "Keep-Alive");
conn.setRequestProperty("user-agent",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
conn.setDoOutput(true);
conn.setDoInput(true);
out = new PrintWriter(conn.getOutputStream());
out.print(param);
out.flush();
in = new BufferedReader(
new InputStreamReader(conn.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
result.append(line);
}
} catch (Exception e) {
System.out.println("Error when sending POST request: " + e);
e.printStackTrace();
}
finally {
try {
if (out != null) {
out.close();
}
if (in != null) {
in.close();
}
}
catch (IOException ex) {
ex.printStackTrace();
}
}
return result.toString();
}


public static String getPostData(InputStream in, int size, String charset) {
if (in != null && size > 0) {
byte[] buf = new byte[size];
try {
// InputStream.read() may return fewer bytes than requested,
// so keep reading until the buffer is full or the stream ends
int off = 0;
while (off < size) {
int n = in.read(buf, off, size - off);
if (n < 0) break;
off += n;
}
if (charset == null || charset.length() == 0)
return new String(buf, 0, off);
else {
return new String(buf, 0, off, charset);
}
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
}
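A usage sketch with a placeholder endpoint; the URL and query string below are illustrative, not part of the project.

package utils;

// Hypothetical demo, not part of the commit.
public class HttpRequestDemo {
public static void main(String[] args) {
String body = HttpRequest.sendGet("http://example.org/api", "q=test");
System.out.println(body);
}
}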
