@@ -0,0 +1,7 @@ | |||
.project
.classpath
target/
Result/
Model/
.settings/
@@ -0,0 +1,15 @@ | |||
基于自然语言处理的情感分析工具
本程序依赖 data 目录下面的 data.zip,先解压缩 data 目录下面的 data.zip 到当前目录。
1、基于词典和贝叶斯模型的情感分析
主程序:eshore.cn.it.sentiment.Sentiment 此类通过
data/Sentiment_Dictionary 中的正负面词语建立模型。
测试:eshore.cn.it.sentiment.SentimentTest
通过这个类就可以测试 data/500trainblogxml 中的某个文件夹下面的博客的情感。
2、直接利用 lingpipe 的情感分析模块测试情感分析
直接运行程序:eshore.cn.it.sentiment.ChinesePolarityBasic
程序就会通过 data/polarity_corpus/hotel_reviews/train2 训练,
然后自动测试 data/polarity_corpus/hotel_reviews/test2,
最后给出程序测试结果。
@@ -0,0 +1,5 @@ | |||
500trainblogxml/
nerws_corpus/
output/
polarity_corpus/
Sentiment_Dictionary/
@@ -0,0 +1,58 @@ | |||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>eshore.cn.it</groupId>
  <artifactId>nlp-sentiment</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>nlp-sentiment</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <commons.io.version>2.4</commons.io.version>
    <dom4j.version>1.6.1</dom4j.version>
    <lingpipe.version>4.1.0</lingpipe.version>
    <jieba.version>1.0.0</jieba.version>
    <!-- Jars that cannot be downloaded from Maven Central are kept here.
         Use a path relative to the project root (instead of the original
         machine-specific "F:/..." absolute path) so the build works on any
         checkout location. -->
    <maven.libs.home>${project.basedir}/libs</maven.libs.home>
  </properties>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>${commons.io.version}</version>
    </dependency>
    <dependency>
      <groupId>dom4j</groupId>
      <artifactId>dom4j</artifactId>
      <version>${dom4j.version}</version>
    </dependency>
    <!-- The lingpipe core jar must be downloaded manually from the lingpipe
         site and placed under ${maven.libs.home}. -->
    <dependency>
      <groupId>com.aliasi</groupId>
      <artifactId>lingpipe</artifactId>
      <version>${lingpipe.version}</version>
      <scope>system</scope>
      <systemPath>${maven.libs.home}/lingpipe-4.1.0.jar</systemPath>
    </dependency>
    <!-- jieba Chinese word segmenter -->
    <dependency>
      <groupId>com.huaban</groupId>
      <artifactId>jieba-analysis</artifactId>
      <version>${jieba.version}</version>
    </dependency>
  </dependencies>
</project>
@@ -0,0 +1,146 @@ | |||
package eshore.cn.it.sentiment; | |||
import java.io.File; | |||
import java.io.FileReader; | |||
import java.io.IOException; | |||
import java.util.List; | |||
import org.apache.commons.io.IOUtils; | |||
import com.aliasi.classify.Classification; | |||
import com.aliasi.classify.Classified; | |||
import com.aliasi.classify.DynamicLMClassifier; | |||
import com.aliasi.lm.NGramProcessLM; | |||
import com.aliasi.util.Files; | |||
import com.huaban.analysis.jieba.JiebaSegmenter; | |||
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; | |||
import com.huaban.analysis.jieba.SegToken; | |||
/** | |||
* ChinesePolarityBasic 此类是利用lingpipe作中文情感预测的示例类 | |||
* lingpipe适合做增量分析 | |||
* @clebeg 2015-03-13 | |||
* @version 0.0.1 | |||
* */ | |||
public class ChinesePolarityBasic { | |||
private String[] mCategories = new String[]{"+1", "-1"}; | |||
//这就是分类模型 | |||
private DynamicLMClassifier<NGramProcessLM> mClassifier; | |||
private int numTests = 0; | |||
private int numCorrect = 0; | |||
private static final String TRAINFILES_INFO = | |||
"data/polarity_corpus/hotel_reviews/train2.rlabelclass"; | |||
private static final String TRAINFILES_DIR = | |||
"data/polarity_corpus/hotel_reviews/train2"; | |||
private static final String TESTFILES_DIR = | |||
"data/polarity_corpus/hotel_reviews/test2"; | |||
private static final String TESTFILES_INFO = | |||
"data/polarity_corpus/hotel_reviews/test2.rlabelclass"; | |||
private static final String ENCODING = "GBK"; | |||
private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter(); | |||
private final SegMode segMode = SegMode.INDEX; | |||
public static void main(String[] args) { | |||
try { | |||
new ChinesePolarityBasic().run(); | |||
} catch (Throwable t) { | |||
System.out.println("Thrown: " + t); | |||
t.printStackTrace(System.out); | |||
} | |||
} | |||
public ChinesePolarityBasic() { | |||
super(); | |||
int nGram = 8; | |||
mClassifier | |||
= DynamicLMClassifier | |||
.createNGramProcess(mCategories,nGram); | |||
} | |||
private void run() throws ClassNotFoundException, | |||
IOException { | |||
train(); | |||
evaluate(); | |||
} | |||
private void train() throws IOException { | |||
FileReader input = new FileReader(new File(TRAINFILES_INFO)); | |||
List<String> trainInfos = IOUtils.readLines(input); | |||
for (String str : trainInfos){ | |||
String[] train = str.split(" "); | |||
train(train[1], new File(TRAINFILES_DIR, train[0]), ENCODING); | |||
} | |||
} | |||
private void evaluate() throws IOException { | |||
FileReader input = new FileReader(new File(TESTFILES_INFO)); | |||
List<String> trainInfos = IOUtils.readLines(input); | |||
for (String str : trainInfos){ | |||
String[] train = str.split(" "); | |||
evaluate(train[1], new File(TESTFILES_DIR, train[0]), ENCODING); | |||
} | |||
System.out.println(" # Test Cases=" | |||
+ numTests); | |||
System.out.println(" # Correct=" | |||
+ numCorrect); | |||
System.out.println(" % Correct=" | |||
+ ((double)numCorrect) | |||
/(double)numTests); | |||
} | |||
/** | |||
* 给定分类标识,给定训练文本,给定文本的编码,即可作分类训练 | |||
* 分类完成之后就会加入到分类模型中 | |||
* @throws IOException | |||
* */ | |||
private void train(String category, File trainFile, String fileEncoding) | |||
throws IOException { | |||
Classification classification = new Classification(category); | |||
String review = Files.readFromFile(trainFile, fileEncoding); | |||
//此处加入中文分词器,得到分词之后的字符串 | |||
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode); | |||
review = ""; | |||
for (SegToken seg : segTokens) { | |||
review += seg.word.getToken() + " "; | |||
} | |||
Classified<CharSequence> classified | |||
= new Classified<CharSequence>(review,classification); | |||
mClassifier.handle(classified); | |||
} | |||
/** | |||
* 给定分类标识,给定测试文本,给定文本的编码,即可作测试模型 | |||
* @throws IOException | |||
* */ | |||
private void evaluate(String category, File testFile, String fileEncoding) | |||
throws IOException { | |||
String review | |||
= Files.readFromFile(testFile, fileEncoding); | |||
//同理,这里可以加入分词器,这样可以试试效果如何。 | |||
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode); | |||
review = ""; | |||
for (SegToken seg : segTokens) { | |||
review += seg.word.getToken() + " "; | |||
} | |||
++numTests; | |||
Classification classification | |||
= mClassifier.classify(review); | |||
//得到训练结果 | |||
String resultCategory | |||
= classification.bestCategory(); | |||
if (resultCategory.equals(category)) | |||
++numCorrect; | |||
} | |||
} |
@@ -0,0 +1,94 @@ | |||
package eshore.cn.it.sentiment; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import com.aliasi.classify.Classification; | |||
import com.aliasi.classify.Classified; | |||
import com.aliasi.classify.DynamicLMClassifier; | |||
import com.aliasi.lm.NGramProcessLM; | |||
import com.aliasi.util.Files; | |||
public class PolarityBasic { | |||
File mPolarityDir; | |||
String[] mCategories; | |||
DynamicLMClassifier<NGramProcessLM> mClassifier; | |||
public PolarityBasic(String[] args) { | |||
mPolarityDir = new File("data/polarity_corpus","txt_sentoken"); | |||
mCategories = mPolarityDir.list(); | |||
int nGram = 8; | |||
mClassifier | |||
= DynamicLMClassifier | |||
.createNGramProcess(mCategories,nGram); | |||
} | |||
public static void main(String[] args) { | |||
try { | |||
new PolarityBasic(args).run(); | |||
} catch (Throwable t) { | |||
System.out.println("Thrown: " + t); | |||
t.printStackTrace(System.out); | |||
} | |||
} | |||
private void run() throws ClassNotFoundException, | |||
IOException { | |||
train(); | |||
evaluate(); | |||
} | |||
private void train() throws IOException { | |||
for (int i = 0; i < mCategories.length; ++i) { | |||
String category = mCategories[i]; | |||
Classification classification | |||
= new Classification(category); | |||
File dir = new File(mPolarityDir, mCategories[i]); | |||
File[] trainFiles = dir.listFiles(); | |||
for (int j = 0; j < trainFiles.length; ++j) { | |||
File trainFile = trainFiles[j]; | |||
if (isTrainingFile(trainFile)) { | |||
String review | |||
= Files.readFromFile(trainFile,"ISO-8859-1"); | |||
Classified<CharSequence> classified | |||
= new Classified<CharSequence>(review,classification); | |||
mClassifier.handle(classified); | |||
} | |||
} | |||
} | |||
} | |||
boolean isTrainingFile(File file) { | |||
return file.getName().charAt(2) != '9'; // test on fold 9 | |||
} | |||
void evaluate() throws IOException { | |||
int numTests = 0; | |||
int numCorrect = 0; | |||
for (int i = 0; i < mCategories.length; ++i) { | |||
String category = mCategories[i]; | |||
File file = new File(mPolarityDir,mCategories[i]); | |||
File[] testFiles = file.listFiles(); | |||
for (int j = 0; j < testFiles.length; ++j) { | |||
File testFile = testFiles[j]; | |||
if (!isTrainingFile(testFile)) { | |||
String review | |||
= Files.readFromFile(testFile,"ISO-8859-1"); | |||
++numTests; | |||
Classification classification | |||
= mClassifier.classify(review); | |||
String resultCategory | |||
= classification.bestCategory(); | |||
if (resultCategory.equals(category)) | |||
++numCorrect; | |||
} | |||
} | |||
} | |||
System.out.println(" # Test Cases=" | |||
+ numTests); | |||
System.out.println(" # Correct=" | |||
+ numCorrect); | |||
System.out.println(" % Correct=" | |||
+ ((double)numCorrect) | |||
/(double)numTests); | |||
} | |||
} |
@@ -0,0 +1,211 @@ | |||
package eshore.cn.it.sentiment; | |||
import java.io.*; | |||
import java.util.*; | |||
import org.apache.commons.io.FileUtils; | |||
import org.dom4j.Document; | |||
import org.dom4j.Element; | |||
import org.dom4j.io.SAXReader; | |||
public class Sentiment { | |||
static private HashSet<String> Negative, Positive; //两种情感词典 | |||
static private Integer NegativeDoc, PositiveDoc, UnsureDoc; //属于两种情感的文本数 - 所构建模型需要保存下的值 | |||
static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; //两种情感中所有词与他的权值 - 所构建模型需要保存下的值 | |||
static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/"; | |||
static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt"; | |||
static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt"; | |||
static final String FILE_ENCODING = "UTF-8"; | |||
public static void main(String[] args) throws Exception { | |||
// TODO 自动生成的方法存根 | |||
Sentiment Sentiment = new Sentiment(); | |||
Sentiment.Model( ); | |||
Sentiment.Save_Model(); | |||
} | |||
public void Model( ) throws Exception { | |||
this.Read_Sentiment_Dictionary(); | |||
this.Sentiment_Doc_Weight(SENTIMENT_DOC_WEIGHT_PATH); | |||
} | |||
@SuppressWarnings("resource") | |||
public void Read_Sentiment_Dictionary() throws Exception { | |||
BufferedReader buf; | |||
String str; | |||
//集合,里面元素不允许重复 | |||
Negative = new HashSet<String>(); | |||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||
while( (str = buf.readLine()) != null ) { | |||
Negative.add(str); | |||
} | |||
Positive = new HashSet<String>(); | |||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||
while( (str = buf.readLine()) != null ) { | |||
Positive.add(str); | |||
} | |||
} | |||
public void Sentiment_Doc_Weight( String DirPath ) throws Exception { | |||
File NegativeDir = new File( DirPath + "negativeout" ); | |||
String[] NegativeFiles = NegativeDir.list(); | |||
NegativeDoc = NegativeFiles.length; | |||
ArrayList<String> NegativeCurrentList = new ArrayList<String>(); | |||
for ( int i = 0; i < NegativeFiles.length; i ++ ) { | |||
System.out.println("NegativeFiles No."+(i+1)+" "+DirPath+"negativeout/"+NegativeFiles[i]); | |||
this.ReadXML(DirPath+"negativeout/"+NegativeFiles[i], NegativeCurrentList); | |||
} | |||
NegativeWeight = HashTable( NegativeCurrentList ); | |||
/**********************************************************************************************************/ | |||
File PositiveDir = new File( DirPath + "positiveout" ); | |||
String[] PositiveFiles = PositiveDir.list(); | |||
PositiveDoc = PositiveFiles.length; | |||
ArrayList<String> PositiveCurrentList = new ArrayList<String>(); | |||
for ( int i = 0; i < PositiveFiles.length; i ++ ) { | |||
System.out.println("PositiveFiles No."+(i+1)+" "+DirPath+"positiveout/"+PositiveFiles[i]); | |||
this.ReadXML(DirPath+"positiveout/"+PositiveFiles[i], PositiveCurrentList); | |||
} | |||
PositiveWeight = HashTable( PositiveCurrentList ); | |||
/*********************************************************************************************************/ | |||
File UnsureDir = new File( DirPath + "unsureout" ); | |||
String[] UnsureFiles = UnsureDir.list(); | |||
UnsureDoc = UnsureFiles.length; | |||
ArrayList<String> UnsureCurrentList = new ArrayList<String>(); | |||
for ( int i = 0; i < UnsureFiles.length; i ++ ) { | |||
System.out.println("UnsureFiles No."+(i+1)+" "+DirPath+"unsureout/"+UnsureFiles[i]); | |||
this.ReadXML(DirPath+"unsureout/"+UnsureFiles[i], UnsureCurrentList); | |||
} | |||
UnsureWeight = HashTable( UnsureCurrentList ); | |||
/********************************************************************************************************/ | |||
System.out.println("UnsureCurrent = " + UnsureCurrentList.size() + " UnsureHashTable = " + UnsureWeight.size()); | |||
System.out.println("PositiveCurrent = " + PositiveCurrentList.size() + " PositiveHashTable = " + PositiveWeight.size()); | |||
System.out.println("NegativeCurrent = " + NegativeCurrentList.size() + " NegativeHashTable = " + NegativeWeight.size()); | |||
System.out.println("NegativeDoc = " + NegativeDoc + " PositiveDoc = " + PositiveDoc + " UnsureDoc = " + UnsureDoc); | |||
} | |||
public void ReadXML( String FilePath, ArrayList<String> currentList ) throws Exception { //从指定路径读取XML文件并提取出其情感词返回 | |||
SAXReader SaxReader = new SAXReader(); | |||
Document Doc = SaxReader.read(new File(FilePath)); | |||
Element root = Doc.getRootElement(); | |||
Element content = root.element("content"); | |||
List<?> sentenses = content.elements("sentence"); //每一句话作为一项 | |||
for ( Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) { | |||
Element sentense = (Element)iter.next(); | |||
List<?> toks = sentense.elements(); | |||
for ( Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) { | |||
Element tok = (Element)iter1.next(); | |||
String Type = tok.attributeValue("type"); | |||
if ( Type.equals("group") ) { //如果是"atom"一定不存在于情感词中 | |||
GetWord( tok, currentList ); //从"group"中获取词 | |||
} | |||
} | |||
} | |||
} | |||
public void GetWord( Element root, ArrayList<String> currentList ) { //获取XML中的情感词 | |||
String Word = ""; | |||
List<?> elements = root.elements("tok"); | |||
for ( Iterator<?> iter = elements.iterator(); iter.hasNext(); ) { | |||
Element tok = (Element)iter.next(); | |||
String Type = tok.attributeValue("type"); | |||
if ( Type.compareTo("atom") == 0 ) { | |||
Word += tok.getText().trim(); | |||
} | |||
else { | |||
GetWord( tok, currentList ); | |||
} | |||
} | |||
if ( Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word)) ) { //筛选出情感词 | |||
currentList.add(Word); | |||
} | |||
} | |||
public Hashtable<String, Integer> HashTable( ArrayList<String> currentList ) { //根据文本中的情感词构建哈希表 | |||
Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>(); | |||
for ( Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) { | |||
String Word = (String)iter.next(); | |||
if ( HashTable.containsKey(Word) ) { | |||
Integer Weight = HashTable.get(Word); | |||
HashTable.put(Word, Weight+1); | |||
} | |||
else { | |||
HashTable.put(Word, 1); | |||
} | |||
} | |||
return HashTable; | |||
} | |||
@SuppressWarnings("resource") | |||
public void Save_Model( ) throws Exception { | |||
ObjectOutputStream OOS; | |||
File ModelPath = new File("Model"); | |||
File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||
File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||
File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||
if ( !ModelPath.exists() ) { ModelPath.mkdir(); } | |||
System.out.println("Saving NegativeModel..."); | |||
OOS = new ObjectOutputStream( new FileOutputStream( NegativeModel ) ); //对象流直接写入 | |||
OOS.writeObject(NegativeDoc); | |||
OOS.writeObject(NegativeWeight); | |||
System.out.println("Saving PositiveModel..."); | |||
OOS = new ObjectOutputStream( new FileOutputStream( PositiveModel ) ); | |||
OOS.writeObject(PositiveDoc); | |||
OOS.writeObject(PositiveWeight); | |||
System.out.println("Saving UnsureModel..."); | |||
OOS = new ObjectOutputStream( new FileOutputStream( UnsureModel ) ); | |||
OOS.writeObject(UnsureDoc); | |||
OOS.writeObject(UnsureWeight); | |||
Enumeration<String> Keys; | |||
System.out.println("Saving NegativeWeight..."); | |||
Keys = NegativeWeight.keys(); | |||
while( Keys.hasMoreElements() ) { | |||
String Key = Keys.nextElement(); | |||
FileUtils.writeStringToFile(new File("Model", "NegativeWeight.txt"), Key+"\t\t\t"+NegativeWeight.get(Key)+"\r\n", "UTF-8", true); | |||
} | |||
System.out.println("Saving PositiveWeight..."); | |||
Keys = PositiveWeight.keys(); | |||
while( Keys.hasMoreElements() ) { | |||
String Key = Keys.nextElement(); | |||
FileUtils.writeStringToFile(new File("Model", "PositiveWeight.txt"), Key+"\t\t\t"+PositiveWeight.get(Key)+"\r\n", "UTF-8", true); | |||
} | |||
System.out.println("Saving UnsureWeight..."); | |||
Keys = UnsureWeight.keys(); | |||
while( Keys.hasMoreElements() ) { | |||
String Key = Keys.nextElement(); | |||
FileUtils.writeStringToFile(new File("Model", "UnsureWeight.txt"), Key+"\t\t\t"+UnsureWeight.get(Key)+"\r\n", "UTF-8", true); | |||
} | |||
System.out.println("Save Success!"); | |||
} | |||
} |
@@ -0,0 +1,244 @@ | |||
package eshore.cn.it.sentiment; | |||
import java.io.*; | |||
import java.util.*; | |||
import org.apache.commons.io.FileUtils; | |||
import org.dom4j.Document; | |||
import org.dom4j.Element; | |||
import org.dom4j.io.SAXReader; | |||
public class SentimentTest { | |||
static private HashSet<String> Negative, Positive; //两种情感词典 | |||
static private Integer NegativeDoc, PositiveDoc, UnsureDoc; //属于两种情感的文本数 - 所构建模型需要保存下的值 | |||
static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; //两种情感中所有词与他的权值 - 所构建模型需要保存下的值 | |||
static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/negativeout/"; | |||
static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt"; | |||
static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt"; | |||
static final String FILE_ENCODING = "UTF-8"; | |||
public static void main(String[] args) throws Exception { | |||
// TODO 自动生成的方法存根 | |||
SentimentTest Sentiment_Test = new SentimentTest(); | |||
Sentiment_Test.Read_Model(); //读取模型 | |||
Sentiment_Test.Classify_Directory(SENTIMENT_DOC_WEIGHT_PATH); | |||
} | |||
@SuppressWarnings({ "resource", "unchecked" }) | |||
public void Read_Model() throws Exception { | |||
this.Read_Sentiment_Dictionary(); | |||
ObjectInputStream OIS; //对象流直接读入 | |||
File ModelPath = new File("Model"); | |||
File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||
File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||
File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||
System.out.println("Reading NegativeModel..."); | |||
OIS = new ObjectInputStream( new FileInputStream( NegativeModel ) ); | |||
NegativeDoc = (Integer) OIS.readObject(); | |||
NegativeWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||
System.out.println("Reading PositiveModel..."); | |||
OIS = new ObjectInputStream( new FileInputStream( PositiveModel ) ); | |||
PositiveDoc = (Integer) OIS.readObject(); | |||
PositiveWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||
System.out.println("Reading UnsureModel..."); | |||
OIS = new ObjectInputStream( new FileInputStream( UnsureModel ) ); | |||
UnsureDoc = (Integer) OIS.readObject(); | |||
UnsureWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||
System.out.println("Read Success."); | |||
} | |||
@SuppressWarnings("resource") | |||
public void Read_Sentiment_Dictionary( ) throws Exception { //读入情感词典 | |||
BufferedReader buf; | |||
String str; | |||
Negative = new HashSet<String>(); | |||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||
while( (str = buf.readLine()) != null ) { | |||
Negative.add(str); | |||
} | |||
Positive = new HashSet<String>(); | |||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||
while( (str = buf.readLine()) != null ) { | |||
Positive.add(str); | |||
} | |||
} | |||
public void Classify_Directory( String DirectoryPath ) throws Exception { | |||
int PositiveNum = 0, NegativeNum = 0, UnsureNum = 0; | |||
String[] Text_Path = new File( DirectoryPath ).list(); | |||
for ( int i = 0; i < Text_Path.length; i ++ ) { | |||
Classify( DirectoryPath+Text_Path[i] ); | |||
double Ans = Classify( DirectoryPath+Text_Path[i] ); //对当前目录下的每一个文件进行测试 | |||
if ( Ans < 0 ) { //根据测试结果将测试文本进行分类 | |||
FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Positive"), Text_Path[i])); | |||
PositiveNum ++; | |||
} | |||
else if ( Ans > 0 ) { | |||
FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Negative"), Text_Path[i])); | |||
NegativeNum ++; | |||
} | |||
else { | |||
FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Unsure"), Text_Path[i])); | |||
UnsureNum ++; | |||
} | |||
System.out.print( "No." + (i+1) + " " + Text_Path[i] + ": " ); | |||
if ( Ans < 0 ) { System.out.println("Positive"); } | |||
else if ( Ans > 0 ) { System.out.println("Negative"); } | |||
else { System.out.println("Unsure"); } | |||
} | |||
System.out.println("End."); | |||
System.out.println("NegativeNum = " + NegativeNum + " PositiveNum = " + PositiveNum + " UnsureNum = " + UnsureNum); | |||
} | |||
public double Classify( String FilePath ) throws Exception { | |||
Hashtable<String, Integer> FileHashTable = Read_TestFile( FilePath ); | |||
Enumeration<String> Keys; | |||
double NegativeAns = 1, PositiveAns = 1; | |||
Keys = FileHashTable.keys(); | |||
while( Keys.hasMoreElements() ) { | |||
String Word = Keys.nextElement(); | |||
NegativeAns *= ( Math.pow(this.PostProbability(Word, NegativeWeight), FileHashTable.get(Word)) ); | |||
} | |||
NegativeAns *= this.PriorProbability(NegativeDoc); | |||
Keys = FileHashTable.keys(); | |||
while( Keys.hasMoreElements() ) { | |||
String Word = Keys.nextElement(); | |||
PositiveAns *= ( Math.pow(this.PostProbability(Word, PositiveWeight), FileHashTable.get(Word)) ); | |||
} | |||
PositiveAns *= this.PriorProbability(PositiveDoc); | |||
return ( NegativeAns-PositiveAns ); | |||
} | |||
public Hashtable<String, Integer> Read_TestFile( String FilePath ) throws Exception { | |||
ArrayList<String> FileCurrentList = new ArrayList<String>(); | |||
ReadXML( FilePath, FileCurrentList ); | |||
Hashtable<String, Integer> FileHashTable = HashTable( FileCurrentList ); | |||
return FileHashTable; | |||
} | |||
public void ReadXML( String FilePath, ArrayList<String> currentList ) throws Exception { //从指定路径读取XML文件并提取出其情感词返回 | |||
SAXReader SaxReader = new SAXReader(); | |||
Document Doc = SaxReader.read(new File(FilePath)); | |||
Element root = Doc.getRootElement(); | |||
Element content = root.element("content"); | |||
List<?> sentenses = content.elements("sentence"); //每一句话作为一项 | |||
for ( Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) { | |||
Element sentense = (Element)iter.next(); | |||
List<?> toks = sentense.elements(); | |||
for ( Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) { | |||
Element tok = (Element)iter1.next(); | |||
String Type = tok.attributeValue("type"); | |||
if ( Type.equals("group") ) { //如果是"atom"一定不存在于情感词中 | |||
GetWord( tok, currentList ); //从"group"中获取词 | |||
} | |||
} | |||
} | |||
} | |||
public void GetWord( Element root, ArrayList<String> currentList ) { //获取XML中的情感词 | |||
String Word = ""; | |||
List<?> elements = root.elements("tok"); | |||
for ( Iterator<?> iter = elements.iterator(); iter.hasNext(); ) { | |||
Element tok = (Element)iter.next(); | |||
String Type = tok.attributeValue("type"); | |||
if ( Type.compareTo("atom") == 0 ) { | |||
Word += tok.getText().trim(); | |||
} | |||
else { | |||
GetWord( tok, currentList ); | |||
} | |||
} | |||
if ( Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word)) ) { //筛选出情感词 | |||
currentList.add(Word); | |||
} | |||
} | |||
public Hashtable<String, Integer> HashTable( ArrayList<String> currentList ) { //根据文本中的情感词构建哈希表 | |||
Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>(); | |||
for ( Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) { | |||
String Word = (String)iter.next(); | |||
if ( HashTable.containsKey(Word) ) { | |||
Integer Weight = HashTable.get(Word); | |||
HashTable.put(Word, Weight+1); | |||
} | |||
else { | |||
HashTable.put(Word, 1); | |||
} | |||
} | |||
return HashTable; | |||
} | |||
public double PriorProbability( Integer SentimentDoc ) { | |||
double Ans = 1; | |||
Ans = ( (double)SentimentDoc/( (double)NegativeDoc+(double)PositiveDoc+(double)UnsureDoc ) ); | |||
return Ans; | |||
} | |||
public double PostProbability( String Word, Hashtable<String, Integer> SentimentWeight ) { | |||
double Ans, V, E; | |||
double Weight = 0, Weights = 0; | |||
if ( SentimentWeight.containsKey(Word) ) | |||
Weight = (double)SentimentWeight.get(Word); | |||
Weights = PostWeights( SentimentWeight ); | |||
V = PostWeights( NegativeWeight ) + PostWeights( PositiveWeight ) + PostWeights( UnsureWeight ); | |||
E = 1/Math.abs(V); | |||
Ans = ( Weight + E )/( Weights + E*Math.abs(V) ); | |||
return Ans; | |||
} | |||
public double PostWeights( Hashtable<String, Integer> SentimentWeight ) { | |||
double Weights = 0; | |||
Enumeration<String> Keys; | |||
Keys = SentimentWeight.keys(); | |||
while( Keys.hasMoreElements() ) { | |||
String Key = Keys.nextElement(); | |||
Weights += (double)SentimentWeight.get(Key); | |||
} | |||
return Weights; | |||
} | |||
} |