@@ -0,0 +1,7 @@ | |||||
.project | |||||
.classpath | |||||
target/ | |||||
Result/ | |||||
Model/ | |||||
.settings/ |
@@ -0,0 +1,15 @@ | |||||
Sentiment analysis tools based on natural language processing

This project depends on data.zip under the data directory; first extract data/data.zip into that same directory.

1. Sentiment analysis based on a sentiment dictionary and a Bayesian model
   Main program: eshore.cn.it.sentiment.Sentiment. This class builds a model from the
   positive and negative words under data/Sentiment_Dictionary.
   Testing: eshore.cn.it.sentiment.SentimentTest. This class classifies the sentiment of the
   blog posts in one of the folders under data/500trainblogxml.
2. Sentiment analysis using LingPipe's classification module directly
   Run eshore.cn.it.sentiment.ChinesePolarityBasic. The program trains on
   data/polarity_corpus/hotel_reviews/train2, then automatically evaluates on
   data/polarity_corpus/hotel_reviews/test2 and prints the test results.
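
One possible way to run either pipeline from the command line is the Maven exec plugin; this is only a sketch, assuming the project has been compiled and the commands are run from the project root so the relative data/ paths resolve (the classpathScope override may be needed so the system-scoped LingPipe jar is picked up):

    mvn compile
    mvn exec:java -Dexec.mainClass="eshore.cn.it.sentiment.Sentiment"
    mvn exec:java -Dexec.mainClass="eshore.cn.it.sentiment.SentimentTest"
    mvn exec:java -Dexec.mainClass="eshore.cn.it.sentiment.ChinesePolarityBasic" -Dexec.classpathScope=compile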
@@ -0,0 +1,5 @@ | |||||
500trainblogxml/ | |||||
nerws_corpus/ | |||||
output/ | |||||
polarity_corpus/ | |||||
Sentiment_Dictionary/ |
@@ -0,0 +1,58 @@ | |||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |||||
<modelVersion>4.0.0</modelVersion> | |||||
<groupId>eshore.cn.it</groupId> | |||||
<artifactId>nlp-sentiment</artifactId> | |||||
<version>0.0.1-SNAPSHOT</version> | |||||
<packaging>jar</packaging> | |||||
<name>nlp-sentiment</name> | |||||
<url>http://maven.apache.org</url> | |||||
<properties> | |||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |||||
<commons.io.version>2.4</commons.io.version> | |||||
<dom4j.version>1.6.1</dom4j.version> | |||||
<lingpipe.version>4.1.0</lingpipe.version> | |||||
<jieba.version>1.0.0</jieba.version> | |||||
		<!-- Jars that cannot be downloaded from Maven Central are kept in this directory -->
<maven.libs.home>F:/java_git_projects/nlp-sentiment/libs</maven.libs.home> | |||||
</properties> | |||||
<dependencies> | |||||
<dependency> | |||||
<groupId>junit</groupId> | |||||
<artifactId>junit</artifactId> | |||||
<version>3.8.1</version> | |||||
<scope>test</scope> | |||||
</dependency> | |||||
<dependency> | |||||
<groupId>commons-io</groupId> | |||||
<artifactId>commons-io</artifactId> | |||||
<version>${commons.io.version}</version> | |||||
</dependency> | |||||
<dependency> | |||||
<groupId>dom4j</groupId> | |||||
<artifactId>dom4j</artifactId> | |||||
<version>${dom4j.version}</version> | |||||
</dependency> | |||||
		<!-- The LingPipe core jar must be downloaded manually from the LingPipe website -->
<dependency> | |||||
<groupId>com.aliasi</groupId> | |||||
<artifactId>lingpipe</artifactId> | |||||
<version>${lingpipe.version}</version> | |||||
<scope>system</scope> | |||||
<systemPath>${maven.libs.home}/lingpipe-4.1.0.jar</systemPath> | |||||
</dependency> | |||||
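		<!-- Note: maven.libs.home above points at a machine-specific folder; adjust it to wherever the
		     downloaded lingpipe-4.1.0.jar is kept. As an alternative to the system scope, the jar could
		     be installed into the local repository (a possible command, not part of this build):
		     mvn install:install-file -Dfile=lingpipe-4.1.0.jar -DgroupId=com.aliasi -DartifactId=lingpipe -Dversion=4.1.0 -Dpackaging=jar -->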
		<!-- jieba Chinese word segmenter -->
<dependency> | |||||
<groupId>com.huaban</groupId> | |||||
<artifactId>jieba-analysis</artifactId> | |||||
<version>${jieba.version}</version> | |||||
</dependency> | |||||
</dependencies> | |||||
</project> |
@@ -0,0 +1,146 @@ | |||||
package eshore.cn.it.sentiment; | |||||
import java.io.File; | |||||
import java.io.FileReader; | |||||
import java.io.IOException; | |||||
import java.util.List; | |||||
import org.apache.commons.io.IOUtils; | |||||
import com.aliasi.classify.Classification; | |||||
import com.aliasi.classify.Classified; | |||||
import com.aliasi.classify.DynamicLMClassifier; | |||||
import com.aliasi.lm.NGramProcessLM; | |||||
import com.aliasi.util.Files; | |||||
import com.huaban.analysis.jieba.JiebaSegmenter; | |||||
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; | |||||
import com.huaban.analysis.jieba.SegToken; | |||||
/**
 * ChinesePolarityBasic is an example of Chinese sentiment (polarity) prediction with LingPipe.
 * LingPipe's dynamic classifiers are a good fit for incremental training.
 * @author clebeg 2015-03-13
 * @version 0.0.1
 * */
public class ChinesePolarityBasic { | |||||
private String[] mCategories = new String[]{"+1", "-1"}; | |||||
// the classification model
private DynamicLMClassifier<NGramProcessLM> mClassifier; | |||||
private int numTests = 0; | |||||
private int numCorrect = 0; | |||||
private static final String TRAINFILES_INFO = | |||||
"data/polarity_corpus/hotel_reviews/train2.rlabelclass"; | |||||
private static final String TRAINFILES_DIR = | |||||
"data/polarity_corpus/hotel_reviews/train2"; | |||||
private static final String TESTFILES_DIR = | |||||
"data/polarity_corpus/hotel_reviews/test2"; | |||||
private static final String TESTFILES_INFO = | |||||
"data/polarity_corpus/hotel_reviews/test2.rlabelclass"; | |||||
private static final String ENCODING = "GBK"; | |||||
private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter(); | |||||
private final SegMode segMode = SegMode.INDEX; | |||||
public static void main(String[] args) { | |||||
try { | |||||
new ChinesePolarityBasic().run(); | |||||
} catch (Throwable t) { | |||||
System.out.println("Thrown: " + t); | |||||
t.printStackTrace(System.out); | |||||
} | |||||
} | |||||
public ChinesePolarityBasic() { | |||||
super(); | |||||
int nGram = 8; | |||||
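// createNGramProcess builds one character-level n-gram language model per category
// ("+1" = positive, "-1" = negative); 8 is the n-gram length used in this example.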
mClassifier | |||||
= DynamicLMClassifier | |||||
.createNGramProcess(mCategories,nGram); | |||||
} | |||||
private void run() throws ClassNotFoundException, | |||||
IOException { | |||||
train(); | |||||
evaluate(); | |||||
} | |||||
private void train() throws IOException { | |||||
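// Each line of the .rlabelclass file is expected to hold "<file name> <category>",
// where the category is "+1" or "-1" and the file itself lives under TRAINFILES_DIR.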
FileReader input = new FileReader(new File(TRAINFILES_INFO));
List<String> trainInfos = IOUtils.readLines(input);
input.close();
for (String str : trainInfos) {
String[] train = str.split(" ");
train(train[1], new File(TRAINFILES_DIR, train[0]), ENCODING);
}
} | |||||
private void evaluate() throws IOException { | |||||
FileReader input = new FileReader(new File(TESTFILES_INFO));
List<String> testInfos = IOUtils.readLines(input);
input.close();
for (String str : testInfos) {
String[] test = str.split(" ");
evaluate(test[1], new File(TESTFILES_DIR, test[0]), ENCODING);
}
System.out.println(" # Test Cases=" | |||||
+ numTests); | |||||
System.out.println(" # Correct=" | |||||
+ numCorrect); | |||||
System.out.println(" % Correct=" | |||||
+ ((double)numCorrect) | |||||
/(double)numTests); | |||||
} | |||||
/**
 * Train on a single file: given the category label, the training file and its encoding,
 * the text is segmented and handed to the classifier as a training instance.
 * @throws IOException
 * */
private void train(String category, File trainFile, String fileEncoding) | |||||
throws IOException { | |||||
Classification classification = new Classification(category); | |||||
String review = Files.readFromFile(trainFile, fileEncoding); | |||||
// run the Chinese word segmenter and rebuild the text as space-separated tokens
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode);
StringBuilder tokens = new StringBuilder();
for (SegToken seg : segTokens) {
tokens.append(seg.word.getToken()).append(' ');
}
review = tokens.toString();
Classified<CharSequence> classified | |||||
= new Classified<CharSequence>(review,classification); | |||||
mClassifier.handle(classified); | |||||
} | |||||
/**
 * Evaluate a single file: given the expected category, the test file and its encoding,
 * classify the text and count whether the prediction is correct.
 * @throws IOException
 * */
private void evaluate(String category, File testFile, String fileEncoding) | |||||
throws IOException { | |||||
String review | |||||
= Files.readFromFile(testFile, fileEncoding); | |||||
// segment the test text the same way as the training text so the tokenisations match
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode);
StringBuilder tokens = new StringBuilder();
for (SegToken seg : segTokens) {
tokens.append(seg.word.getToken()).append(' ');
}
review = tokens.toString();
++numTests; | |||||
Classification classification | |||||
= mClassifier.classify(review); | |||||
// pick the highest-scoring category
String resultCategory | |||||
= classification.bestCategory(); | |||||
if (resultCategory.equals(category)) | |||||
++numCorrect; | |||||
} | |||||
} |
@@ -0,0 +1,94 @@ | |||||
package eshore.cn.it.sentiment; | |||||
import java.io.File; | |||||
import java.io.IOException; | |||||
import com.aliasi.classify.Classification; | |||||
import com.aliasi.classify.Classified; | |||||
import com.aliasi.classify.DynamicLMClassifier; | |||||
import com.aliasi.lm.NGramProcessLM; | |||||
import com.aliasi.util.Files; | |||||
public class PolarityBasic { | |||||
File mPolarityDir; | |||||
String[] mCategories; | |||||
DynamicLMClassifier<NGramProcessLM> mClassifier; | |||||
public PolarityBasic(String[] args) { | |||||
mPolarityDir = new File("data/polarity_corpus","txt_sentoken"); | |||||
mCategories = mPolarityDir.list(); | |||||
int nGram = 8; | |||||
mClassifier | |||||
= DynamicLMClassifier | |||||
.createNGramProcess(mCategories,nGram); | |||||
} | |||||
public static void main(String[] args) { | |||||
try { | |||||
new PolarityBasic(args).run(); | |||||
} catch (Throwable t) { | |||||
System.out.println("Thrown: " + t); | |||||
t.printStackTrace(System.out); | |||||
} | |||||
} | |||||
private void run() throws ClassNotFoundException, | |||||
IOException { | |||||
train(); | |||||
evaluate(); | |||||
} | |||||
private void train() throws IOException { | |||||
for (int i = 0; i < mCategories.length; ++i) { | |||||
String category = mCategories[i]; | |||||
Classification classification | |||||
= new Classification(category); | |||||
File dir = new File(mPolarityDir, mCategories[i]); | |||||
File[] trainFiles = dir.listFiles(); | |||||
for (int j = 0; j < trainFiles.length; ++j) { | |||||
File trainFile = trainFiles[j]; | |||||
if (isTrainingFile(trainFile)) { | |||||
String review | |||||
= Files.readFromFile(trainFile,"ISO-8859-1"); | |||||
Classified<CharSequence> classified | |||||
= new Classified<CharSequence>(review,classification); | |||||
mClassifier.handle(classified); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
boolean isTrainingFile(File file) { | |||||
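// Files in the Pang/Lee txt_sentoken corpus are named cvNNN_*.txt; everything outside the
// cv9xx range is used for training and fold 9 (cv9xx) is held out for testing (assumed naming).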
return file.getName().charAt(2) != '9'; // test on fold 9 | |||||
} | |||||
void evaluate() throws IOException { | |||||
int numTests = 0; | |||||
int numCorrect = 0; | |||||
for (int i = 0; i < mCategories.length; ++i) { | |||||
String category = mCategories[i]; | |||||
File file = new File(mPolarityDir,mCategories[i]); | |||||
File[] testFiles = file.listFiles(); | |||||
for (int j = 0; j < testFiles.length; ++j) { | |||||
File testFile = testFiles[j]; | |||||
if (!isTrainingFile(testFile)) { | |||||
String review | |||||
= Files.readFromFile(testFile,"ISO-8859-1"); | |||||
++numTests; | |||||
Classification classification | |||||
= mClassifier.classify(review); | |||||
String resultCategory | |||||
= classification.bestCategory(); | |||||
if (resultCategory.equals(category)) | |||||
++numCorrect; | |||||
} | |||||
} | |||||
} | |||||
System.out.println(" # Test Cases=" | |||||
+ numTests); | |||||
System.out.println(" # Correct=" | |||||
+ numCorrect); | |||||
System.out.println(" % Correct=" | |||||
+ ((double)numCorrect) | |||||
/(double)numTests); | |||||
} | |||||
} |
@@ -0,0 +1,211 @@ | |||||
package eshore.cn.it.sentiment; | |||||
import java.io.*; | |||||
import java.util.*; | |||||
import org.apache.commons.io.FileUtils; | |||||
import org.dom4j.Document; | |||||
import org.dom4j.Element; | |||||
import org.dom4j.io.SAXReader; | |||||
public class Sentiment { | |||||
static private HashSet<String> Negative, Positive; // positive and negative sentiment dictionaries
static private Integer NegativeDoc, PositiveDoc, UnsureDoc; // number of documents per sentiment class - part of the persisted model
static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; // per-class sentiment words and their weights (counts) - part of the persisted model
static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/"; | |||||
static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt"; | |||||
static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt"; | |||||
static final String FILE_ENCODING = "UTF-8"; | |||||
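// The "model" built here is deliberately simple: per-class document counts (used later as priors)
// plus per-class frequency tables of dictionary sentiment words (used later as likelihood counts).
// SentimentTest loads these serialized objects and applies a Naive Bayes decision rule to them.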
public static void main(String[] args) throws Exception { | |||||
Sentiment sentiment = new Sentiment();
sentiment.Model();
sentiment.Save_Model();
} | |||||
public void Model( ) throws Exception { | |||||
this.Read_Sentiment_Dictionary(); | |||||
this.Sentiment_Doc_Weight(SENTIMENT_DOC_WEIGHT_PATH); | |||||
} | |||||
@SuppressWarnings("resource") | |||||
public void Read_Sentiment_Dictionary() throws Exception { | |||||
BufferedReader buf; | |||||
String str; | |||||
// a set, so duplicate dictionary entries are ignored
Negative = new HashSet<String>(); | |||||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||||
while( (str = buf.readLine()) != null ) { | |||||
Negative.add(str); | |||||
} | |||||
Positive = new HashSet<String>(); | |||||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||||
while( (str = buf.readLine()) != null ) { | |||||
Positive.add(str); | |||||
} | |||||
} | |||||
public void Sentiment_Doc_Weight( String DirPath ) throws Exception { | |||||
File NegativeDir = new File( DirPath + "negativeout" ); | |||||
String[] NegativeFiles = NegativeDir.list(); | |||||
NegativeDoc = NegativeFiles.length; | |||||
ArrayList<String> NegativeCurrentList = new ArrayList<String>(); | |||||
for ( int i = 0; i < NegativeFiles.length; i ++ ) { | |||||
System.out.println("NegativeFiles No."+(i+1)+" "+DirPath+"negativeout/"+NegativeFiles[i]); | |||||
this.ReadXML(DirPath+"negativeout/"+NegativeFiles[i], NegativeCurrentList); | |||||
} | |||||
NegativeWeight = HashTable( NegativeCurrentList ); | |||||
/**********************************************************************************************************/ | |||||
File PositiveDir = new File( DirPath + "positiveout" ); | |||||
String[] PositiveFiles = PositiveDir.list(); | |||||
PositiveDoc = PositiveFiles.length; | |||||
ArrayList<String> PositiveCurrentList = new ArrayList<String>(); | |||||
for ( int i = 0; i < PositiveFiles.length; i ++ ) { | |||||
System.out.println("PositiveFiles No."+(i+1)+" "+DirPath+"positiveout/"+PositiveFiles[i]); | |||||
this.ReadXML(DirPath+"positiveout/"+PositiveFiles[i], PositiveCurrentList); | |||||
} | |||||
PositiveWeight = HashTable( PositiveCurrentList ); | |||||
/*********************************************************************************************************/ | |||||
File UnsureDir = new File( DirPath + "unsureout" ); | |||||
String[] UnsureFiles = UnsureDir.list(); | |||||
UnsureDoc = UnsureFiles.length; | |||||
ArrayList<String> UnsureCurrentList = new ArrayList<String>(); | |||||
for ( int i = 0; i < UnsureFiles.length; i ++ ) { | |||||
System.out.println("UnsureFiles No."+(i+1)+" "+DirPath+"unsureout/"+UnsureFiles[i]); | |||||
this.ReadXML(DirPath+"unsureout/"+UnsureFiles[i], UnsureCurrentList); | |||||
} | |||||
UnsureWeight = HashTable( UnsureCurrentList ); | |||||
/********************************************************************************************************/ | |||||
System.out.println("UnsureCurrent = " + UnsureCurrentList.size() + " UnsureHashTable = " + UnsureWeight.size()); | |||||
System.out.println("PositiveCurrent = " + PositiveCurrentList.size() + " PositiveHashTable = " + PositiveWeight.size()); | |||||
System.out.println("NegativeCurrent = " + NegativeCurrentList.size() + " NegativeHashTable = " + NegativeWeight.size()); | |||||
System.out.println("NegativeDoc = " + NegativeDoc + " PositiveDoc = " + PositiveDoc + " UnsureDoc = " + UnsureDoc); | |||||
} | |||||
public void ReadXML( String FilePath, ArrayList<String> currentList ) throws Exception { // read the XML file at FilePath and collect its sentiment words into currentList
SAXReader SaxReader = new SAXReader(); | |||||
Document Doc = SaxReader.read(new File(FilePath)); | |||||
Element root = Doc.getRootElement(); | |||||
Element content = root.element("content"); | |||||
List<?> sentenses = content.elements("sentence"); // one element per sentence
for ( Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) { | |||||
Element sentense = (Element)iter.next(); | |||||
List<?> toks = sentense.elements(); | |||||
for ( Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) { | |||||
Element tok = (Element)iter1.next(); | |||||
String Type = tok.attributeValue("type"); | |||||
if ( Type.equals("group") ) { // a bare "atom" token is never a sentiment word here
GetWord( tok, currentList ); // extract candidate words from the "group" element
} | |||||
} | |||||
} | |||||
} | |||||
public void GetWord( Element root, ArrayList<String> currentList ) { // recursively collect sentiment words from an XML "group" element
String Word = ""; | |||||
List<?> elements = root.elements("tok"); | |||||
for ( Iterator<?> iter = elements.iterator(); iter.hasNext(); ) { | |||||
Element tok = (Element)iter.next(); | |||||
String Type = tok.attributeValue("type"); | |||||
if ( Type.compareTo("atom") == 0 ) { | |||||
Word += tok.getText().trim(); | |||||
} | |||||
else { | |||||
GetWord( tok, currentList ); | |||||
} | |||||
} | |||||
if ( Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word)) ) { // keep only words listed in the sentiment dictionaries
currentList.add(Word); | |||||
} | |||||
} | |||||
public Hashtable<String, Integer> HashTable( ArrayList<String> currentList ) { // build a word -> frequency table from the collected sentiment words
Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>(); | |||||
for ( Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) { | |||||
String Word = (String)iter.next(); | |||||
if ( HashTable.containsKey(Word) ) { | |||||
Integer Weight = HashTable.get(Word); | |||||
HashTable.put(Word, Weight+1); | |||||
} | |||||
else { | |||||
HashTable.put(Word, 1); | |||||
} | |||||
} | |||||
return HashTable; | |||||
} | |||||
@SuppressWarnings("resource") | |||||
public void Save_Model( ) throws Exception { | |||||
ObjectOutputStream OOS; | |||||
File ModelPath = new File("Model"); | |||||
File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||||
File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||||
File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||||
if ( !ModelPath.exists() ) { ModelPath.mkdir(); } | |||||
System.out.println("Saving NegativeModel..."); | |||||
OOS = new ObjectOutputStream( new FileOutputStream( NegativeModel ) ); //对象流直接写入 | |||||
OOS.writeObject(NegativeDoc); | |||||
OOS.writeObject(NegativeWeight); | |||||
System.out.println("Saving PositiveModel..."); | |||||
OOS = new ObjectOutputStream( new FileOutputStream( PositiveModel ) ); | |||||
OOS.writeObject(PositiveDoc); | |||||
OOS.writeObject(PositiveWeight); | |||||
System.out.println("Saving UnsureModel..."); | |||||
OOS = new ObjectOutputStream( new FileOutputStream( UnsureModel ) ); | |||||
OOS.writeObject(UnsureDoc); | |||||
OOS.writeObject(UnsureWeight); | |||||
Enumeration<String> Keys; | |||||
System.out.println("Saving NegativeWeight..."); | |||||
Keys = NegativeWeight.keys(); | |||||
while( Keys.hasMoreElements() ) { | |||||
String Key = Keys.nextElement(); | |||||
FileUtils.writeStringToFile(new File("Model", "NegativeWeight.txt"), Key+"\t\t\t"+NegativeWeight.get(Key)+"\r\n", "UTF-8", true); | |||||
} | |||||
System.out.println("Saving PositiveWeight..."); | |||||
Keys = PositiveWeight.keys(); | |||||
while( Keys.hasMoreElements() ) { | |||||
String Key = Keys.nextElement(); | |||||
FileUtils.writeStringToFile(new File("Model", "PositiveWeight.txt"), Key+"\t\t\t"+PositiveWeight.get(Key)+"\r\n", "UTF-8", true); | |||||
} | |||||
System.out.println("Saving UnsureWeight..."); | |||||
Keys = UnsureWeight.keys(); | |||||
while( Keys.hasMoreElements() ) { | |||||
String Key = Keys.nextElement(); | |||||
FileUtils.writeStringToFile(new File("Model", "UnsureWeight.txt"), Key+"\t\t\t"+UnsureWeight.get(Key)+"\r\n", "UTF-8", true); | |||||
} | |||||
System.out.println("Save Success!"); | |||||
} | |||||
} |
@@ -0,0 +1,244 @@ | |||||
package eshore.cn.it.sentiment; | |||||
import java.io.*; | |||||
import java.util.*; | |||||
import org.apache.commons.io.FileUtils; | |||||
import org.dom4j.Document; | |||||
import org.dom4j.Element; | |||||
import org.dom4j.io.SAXReader; | |||||
public class SentimentTest { | |||||
static private HashSet<String> Negative, Positive; // positive and negative sentiment dictionaries
static private Integer NegativeDoc, PositiveDoc, UnsureDoc; // number of documents per sentiment class - part of the persisted model
static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; // per-class sentiment words and their weights (counts) - part of the persisted model
static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/negativeout/"; | |||||
static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt"; | |||||
static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt"; | |||||
static final String FILE_ENCODING = "UTF-8"; | |||||
public static void main(String[] args) throws Exception { | |||||
SentimentTest sentimentTest = new SentimentTest();
sentimentTest.Read_Model(); // load the previously saved model
sentimentTest.Classify_Directory(SENTIMENT_DOC_WEIGHT_PATH);
} | |||||
@SuppressWarnings({ "resource", "unchecked" }) | |||||
public void Read_Model() throws Exception { | |||||
this.Read_Sentiment_Dictionary(); | |||||
ObjectInputStream OIS; // read back directly as serialized objects
File ModelPath = new File("Model"); | |||||
File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||||
File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||||
File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||||
System.out.println("Reading NegativeModel..."); | |||||
OIS = new ObjectInputStream( new FileInputStream( NegativeModel ) ); | |||||
NegativeDoc = (Integer) OIS.readObject(); | |||||
NegativeWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||||
System.out.println("Reading PositiveModel..."); | |||||
OIS = new ObjectInputStream( new FileInputStream( PositiveModel ) ); | |||||
PositiveDoc = (Integer) OIS.readObject(); | |||||
PositiveWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||||
System.out.println("Reading UnsureModel..."); | |||||
OIS = new ObjectInputStream( new FileInputStream( UnsureModel ) ); | |||||
UnsureDoc = (Integer) OIS.readObject(); | |||||
UnsureWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||||
System.out.println("Read Success."); | |||||
} | |||||
@SuppressWarnings("resource") | |||||
public void Read_Sentiment_Dictionary( ) throws Exception { // load the sentiment dictionaries
BufferedReader buf; | |||||
String str; | |||||
Negative = new HashSet<String>(); | |||||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||||
while( (str = buf.readLine()) != null ) { | |||||
Negative.add(str); | |||||
} | |||||
Positive = new HashSet<String>(); | |||||
buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||||
while( (str = buf.readLine()) != null ) { | |||||
Positive.add(str); | |||||
} | |||||
} | |||||
public void Classify_Directory( String DirectoryPath ) throws Exception { | |||||
int PositiveNum = 0, NegativeNum = 0, UnsureNum = 0; | |||||
String[] Text_Path = new File( DirectoryPath ).list(); | |||||
for ( int i = 0; i < Text_Path.length; i ++ ) { | |||||
double Ans = Classify( DirectoryPath+Text_Path[i] ); // classify each file in the directory
if ( Ans < 0 ) { // copy the file into Result/Positive, Result/Negative or Result/Unsure according to the prediction
FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Positive"), Text_Path[i])); | |||||
PositiveNum ++; | |||||
} | |||||
else if ( Ans > 0 ) { | |||||
FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Negative"), Text_Path[i])); | |||||
NegativeNum ++; | |||||
} | |||||
else { | |||||
FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Unsure"), Text_Path[i])); | |||||
UnsureNum ++; | |||||
} | |||||
System.out.print( "No." + (i+1) + " " + Text_Path[i] + ": " ); | |||||
if ( Ans < 0 ) { System.out.println("Positive"); } | |||||
else if ( Ans > 0 ) { System.out.println("Negative"); } | |||||
else { System.out.println("Unsure"); } | |||||
} | |||||
System.out.println("End."); | |||||
System.out.println("NegativeNum = " + NegativeNum + " PositiveNum = " + PositiveNum + " UnsureNum = " + UnsureNum); | |||||
} | |||||
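/**
 * Score one document with the Naive Bayes decision rule: for each class c the code computes
 * P(c) * prod_w P(w|c)^count(w) over the document's sentiment words, then returns
 * score(negative) - score(positive), so a negative return value means "more likely positive".
 * Multiplying many small probabilities can underflow on long documents; summing log
 * probabilities would be the more robust variant, but is not what this code does.
 */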
public double Classify( String FilePath ) throws Exception { | |||||
Hashtable<String, Integer> FileHashTable = Read_TestFile( FilePath ); | |||||
Enumeration<String> Keys; | |||||
double NegativeAns = 1, PositiveAns = 1; | |||||
Keys = FileHashTable.keys(); | |||||
while( Keys.hasMoreElements() ) { | |||||
String Word = Keys.nextElement(); | |||||
NegativeAns *= ( Math.pow(this.PostProbability(Word, NegativeWeight), FileHashTable.get(Word)) ); | |||||
} | |||||
NegativeAns *= this.PriorProbability(NegativeDoc); | |||||
Keys = FileHashTable.keys(); | |||||
while( Keys.hasMoreElements() ) { | |||||
String Word = Keys.nextElement(); | |||||
PositiveAns *= ( Math.pow(this.PostProbability(Word, PositiveWeight), FileHashTable.get(Word)) ); | |||||
} | |||||
PositiveAns *= this.PriorProbability(PositiveDoc); | |||||
return ( NegativeAns-PositiveAns ); | |||||
} | |||||
public Hashtable<String, Integer> Read_TestFile( String FilePath ) throws Exception { | |||||
ArrayList<String> FileCurrentList = new ArrayList<String>(); | |||||
ReadXML( FilePath, FileCurrentList ); | |||||
Hashtable<String, Integer> FileHashTable = HashTable( FileCurrentList ); | |||||
return FileHashTable; | |||||
} | |||||
public void ReadXML( String FilePath, ArrayList<String> currentList ) throws Exception { // read the XML file at FilePath and collect its sentiment words into currentList
SAXReader SaxReader = new SAXReader(); | |||||
Document Doc = SaxReader.read(new File(FilePath)); | |||||
Element root = Doc.getRootElement(); | |||||
Element content = root.element("content"); | |||||
List<?> sentenses = content.elements("sentence"); // one element per sentence
for ( Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) { | |||||
Element sentense = (Element)iter.next(); | |||||
List<?> toks = sentense.elements(); | |||||
for ( Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) { | |||||
Element tok = (Element)iter1.next(); | |||||
String Type = tok.attributeValue("type"); | |||||
if ( Type.equals("group") ) { // a bare "atom" token is never a sentiment word here
GetWord( tok, currentList ); // extract candidate words from the "group" element
} | |||||
} | |||||
} | |||||
} | |||||
public void GetWord( Element root, ArrayList<String> currentList ) { // recursively collect sentiment words from an XML "group" element
String Word = ""; | |||||
List<?> elements = root.elements("tok"); | |||||
for ( Iterator<?> iter = elements.iterator(); iter.hasNext(); ) { | |||||
Element tok = (Element)iter.next(); | |||||
String Type = tok.attributeValue("type"); | |||||
if ( Type.compareTo("atom") == 0 ) { | |||||
Word += tok.getText().trim(); | |||||
} | |||||
else { | |||||
GetWord( tok, currentList ); | |||||
} | |||||
} | |||||
if ( Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word)) ) { // keep only words listed in the sentiment dictionaries
currentList.add(Word); | |||||
} | |||||
} | |||||
public Hashtable<String, Integer> HashTable( ArrayList<String> currentList ) { // build a word -> frequency table from the collected sentiment words
Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>(); | |||||
for ( Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) { | |||||
String Word = (String)iter.next(); | |||||
if ( HashTable.containsKey(Word) ) { | |||||
Integer Weight = HashTable.get(Word); | |||||
HashTable.put(Word, Weight+1); | |||||
} | |||||
else { | |||||
HashTable.put(Word, 1); | |||||
} | |||||
} | |||||
return HashTable; | |||||
} | |||||
public double PriorProbability( Integer SentimentDoc ) { // P(class) = documents in class / total documents
return (double)SentimentDoc / ( (double)NegativeDoc + (double)PositiveDoc + (double)UnsureDoc );
}
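// Smoothed estimate of P(word | class): (count(word, class) + E) / (totalCount(class) + E * V),
// where V is the total word weight across all three classes and E = 1/V, so the denominator
// reduces to totalCount(class) + 1. The smoothing keeps unseen words from zeroing out the
// product computed in Classify().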
public double PostProbability( String Word, Hashtable<String, Integer> SentimentWeight ) { | |||||
double Ans, V, E; | |||||
double Weight = 0, Weights = 0; | |||||
if ( SentimentWeight.containsKey(Word) ) | |||||
Weight = (double)SentimentWeight.get(Word); | |||||
Weights = PostWeights( SentimentWeight ); | |||||
V = PostWeights( NegativeWeight ) + PostWeights( PositiveWeight ) + PostWeights( UnsureWeight ); | |||||
E = 1/Math.abs(V); | |||||
Ans = ( Weight + E )/( Weights + E*Math.abs(V) ); | |||||
return Ans; | |||||
} | |||||
public double PostWeights( Hashtable<String, Integer> SentimentWeight ) { | |||||
double Weights = 0; | |||||
Enumeration<String> Keys; | |||||
Keys = SentimentWeight.keys(); | |||||
while( Keys.hasMoreElements() ) { | |||||
String Key = Keys.nextElement(); | |||||
Weights += (double)SentimentWeight.get(Key); | |||||
} | |||||
return Weights; | |||||
} | |||||
} |