Browse Source

initial the project nlp-sentiment

fetches/sdlf/master
gitclebeg 9 years ago
commit
941597f3dd
10 changed files with 780 additions and 0 deletions
  1. +7
    -0
      .gitignore
  2. +15
    -0
      README
  3. +5
    -0
      data/.gitignore
  4. BIN
      data/data.zip
  5. BIN
      libs/lingpipe-4.1.0.jar
  6. +58
    -0
      pom.xml
  7. +146
    -0
      src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java
  8. +94
    -0
      src/main/java/eshore/cn/it/sentiment/PolarityBasic.java
  9. +211
    -0
      src/main/java/eshore/cn/it/sentiment/Sentiment.java
  10. +244
    -0
      src/test/java/eshore/cn/it/sentiment/SentimentTest.java

+ 7
- 0
.gitignore View File

@@ -0,0 +1,7 @@
.project
.classpath
target/
Result/
Model/

.settings/

+ 15
- 0
README View File

@@ -0,0 +1,15 @@
基于自然语言处理的情感分析工具
本程序依赖data目录下面的data.zip,先解压缩 data 目录下面的 data.zip到当前目录。

1、基于词典和贝叶斯模型的情感分析
主程序:eshore.cn.it.sentiment.Sentiment 此类通过
data/Sentiment_Dictionary中的正负面词语建立模型。

测试: eshore.cn.it.sentiment.SentimentTest
通过这个类就可以测试 data/500trainblogxml中的某个文件夹下面的博客的情感。

2、直接利用lingpipe的情感分析模块测试情感分析
直接运行程序: eshore.cn.it.sentiment.ChinesePolarityBasic
程序就会通过: data/polarity_corpus/hotel_reviews/train2训练
然后自动测试: data/polarity_corpus/hotel_reviews/test2
最后给出程序测试结果。

+ 5
- 0
data/.gitignore View File

@@ -0,0 +1,5 @@
500trainblogxml/
nerws_corpus/
output/
polarity_corpus/
Sentiment_Dictionary/

BIN
data/data.zip View File


BIN
libs/lingpipe-4.1.0.jar View File


+ 58
- 0
pom.xml View File

@@ -0,0 +1,58 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>eshore.cn.it</groupId>
	<artifactId>nlp-sentiment</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>nlp-sentiment</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<commons.io.version>2.4</commons.io.version>
		<dom4j.version>1.6.1</dom4j.version>
		<lingpipe.version>4.1.0</lingpipe.version>
		<jieba.version>1.0.0</jieba.version>
		<!-- Jars that cannot be fetched from Maven Central are kept here.
		     FIX: use ${project.basedir} instead of a machine-specific absolute
		     path (F:/java_git_projects/...) so the build works on any checkout. -->
		<maven.libs.home>${project.basedir}/libs</maven.libs.home>
	</properties>

	<dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>${commons.io.version}</version>
		</dependency>
		<dependency>
			<groupId>dom4j</groupId>
			<artifactId>dom4j</artifactId>
			<version>${dom4j.version}</version>
		</dependency>

		<!-- LingPipe core jar must be downloaded manually from the LingPipe
		     site and placed in ${maven.libs.home} (system-scoped dependency). -->
		<dependency>
			<groupId>com.aliasi</groupId>
			<artifactId>lingpipe</artifactId>
			<version>${lingpipe.version}</version>
			<scope>system</scope>
			<systemPath>${maven.libs.home}/lingpipe-4.1.0.jar</systemPath>
		</dependency>
		<!-- jieba Chinese word segmenter -->
		<dependency>
			<groupId>com.huaban</groupId>
			<artifactId>jieba-analysis</artifactId>
			<version>${jieba.version}</version>
		</dependency>
	</dependencies>
</project>

+ 146
- 0
src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java View File

@@ -0,0 +1,146 @@
package eshore.cn.it.sentiment;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;



import org.apache.commons.io.IOUtils;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classified;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.Files;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

/**
* ChinesePolarityBasic 此类是利用lingpipe作中文情感预测的示例类
* lingpipe适合做增量分析
* @clebeg 2015-03-13
* @version 0.0.1
* */
/**
 * ChinesePolarityBasic: demo of Chinese sentiment-polarity classification with
 * LingPipe's character language-model classifier (which supports incremental
 * training), using jieba for word segmentation before training/classifying.
 *
 * Corpus layout (relative to working directory): train2/test2 directories of
 * GBK-encoded hotel reviews, indexed by *.rlabelclass files whose lines are
 * "fileName label" pairs; labels are "+1" (positive) or "-1" (negative).
 *
 * @author clebeg 2015-03-13
 * @version 0.0.1
 */
public class ChinesePolarityBasic {
    /** Classification categories: "+1" = positive, "-1" = negative. */
    private String[] mCategories = new String[]{"+1", "-1"};
    /** The classification model; trained incrementally via handle(). */
    private DynamicLMClassifier<NGramProcessLM> mClassifier;
    private int numTests = 0;
    private int numCorrect = 0;
    private static final String TRAINFILES_INFO =
        "data/polarity_corpus/hotel_reviews/train2.rlabelclass";
    private static final String TRAINFILES_DIR =
        "data/polarity_corpus/hotel_reviews/train2";
    private static final String TESTFILES_DIR =
        "data/polarity_corpus/hotel_reviews/test2";
    private static final String TESTFILES_INFO =
        "data/polarity_corpus/hotel_reviews/test2.rlabelclass";
    /** Review files are GBK-encoded. */
    private static final String ENCODING = "GBK";
    private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter();
    private final SegMode segMode = SegMode.INDEX;

    public static void main(String[] args) {
        try {
            new ChinesePolarityBasic().run();
        } catch (Throwable t) {
            System.out.println("Thrown: " + t);
            t.printStackTrace(System.out);
        }
    }

    public ChinesePolarityBasic() {
        super();
        // 8-gram character language model.
        int nGram = 8;
        mClassifier = DynamicLMClassifier.createNGramProcess(mCategories, nGram);
    }

    private void run() throws ClassNotFoundException, IOException {
        train();
        evaluate();
    }

    /**
     * Trains the classifier from every entry in the train2 index file.
     * FIX: the FileReader is now closed via try-with-resources (it leaked
     * before), and blank/malformed index lines are skipped instead of
     * throwing ArrayIndexOutOfBoundsException.
     */
    private void train() throws IOException {
        try (FileReader input = new FileReader(new File(TRAINFILES_INFO))) {
            List<String> trainInfos = IOUtils.readLines(input);
            for (String str : trainInfos) {
                String[] train = str.split(" ");
                if (train.length < 2) {
                    continue; // skip blank or malformed index lines
                }
                train(train[1], new File(TRAINFILES_DIR, train[0]), ENCODING);
            }
        }
    }

    /**
     * Classifies every entry in the test2 index file and prints accuracy.
     * FIX: reader closed via try-with-resources; accuracy division guarded
     * so an empty test set prints 0.0 instead of NaN.
     */
    private void evaluate() throws IOException {
        try (FileReader input = new FileReader(new File(TESTFILES_INFO))) {
            List<String> testInfos = IOUtils.readLines(input);
            for (String str : testInfos) {
                String[] test = str.split(" ");
                if (test.length < 2) {
                    continue; // skip blank or malformed index lines
                }
                evaluate(test[1], new File(TESTFILES_DIR, test[0]), ENCODING);
            }
        }
        System.out.println("  # Test Cases=" + numTests);
        System.out.println("  # Correct=" + numCorrect);
        System.out.println("  % Correct="
            + (numTests == 0 ? 0.0 : ((double) numCorrect) / (double) numTests));
    }

    /**
     * Trains on one review file under the given category: reads the file,
     * segments it with jieba, and feeds the space-joined tokens to the
     * classifier.
     * @throws IOException if the review file cannot be read
     */
    private void train(String category, File trainFile, String fileEncoding)
            throws IOException {
        Classification classification = new Classification(category);
        String review = Files.readFromFile(trainFile, fileEncoding);
        Classified<CharSequence> classified =
            new Classified<CharSequence>(segment(review), classification);
        mClassifier.handle(classified);
    }

    /**
     * Classifies one review file and updates the test/correct counters.
     * The review is segmented the same way as the training data.
     * @throws IOException if the review file cannot be read
     */
    private void evaluate(String category, File testFile, String fileEncoding)
            throws IOException {
        String review = Files.readFromFile(testFile, fileEncoding);
        ++numTests;
        Classification classification = mClassifier.classify(segment(review));
        String resultCategory = classification.bestCategory();
        if (resultCategory.equals(category)) {
            ++numCorrect;
        }
    }

    /**
     * Segments text with jieba and rejoins the tokens separated by spaces.
     * FIX: uses StringBuilder instead of O(n^2) String concatenation; shared
     * by train() and evaluate(), which previously duplicated this code.
     */
    private String segment(String text) {
        List<SegToken> segTokens = jiebaSegmenter.process(text, segMode);
        StringBuilder joined = new StringBuilder(text.length() * 2);
        for (SegToken seg : segTokens) {
            joined.append(seg.word.getToken()).append(' ');
        }
        return joined.toString();
    }

}

+ 94
- 0
src/main/java/eshore/cn/it/sentiment/PolarityBasic.java View File

@@ -0,0 +1,94 @@
package eshore.cn.it.sentiment;

import java.io.File;
import java.io.IOException;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classified;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.Files;

/**
 * PolarityBasic: LingPipe's classic polarity demo over the txt_sentoken
 * corpus. Each sub-directory of the corpus directory is a category; files
 * whose third character is '9' form the held-out test fold, everything else
 * is training data. An 8-gram character language-model classifier is trained
 * and then evaluated, printing accuracy to stdout.
 */
public class PolarityBasic {
    File mPolarityDir;
    String[] mCategories;
    DynamicLMClassifier<NGramProcessLM> mClassifier;

    public PolarityBasic(String[] args) {
        mPolarityDir = new File("data/polarity_corpus", "txt_sentoken");
        // Each sub-directory name (e.g. "pos"/"neg") serves as a category label.
        mCategories = mPolarityDir.list();
        // FIX: fail fast with a clear message instead of a later NPE when the
        // corpus has not been unpacked.
        if (mCategories == null) {
            throw new IllegalStateException(
                "Corpus directory not found: " + mPolarityDir);
        }
        int nGram = 8;
        mClassifier = DynamicLMClassifier.createNGramProcess(mCategories, nGram);
    }

    public static void main(String[] args) {
        try {
            new PolarityBasic(args).run();
        } catch (Throwable t) {
            System.out.println("Thrown: " + t);
            t.printStackTrace(System.out);
        }
    }

    private void run() throws ClassNotFoundException, IOException {
        train();
        evaluate();
    }

    /** Trains the classifier on every non-fold-9 file in each category directory. */
    private void train() throws IOException {
        for (String category : mCategories) {
            Classification classification = new Classification(category);
            File dir = new File(mPolarityDir, category);
            File[] trainFiles = dir.listFiles();
            if (trainFiles == null) {
                continue; // FIX: category entry is not a directory — skip, was NPE
            }
            for (File trainFile : trainFiles) {
                if (isTrainingFile(trainFile)) {
                    // Corpus files are ISO-8859-1 encoded.
                    String review = Files.readFromFile(trainFile, "ISO-8859-1");
                    Classified<CharSequence> classified =
                        new Classified<CharSequence>(review, classification);
                    mClassifier.handle(classified);
                }
            }
        }
    }

    /** File names encode the cross-validation fold in their third character; fold 9 is the test set. */
    boolean isTrainingFile(File file) {
        return file.getName().charAt(2) != '9'; // test on fold 9
    }

    /** Classifies every fold-9 file and prints test count, correct count, and accuracy. */
    void evaluate() throws IOException {
        int numTests = 0;
        int numCorrect = 0;
        for (String category : mCategories) {
            File dir = new File(mPolarityDir, category);
            File[] testFiles = dir.listFiles();
            if (testFiles == null) {
                continue; // FIX: not a directory — skip, was NPE
            }
            for (File testFile : testFiles) {
                if (!isTrainingFile(testFile)) {
                    String review = Files.readFromFile(testFile, "ISO-8859-1");
                    ++numTests;
                    Classification classification = mClassifier.classify(review);
                    String resultCategory = classification.bestCategory();
                    if (resultCategory.equals(category)) {
                        ++numCorrect;
                    }
                }
            }
        }
        System.out.println("  # Test Cases=" + numTests);
        System.out.println("  # Correct=" + numCorrect);
        // FIX: guard against division by zero (empty test set) — was NaN.
        System.out.println("  % Correct="
            + (numTests == 0 ? 0.0 : ((double) numCorrect) / (double) numTests));
    }
}

+ 211
- 0
src/main/java/eshore/cn/it/sentiment/Sentiment.java View File

@@ -0,0 +1,211 @@
package eshore.cn.it.sentiment;

import java.io.*;
import java.util.*;

import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

/**
 * Sentiment builds a naive-Bayes-style sentiment model: it scans segmented
 * blog XML files grouped into negative/positive/unsure directories, extracts
 * words that appear in the positive/negative sentiment lexicons, and records
 * per-class document counts and word-frequency tables. The model is then
 * serialized to the Model/ directory (read back by SentimentTest).
 */
public class Sentiment {

    static private HashSet<String> Negative, Positive; // the two sentiment lexicons
    static private Integer NegativeDoc, PositiveDoc, UnsureDoc; // per-class document counts (persisted with the model)
    static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; // per-class word -> frequency (persisted with the model)
    static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/";
    static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt";
    static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt";
    static final String FILE_ENCODING = "UTF-8";

    public static void main(String[] args) throws Exception {
        Sentiment sentiment = new Sentiment();
        sentiment.Model();
        sentiment.Save_Model();
    }

    /** Builds the full model: load lexicons, then scan the training corpus. */
    public void Model() throws Exception {
        this.Read_Sentiment_Dictionary();
        this.Sentiment_Doc_Weight(SENTIMENT_DOC_WEIGHT_PATH);
    }

    /**
     * Loads the positive and negative sentiment lexicons (one word per line,
     * UTF-8). FIX: readers are now closed via try-with-resources — the old
     * code leaked both (the @SuppressWarnings("resource") was hiding it).
     */
    public void Read_Sentiment_Dictionary() throws Exception {
        Negative = readWordSet(NEGATIVE_DIC_PATH);
        Positive = readWordSet(POSITIVE_DIC_PATH);
    }

    /** Reads one word per line from a UTF-8 file into a set (duplicates collapse). */
    private static HashSet<String> readWordSet(String path) throws IOException {
        HashSet<String> words = new HashSet<String>();
        try (BufferedReader buf = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), FILE_ENCODING))) {
            String str;
            while ((str = buf.readLine()) != null) {
                words.add(str);
            }
        }
        return words;
    }

    /**
     * Scans the negativeout/positiveout/unsureout sub-directories of DirPath,
     * extracting lexicon-matched words from every XML file, and builds the
     * per-class document counts and frequency tables.
     */
    public void Sentiment_Doc_Weight(String DirPath) throws Exception {
        ArrayList<String> negativeWords = new ArrayList<String>();
        NegativeDoc = collectWords(DirPath, "negativeout", "NegativeFiles", negativeWords);
        NegativeWeight = HashTable(negativeWords);

        ArrayList<String> positiveWords = new ArrayList<String>();
        PositiveDoc = collectWords(DirPath, "positiveout", "PositiveFiles", positiveWords);
        PositiveWeight = HashTable(positiveWords);

        ArrayList<String> unsureWords = new ArrayList<String>();
        UnsureDoc = collectWords(DirPath, "unsureout", "UnsureFiles", unsureWords);
        UnsureWeight = HashTable(unsureWords);

        System.out.println("UnsureCurrent = " + unsureWords.size() + " UnsureHashTable = " + UnsureWeight.size());
        System.out.println("PositiveCurrent = " + positiveWords.size() + " PositiveHashTable = " + PositiveWeight.size());
        System.out.println("NegativeCurrent = " + negativeWords.size() + " NegativeHashTable = " + NegativeWeight.size());
        System.out.println("NegativeDoc = " + NegativeDoc + " PositiveDoc = " + PositiveDoc + " UnsureDoc = " + UnsureDoc);
    }

    /**
     * Reads every XML file under dirPath/subDir, appending its sentiment words
     * to {@code out}; returns the number of files (= the class document count).
     * FIX: fails with a clear message when the directory is missing (was NPE);
     * factored out of three near-identical copies in the old code.
     */
    private int collectWords(String dirPath, String subDir, String label,
            ArrayList<String> out) throws Exception {
        File dir = new File(dirPath + subDir);
        String[] files = dir.list();
        if (files == null) {
            throw new FileNotFoundException("Directory not found: " + dir);
        }
        for (int i = 0; i < files.length; i++) {
            System.out.println(label + " No." + (i + 1) + " " + dirPath + subDir + "/" + files[i]);
            this.ReadXML(dirPath + subDir + "/" + files[i], out);
        }
        return files.length;
    }

    /**
     * Parses one segmented-blog XML file and collects its sentiment words.
     * Only "group" tokens can contain lexicon words; "atom" tokens at this
     * level never do, so they are skipped.
     */
    public void ReadXML(String FilePath, ArrayList<String> currentList) throws Exception {
        SAXReader saxReader = new SAXReader();
        Document doc = saxReader.read(new File(FilePath));
        Element root = doc.getRootElement();
        Element content = root.element("content");
        List<?> sentences = content.elements("sentence"); // one entry per sentence
        for (Iterator<?> iter = sentences.iterator(); iter.hasNext(); ) {
            Element sentence = (Element) iter.next();
            for (Iterator<?> tokIter = sentence.elements().iterator(); tokIter.hasNext(); ) {
                Element tok = (Element) tokIter.next();
                if ("group".equals(tok.attributeValue("type"))) {
                    GetWord(tok, currentList); // extract words from the "group"
                }
            }
        }
    }

    /**
     * Recursively concatenates the "atom" children of a "group" token into a
     * candidate word; nested groups recurse. The word is kept only if it is
     * longer than one character and present in either lexicon.
     */
    public void GetWord(Element root, ArrayList<String> currentList) {
        String word = "";
        List<?> elements = root.elements("tok");
        for (Iterator<?> iter = elements.iterator(); iter.hasNext(); ) {
            Element tok = (Element) iter.next();
            String type = tok.attributeValue("type");
            if (type.compareTo("atom") == 0) {
                word += tok.getText().trim();
            } else {
                GetWord(tok, currentList);
            }
        }
        if (word.length() > 1 && (Positive.contains(word) || Negative.contains(word))) {
            currentList.add(word);
        }
    }

    /** Builds a word -> occurrence-count table from the collected sentiment words. */
    public Hashtable<String, Integer> HashTable(ArrayList<String> currentList) {
        Hashtable<String, Integer> table = new Hashtable<String, Integer>();
        for (String word : currentList) {
            Integer weight = table.get(word);
            table.put(word, weight == null ? 1 : weight + 1);
        }
        return table;
    }

    /**
     * Serializes the three models (document count + weight table each) to
     * Model/ and dumps each weight table as tab-separated text.
     * FIX: every stream is now closed/flushed — the old code never closed its
     * ObjectOutputStreams, risking truncated model files — and each weight
     * dump is written in a single call instead of appending once per key,
     * which both reopened the file per entry and duplicated contents on rerun.
     */
    public void Save_Model() throws Exception {
        File modelPath = new File("Model");
        if (!modelPath.exists()) {
            modelPath.mkdir();
        }
        System.out.println("Saving NegativeModel...");
        writeModel(new File(modelPath, "NegativeModel.txt"), NegativeDoc, NegativeWeight);
        System.out.println("Saving PositiveModel...");
        writeModel(new File(modelPath, "PositiveModel.txt"), PositiveDoc, PositiveWeight);
        System.out.println("Saving UnsureModel...");
        writeModel(new File(modelPath, "UnsureModel.txt"), UnsureDoc, UnsureWeight);
        System.out.println("Saving NegativeWeight...");
        writeWeights(new File("Model", "NegativeWeight.txt"), NegativeWeight);
        System.out.println("Saving PositiveWeight...");
        writeWeights(new File("Model", "PositiveWeight.txt"), PositiveWeight);
        System.out.println("Saving UnsureWeight...");
        writeWeights(new File("Model", "UnsureWeight.txt"), UnsureWeight);
        System.out.println("Save Success!");
    }

    /** Serializes one model (doc count then weight table) with proper close/flush. */
    private static void writeModel(File file, Integer docCount,
            Hashtable<String, Integer> weights) throws IOException {
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file))) {
            oos.writeObject(docCount);
            oos.writeObject(weights);
        }
    }

    /** Dumps a weight table as "word\t\t\tcount\r\n" lines in one UTF-8 write. */
    private static void writeWeights(File file,
            Hashtable<String, Integer> weights) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (Enumeration<String> keys = weights.keys(); keys.hasMoreElements(); ) {
            String key = keys.nextElement();
            sb.append(key).append("\t\t\t").append(weights.get(key)).append("\r\n");
        }
        FileUtils.writeStringToFile(file, sb.toString(), "UTF-8", false);
    }
}

+ 244
- 0
src/test/java/eshore/cn/it/sentiment/SentimentTest.java View File

@@ -0,0 +1,244 @@
package eshore.cn.it.sentiment;


import java.io.*;
import java.util.*;

import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;


/**
 * SentimentTest loads the serialized model produced by Sentiment and
 * classifies every blog XML file in a directory as Positive, Negative, or
 * Unsure via naive Bayes over lexicon-matched word frequencies. Each file is
 * copied to Result/{Positive,Negative,Unsure} and per-file verdicts plus
 * totals are printed.
 */
public class SentimentTest {

    static private HashSet<String> Negative, Positive; // the two sentiment lexicons
    static private Integer NegativeDoc, PositiveDoc, UnsureDoc; // per-class document counts (loaded from the model)
    static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; // per-class word -> frequency (loaded from the model)
    static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/negativeout/";
    static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt";
    static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt";
    static final String FILE_ENCODING = "UTF-8";

    public static void main(String[] args) throws Exception {
        SentimentTest sentimentTest = new SentimentTest();
        sentimentTest.Read_Model(); // load the serialized model
        sentimentTest.Classify_Directory(SENTIMENT_DOC_WEIGHT_PATH);
    }

    /**
     * Loads the lexicons and deserializes the three models from Model/.
     * FIX: each ObjectInputStream is now closed via try-with-resources —
     * the old code leaked all three.
     */
    @SuppressWarnings("unchecked")
    public void Read_Model() throws Exception {
        this.Read_Sentiment_Dictionary();
        File modelPath = new File("Model");
        System.out.println("Reading NegativeModel...");
        try (ObjectInputStream ois = new ObjectInputStream(
                new FileInputStream(new File(modelPath, "NegativeModel.txt")))) {
            NegativeDoc = (Integer) ois.readObject();
            NegativeWeight = (Hashtable<String, Integer>) ois.readObject();
        }
        System.out.println("Reading PositiveModel...");
        try (ObjectInputStream ois = new ObjectInputStream(
                new FileInputStream(new File(modelPath, "PositiveModel.txt")))) {
            PositiveDoc = (Integer) ois.readObject();
            PositiveWeight = (Hashtable<String, Integer>) ois.readObject();
        }
        System.out.println("Reading UnsureModel...");
        try (ObjectInputStream ois = new ObjectInputStream(
                new FileInputStream(new File(modelPath, "UnsureModel.txt")))) {
            UnsureDoc = (Integer) ois.readObject();
            UnsureWeight = (Hashtable<String, Integer>) ois.readObject();
        }
        System.out.println("Read Success.");
    }

    /**
     * Loads the positive and negative sentiment lexicons (one word per line,
     * UTF-8). FIX: readers closed via try-with-resources (previously leaked).
     */
    public void Read_Sentiment_Dictionary() throws Exception {
        Negative = readWordSet(NEGATIVE_DIC_PATH);
        Positive = readWordSet(POSITIVE_DIC_PATH);
    }

    /** Reads one word per line from a UTF-8 file into a set. */
    private static HashSet<String> readWordSet(String path) throws IOException {
        HashSet<String> words = new HashSet<String>();
        try (BufferedReader buf = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), FILE_ENCODING))) {
            String str;
            while ((str = buf.readLine()) != null) {
                words.add(str);
            }
        }
        return words;
    }

    /**
     * Classifies every file in DirectoryPath, copies it into the matching
     * Result sub-directory, prints a per-file verdict and final totals.
     * FIX: Classify() was invoked twice per file (the first result was
     * discarded — doubling the work); it is now called exactly once. Also
     * fails with a clear message when the directory is missing (was NPE).
     */
    public void Classify_Directory(String DirectoryPath) throws Exception {
        int positiveNum = 0, negativeNum = 0, unsureNum = 0;
        String[] textPaths = new File(DirectoryPath).list();
        if (textPaths == null) {
            throw new FileNotFoundException("Directory not found: " + DirectoryPath);
        }
        for (int i = 0; i < textPaths.length; i++) {
            double ans = Classify(DirectoryPath + textPaths[i]); // score this file
            String verdict;
            if (ans < 0) { // negative score minus positive score: <0 means positive wins
                verdict = "Positive";
                positiveNum++;
            } else if (ans > 0) {
                verdict = "Negative";
                negativeNum++;
            } else {
                verdict = "Unsure";
                unsureNum++;
            }
            FileUtils.copyFile(new File(DirectoryPath + textPaths[i]),
                new File(new File("Result", verdict), textPaths[i]));
            System.out.print("No." + (i + 1) + " " + textPaths[i] + ": ");
            System.out.println(verdict);
        }
        System.out.println("End.");
        System.out.println("NegativeNum = " + negativeNum + " PositiveNum = " + positiveNum + " UnsureNum = " + unsureNum);
    }

    /**
     * Naive-Bayes score of one file: product of smoothed per-word posteriors
     * (raised to the word's in-file count) times the class prior, for the
     * negative and positive classes. Returns negative-score minus
     * positive-score, so &lt;0 means positive, &gt;0 negative, 0 unsure.
     * The two key loops of the old code are merged into one pass.
     */
    public double Classify(String FilePath) throws Exception {
        Hashtable<String, Integer> fileTable = Read_TestFile(FilePath);
        double negativeAns = 1, positiveAns = 1;
        for (Enumeration<String> keys = fileTable.keys(); keys.hasMoreElements(); ) {
            String word = keys.nextElement();
            int count = fileTable.get(word);
            negativeAns *= Math.pow(this.PostProbability(word, NegativeWeight), count);
            positiveAns *= Math.pow(this.PostProbability(word, PositiveWeight), count);
        }
        negativeAns *= this.PriorProbability(NegativeDoc);
        positiveAns *= this.PriorProbability(PositiveDoc);
        return negativeAns - positiveAns;
    }

    /** Extracts the sentiment words of one XML file as a word -> count table. */
    public Hashtable<String, Integer> Read_TestFile(String FilePath) throws Exception {
        ArrayList<String> fileWords = new ArrayList<String>();
        ReadXML(FilePath, fileWords);
        return HashTable(fileWords);
    }

    /**
     * Parses one segmented-blog XML file and collects its sentiment words.
     * Only "group" tokens can contain lexicon words; "atom" tokens at this
     * level never do, so they are skipped.
     */
    public void ReadXML(String FilePath, ArrayList<String> currentList) throws Exception {
        SAXReader saxReader = new SAXReader();
        Document doc = saxReader.read(new File(FilePath));
        Element root = doc.getRootElement();
        Element content = root.element("content");
        List<?> sentences = content.elements("sentence"); // one entry per sentence
        for (Iterator<?> iter = sentences.iterator(); iter.hasNext(); ) {
            Element sentence = (Element) iter.next();
            for (Iterator<?> tokIter = sentence.elements().iterator(); tokIter.hasNext(); ) {
                Element tok = (Element) tokIter.next();
                if ("group".equals(tok.attributeValue("type"))) {
                    GetWord(tok, currentList); // extract words from the "group"
                }
            }
        }
    }

    /**
     * Recursively concatenates the "atom" children of a "group" token into a
     * candidate word; nested groups recurse. The word is kept only if it is
     * longer than one character and present in either lexicon.
     */
    public void GetWord(Element root, ArrayList<String> currentList) {
        String word = "";
        List<?> elements = root.elements("tok");
        for (Iterator<?> iter = elements.iterator(); iter.hasNext(); ) {
            Element tok = (Element) iter.next();
            String type = tok.attributeValue("type");
            if (type.compareTo("atom") == 0) {
                word += tok.getText().trim();
            } else {
                GetWord(tok, currentList);
            }
        }
        if (word.length() > 1 && (Positive.contains(word) || Negative.contains(word))) {
            currentList.add(word);
        }
    }

    /** Builds a word -> occurrence-count table from the collected sentiment words. */
    public Hashtable<String, Integer> HashTable(ArrayList<String> currentList) {
        Hashtable<String, Integer> table = new Hashtable<String, Integer>();
        for (String word : currentList) {
            Integer weight = table.get(word);
            table.put(word, weight == null ? 1 : weight + 1);
        }
        return table;
    }

    /** Class prior: this class's document count over the total document count. */
    public double PriorProbability(Integer SentimentDoc) {
        return (double) SentimentDoc
            / ((double) NegativeDoc + (double) PositiveDoc + (double) UnsureDoc);
    }

    /**
     * Smoothed posterior of a word given a class: (count + E) / (classTotal +
     * E*V) where V is the total weight across all three classes and E = 1/V,
     * so the denominator adds exactly 1 (a Laplace-style correction).
     */
    public double PostProbability(String Word, Hashtable<String, Integer> SentimentWeight) {
        double weight = 0;
        if (SentimentWeight.containsKey(Word)) {
            weight = (double) SentimentWeight.get(Word);
        }
        double weights = PostWeights(SentimentWeight);
        double v = PostWeights(NegativeWeight) + PostWeights(PositiveWeight) + PostWeights(UnsureWeight);
        double e = 1 / Math.abs(v);
        return (weight + e) / (weights + e * Math.abs(v));
    }

    /** Sum of all frequencies in one class's weight table. */
    public double PostWeights(Hashtable<String, Integer> SentimentWeight) {
        double weights = 0;
        for (Enumeration<String> keys = SentimentWeight.keys(); keys.hasMoreElements(); ) {
            weights += (double) SentimentWeight.get(keys.nextElement());
        }
        return weights;
    }
}

Loading…
Cancel
Save