
add some tests to the Chinese sentiment project

fetches/sdlf/master
gitclebeg 9 years ago
parent
commit
0c11ad6d1f
4 changed files with 48 additions and 21 deletions
  1. README                                                           +8   -0
  2. data/.gitignore                                                   +3   -1
  3. pom.xml                                                           +8   -0
  4. src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java   +29  -20

README   +8  -0

@@ -1,6 +1,14 @@
A sentiment-analysis tool based on natural language processing.
This program depends on data.zip in the data directory; first extract data/data.zip into the current directory.


*Added note 1: On 2015-04-10, tested LingPipe's sentiment-classification accuracy with and without Chinese word segmentation, and also the accuracy after removing stop words.
1) HanLP's NLPTokenizer gives the highest accuracy, but it is a bit slow (see the sketch after this file's diff).
2) HanLP's standard segmenter is slightly less accurate, but faster.
3) Removing stop words after segmentation makes the results even worse.
4) Jieba segmentation performs poorly and is also slow.

1. Sentiment analysis based on a dictionary and a Bayesian model
Main program: eshore.cn.it.sentiment.Sentiment. This class builds its model from
the positive and negative words in data/Sentiment_Dictionary.
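The README notes above contrast HanLP's NLPTokenizer with its standard segmenter and report that dropping stop words hurt accuracy. Below is a minimal comparison sketch (an editor's illustration, not part of this commit): the class name, the sample sentence, and the tiny stop-word set are all made up, and it assumes only the static HanLP.segment / NLPTokenizer.segment calls from the HanLP 1.x artifact added in pom.xml.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NLPTokenizer;

public class TokenizerComparison {

    // tiny illustrative stop-word list; a real test would load a full dictionary
    private static final Set<String> STOP_WORDS =
            new HashSet<String>(Arrays.asList("的", "了", "很"));

    public static void main(String[] args) {
        String review = "房间很干净,服务也很好。";

        // standard segmenter: faster, slightly less accurate per the README note
        System.out.println("standard : " + join(HanLP.segment(review), false));
        // NLP segmenter: highest accuracy in the author's tests, but slower
        System.out.println("nlp      : " + join(NLPTokenizer.segment(review), false));
        // the README reports that dropping stop words made classification worse
        System.out.println("no stops : " + join(HanLP.segment(review), true));
    }

    // joins segmented terms with spaces, optionally dropping stop words
    private static String join(List<Term> terms, boolean dropStopWords) {
        StringBuilder sb = new StringBuilder();
        for (Term term : terms) {
            if (dropStopWords && STOP_WORDS.contains(term.word))
                continue;
            sb.append(term.word).append(' ');
        }
        return sb.toString().trim();
    }
}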


data/.gitignore   +3  -1

@@ -2,4 +2,6 @@
nerws_corpus/
output/
polarity_corpus/
Sentiment_Dictionary/
Sentiment_Dictionary/
dictionary/
model/

pom.xml   +8  -0

@@ -16,6 +16,7 @@
<dom4j.version>1.6.1</dom4j.version>
<lingpipe.version>4.1.0</lingpipe.version>
<jieba.version>1.0.0</jieba.version>
<hanlp.version>1.1.0</hanlp.version>
<!-- JARs that cannot be downloaded from the central repository are kept in this location -->
<maven.libs.home>F:/java_git_projects/nlp-sentiment/libs</maven.libs.home>
</properties>
@@ -54,5 +55,12 @@
<artifactId>jieba-analysis</artifactId>
<version>${jieba.version}</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>${hanlp.version}</version>
</dependency>
</dependencies>
</project>

src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java   +29  -20

@@ -7,6 +7,12 @@ import java.util.List;









import org.apache.commons.io.IOUtils;

import com.aliasi.classify.Classification;
@@ -14,9 +20,13 @@ import com.aliasi.classify.Classified;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.Files;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

//The tokenizer below gives the highest accuracy; removing stop words actually lowered it.
//import com.hankcs.hanlp.tokenizer.NLPTokenizer;


/**
* ChinesePolarityBasic is an example class that uses LingPipe for Chinese sentiment prediction
@@ -40,10 +50,7 @@ public class ChinesePolarityBasic {
private static final String TESTFILES_INFO =
"data/polarity_corpus/hotel_reviews/test2.rlabelclass";
private static final String ENCODING = "GBK";
private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter();
private final SegMode segMode = SegMode.INDEX;

public static void main(String[] args) {
try {
new ChinesePolarityBasic().run();
@@ -65,7 +72,11 @@ public class ChinesePolarityBasic {

private void run() throws ClassNotFoundException,
IOException {
//train on the data
train();
//evaluate the training results
evaluate();
}

@@ -105,14 +116,14 @@ public class ChinesePolarityBasic {
String review = Files.readFromFile(trainFile, fileEncoding);
//apply the Chinese word segmenter here to obtain the segmented string
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode);
review = "";
for (SegToken seg : segTokens) {
review += seg.word.getToken() + " ";
}
String segWords = "";
List<Term> terms = HanLP.segment(review);
for (Term term : terms)
segWords += term.word + " ";
Classified<CharSequence> classified
= new Classified<CharSequence>(review,classification);
= new Classified<CharSequence>(segWords,classification);
mClassifier.handle(classified);
}
@@ -126,15 +137,13 @@ public class ChinesePolarityBasic {
String review
= Files.readFromFile(testFile, fileEncoding);
//likewise, the segmenter can be applied here to see how it affects the results
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode);
review = "";
for (SegToken seg : segTokens) {
review += seg.word.getToken() + " ";
}
String segWords = "";
List<Term> terms = HanLP.segment(review);
for (Term term : terms)
segWords += term.word + " ";
++numTests;
Classification classification
= mClassifier.classify(review);
= mClassifier.classify(segWords);
//get the classification result
String resultCategory
= classification.bestCategory();
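For completeness, a hedged sketch of how this evaluation loop could tally bestCategory() against the gold label from test2.rlabelclass to report accuracy. The names expectedCategory and numCorrect, and the final printf, are assumptions for illustration, not code from this commit; numTests is the counter already shown in the diff.

// hypothetical continuation of the evaluation loop shown above;
// expectedCategory (gold label from test2.rlabelclass) and numCorrect are assumed names
if (resultCategory.equals(expectedCategory)) {
    ++numCorrect;
}
// ... and after the loop, report overall accuracy over the numTests reviews:
System.out.printf("accuracy = %.2f%% (%d / %d)%n",
        100.0 * numCorrect / numTests, numCorrect, numTests);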

