diff --git a/README b/README
index 6340f9a..7d89080 100644
--- a/README
+++ b/README
@@ -1,6 +1,14 @@
基于自然语言处理的情感分析工具
本程序依赖data目录下面的data.zip,先解压缩 data 目录下面的 data.zip到当前目录。
+
+
+*新增说明1:2015-04-10测试了不同中文分词器分词之后 LingPipe 情感分类的准确率,同时测试了去除停用词之后的情感分类的准确率。
+1) 发现用HanLP的NLPTokenizer分词器,准确率最高,但是速度有点慢。
+2) 如果用HanLP的标准分词器就会准确率低一点点,但是速度快。
+3) 分词之后去除停用词,效果反而更差。
+4) 结巴分词效果不好,而且速度慢。
+
1、基于词典和贝叶斯模型的情感分析
主程序:eshore.cn.it.sentiment.Sentiment 此类通过
data/Sentiment_Dictionary中的正负面词语建立模型。
diff --git a/data/.gitignore b/data/.gitignore
index 31367df..e375520 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -2,4 +2,6 @@
nerws_corpus/
output/
polarity_corpus/
-Sentiment_Dictionary/
\ No newline at end of file
+Sentiment_Dictionary/
+dictionary/
+model/
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 66850e2..aa67952 100644
--- a/pom.xml
+++ b/pom.xml
@@ -16,6 +16,7 @@
1.6.1
4.1.0
1.0.0
+ 1.1.0
F:/java_git_projects/nlp-sentiment/libs
@@ -54,5 +55,12 @@
jieba-analysis
${jieba.version}
+
+
+
+ com.hankcs
+ hanlp
+ ${hanlp.version}
+
diff --git a/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java b/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java
index d1100a9..e3429cc 100644
--- a/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java
+++ b/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java
@@ -7,6 +7,12 @@ import java.util.List;
+
+
+
+
+
+
import org.apache.commons.io.IOUtils;
import com.aliasi.classify.Classification;
@@ -14,9 +20,13 @@ import com.aliasi.classify.Classified;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.Files;
-import com.huaban.analysis.jieba.JiebaSegmenter;
-import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
-import com.huaban.analysis.jieba.SegToken;
+
+import com.hankcs.hanlp.HanLP;
+import com.hankcs.hanlp.seg.common.Term;
+
+//下面的分词器准确率最高,去除停用词反而准确率不高了。
+//import com.hankcs.hanlp.tokenizer.NLPTokenizer;
+
/**
* ChinesePolarityBasic 此类是利用lingpipe作中文情感预测的示例类
@@ -40,10 +50,7 @@ public class ChinesePolarityBasic {
private static final String TESTFILES_INFO =
"data/polarity_corpus/hotel_reviews/test2.rlabelclass";
private static final String ENCODING = "GBK";
-
- private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter();
- private final SegMode segMode = SegMode.INDEX;
-
+
public static void main(String[] args) {
try {
new ChinesePolarityBasic().run();
@@ -65,7 +72,11 @@ public class ChinesePolarityBasic {
private void run() throws ClassNotFoundException,
IOException {
+
+ //训练数据
train();
+
+ //测试训练结果
evaluate();
}
@@ -105,14 +116,14 @@ public class ChinesePolarityBasic {
String review = Files.readFromFile(trainFile, fileEncoding);
//此处加入中文分词器,得到分词之后的字符串
- List segTokens = jiebaSegmenter.process(review, segMode);
- review = "";
- for (SegToken seg : segTokens) {
- review += seg.word.getToken() + " ";
- }
+ String segWords = "";
+ List terms = HanLP.segment(review);
+ for (Term term : terms)
+ segWords += term.word + " ";
+
Classified classified
- = new Classified(review,classification);
+ = new Classified(segWords,classification);
mClassifier.handle(classified);
}
@@ -126,15 +137,13 @@ public class ChinesePolarityBasic {
String review
= Files.readFromFile(testFile, fileEncoding);
//同理,这里可以加入分词器,这样可以试试效果如何。
- List segTokens = jiebaSegmenter.process(review, segMode);
- review = "";
- for (SegToken seg : segTokens) {
- review += seg.word.getToken() + " ";
- }
-
+ String segWords = "";
+ List terms = HanLP.segment(review);
+ for (Term term : terms)
+ segWords += term.word + " ";
++numTests;
Classification classification
- = mClassifier.classify(review);
+ = mClassifier.classify(segWords);
//得到训练结果
String resultCategory
= classification.bestCategory();