diff --git a/README b/README index 6340f9a..7d89080 100644 --- a/README +++ b/README @@ -1,6 +1,14 @@ 基于自然语言处理的情感分析工具 本程序依赖data目录下面的data.zip,先解压缩 data 目录下面的 data.zip到当前目录。 + + +*新增说明1:2015-04-10测试了不同中文分词器,分词之后 LingPipe 情感分类的准确率,同时测试了去除停用词之后的情感分类的准确率。 +1) 发现用HanLP的NLPTokenizer分词器,准确率最高,但是速度有点慢。 +2) 如果用HanLP的标准分词器就会准确率低一点点,但是速度快。 +3) 分词之后去除停用词效果更加差。 +4) 结巴分词效果不好,而且速度慢。 + 1、基于词典和贝叶斯模型的情感分析 主程序:eshore.cn.it.sentiment.Sentiment 此类通过 data/Sentiment_Dictionary中的正负面词语建立模型。 diff --git a/data/.gitignore b/data/.gitignore index 31367df..e375520 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -2,4 +2,6 @@ nerws_corpus/ output/ polarity_corpus/ -Sentiment_Dictionary/ \ No newline at end of file +Sentiment_Dictionary/ +dictionary/ +model/ \ No newline at end of file diff --git a/pom.xml b/pom.xml index 66850e2..aa67952 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ 1.6.1 4.1.0 1.0.0 + 1.1.0 F:/java_git_projects/nlp-sentiment/libs @@ -54,5 +55,12 @@ jieba-analysis ${jieba.version} + + + + com.hankcs + hanlp + ${hanlp.version} + diff --git a/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java b/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java index d1100a9..e3429cc 100644 --- a/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java +++ b/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java @@ -7,6 +7,12 @@ import java.util.List; + + + + + + import org.apache.commons.io.IOUtils; import com.aliasi.classify.Classification; @@ -14,9 +20,13 @@ import com.aliasi.classify.Classified; import com.aliasi.classify.DynamicLMClassifier; import com.aliasi.lm.NGramProcessLM; import com.aliasi.util.Files; -import com.huaban.analysis.jieba.JiebaSegmenter; -import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; -import com.huaban.analysis.jieba.SegToken; + +import com.hankcs.hanlp.HanLP; +import com.hankcs.hanlp.seg.common.Term; + +//下面的分词器准确率最高,去除停用词反而准确率不高了。 +//import com.hankcs.hanlp.tokenizer.NLPTokenizer; + /** * ChinesePolarityBasic 
此类是利用lingpipe作中文情感预测的示例类 @@ -40,10 +50,7 @@ public class ChinesePolarityBasic { private static final String TESTFILES_INFO = "data/polarity_corpus/hotel_reviews/test2.rlabelclass"; private static final String ENCODING = "GBK"; - - private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter(); - private final SegMode segMode = SegMode.INDEX; - + public static void main(String[] args) { try { new ChinesePolarityBasic().run(); @@ -65,7 +72,11 @@ public class ChinesePolarityBasic { private void run() throws ClassNotFoundException, IOException { + + //训练数据 train(); + + //测试训练结果 evaluate(); } @@ -105,14 +116,14 @@ public class ChinesePolarityBasic { String review = Files.readFromFile(trainFile, fileEncoding); //此处加入中文分词器,得到分词之后的字符串 - List segTokens = jiebaSegmenter.process(review, segMode); - review = ""; - for (SegToken seg : segTokens) { - review += seg.word.getToken() + " "; - } + String segWords = ""; + List terms = HanLP.segment(review); + for (Term term : terms) + segWords += term.word + " "; + Classified classified - = new Classified(review,classification); + = new Classified(segWords,classification); mClassifier.handle(classified); } @@ -126,15 +137,13 @@ public class ChinesePolarityBasic { String review = Files.readFromFile(testFile, fileEncoding); //同理,这里可以加入分词器,这样可以试试效果如何。 - List segTokens = jiebaSegmenter.process(review, segMode); - review = ""; - for (SegToken seg : segTokens) { - review += seg.word.getToken() + " "; - } - + String segWords = ""; + List terms = HanLP.segment(review); + for (Term term : terms) + segWords += term.word + " "; ++numTests; Classification classification - = mClassifier.classify(review); + = mClassifier.classify(segWords); //得到训练结果 String resultCategory = classification.bestCategory();