|
|
@@ -7,6 +7,12 @@ import java.util.List; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import org.apache.commons.io.IOUtils; |
|
|
|
|
|
|
|
import com.aliasi.classify.Classification; |
|
|
@@ -14,9 +20,13 @@ import com.aliasi.classify.Classified; |
|
|
|
import com.aliasi.classify.DynamicLMClassifier; |
|
|
|
import com.aliasi.lm.NGramProcessLM; |
|
|
|
import com.aliasi.util.Files; |
|
|
|
import com.huaban.analysis.jieba.JiebaSegmenter; |
|
|
|
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; |
|
|
|
import com.huaban.analysis.jieba.SegToken; |
|
|
|
|
|
|
|
import com.hankcs.hanlp.HanLP; |
|
|
|
import com.hankcs.hanlp.seg.common.Term; |
|
|
|
|
|
|
|
//The tokenizer below yields the highest accuracy; removing stop words actually reduced accuracy.
|
|
|
//import com.hankcs.hanlp.tokenizer.NLPTokenizer; |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
* ChinesePolarityBasic — example class that uses LingPipe for Chinese sentiment polarity prediction
|
|
@@ -40,10 +50,7 @@ public class ChinesePolarityBasic { |
|
|
|
// Label file listing the test reviews and their gold polarity categories.
private static final String TESTFILES_INFO =

"data/polarity_corpus/hotel_reviews/test2.rlabelclass";

// Character encoding of the corpus files (GBK — common for Chinese corpora).
private static final String ENCODING = "GBK";

// Jieba segmenter used to tokenize Chinese review text before classification.
private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter();

// Segmentation mode passed to jiebaSegmenter.process(); INDEX mode here.
// NOTE(review): mode choice affects tokenization granularity — confirm this
// matches the mode used when the classifier was originally trained/evaluated.
private final SegMode segMode = SegMode.INDEX;
|
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) { |
|
|
|
try { |
|
|
|
new ChinesePolarityBasic().run(); |
|
|
@@ -65,7 +72,11 @@ public class ChinesePolarityBasic { |
|
|
|
|
|
|
|
/**
 * Runs the complete workflow: first trains the language-model classifier
 * on the training corpus, then evaluates it against the labelled test set.
 *
 * @throws ClassNotFoundException if a serialized classifier class cannot be resolved
 * @throws IOException if the training or test files cannot be read
 */
private void run() throws ClassNotFoundException,

IOException {

// Train the classifier on the training data
train();

// Evaluate the trained classifier on the test data
evaluate();
}
|
|
|
|
|
|
@@ -105,14 +116,14 @@ public class ChinesePolarityBasic { |
|
|
|
String review = Files.readFromFile(trainFile, fileEncoding); |
|
|
|
|
|
|
|
//此处加入中文分词器,得到分词之后的字符串 |
|
|
|
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode); |
|
|
|
review = ""; |
|
|
|
for (SegToken seg : segTokens) { |
|
|
|
review += seg.word.getToken() + " "; |
|
|
|
} |
|
|
|
|
|
|
|
String segWords = ""; |
|
|
|
List<Term> terms = HanLP.segment(review); |
|
|
|
for (Term term : terms) |
|
|
|
segWords += term.word + " "; |
|
|
|
|
|
|
|
Classified<CharSequence> classified |
|
|
|
= new Classified<CharSequence>(review,classification); |
|
|
|
= new Classified<CharSequence>(segWords,classification); |
|
|
|
mClassifier.handle(classified); |
|
|
|
} |
|
|
|
|
|
|
@@ -126,15 +137,13 @@ public class ChinesePolarityBasic { |
|
|
|
String review |
|
|
|
= Files.readFromFile(testFile, fileEncoding); |
|
|
|
//同理,这里可以加入分词器,这样可以试试效果如何。 |
|
|
|
List<SegToken> segTokens = jiebaSegmenter.process(review, segMode); |
|
|
|
review = ""; |
|
|
|
for (SegToken seg : segTokens) { |
|
|
|
review += seg.word.getToken() + " "; |
|
|
|
} |
|
|
|
|
|
|
|
String segWords = ""; |
|
|
|
List<Term> terms = HanLP.segment(review); |
|
|
|
for (Term term : terms) |
|
|
|
segWords += term.word + " "; |
|
|
|
++numTests; |
|
|
|
Classification classification |
|
|
|
= mClassifier.classify(review); |
|
|
|
= mClassifier.classify(segWords); |
|
|
|
//得到训练结果 |
|
|
|
String resultCategory |
|
|
|
= classification.bestCategory(); |
|
|
|