Browse Source

add text classification to model

fetches/sdlf/master
gitclebeg 9 years ago
parent
commit
25abd088f2
4 changed files with 132 additions and 3 deletions
  1. +7
    -2
      README.md
  2. +2
    -1
      data/.gitignore
  3. BIN
      data/text_classification.zip
  4. +123
    -0
      src/main/java/eshore/cn/it/classification/LingPipeClassier.java

+ 7
- 2
README.md View File

@@ -2,13 +2,18 @@
#本程序依赖 data 目录下面的 data.zip 和 dictionary.zip,使用前请先将 data 目录下面的 data.zip 解压缩到当前目录。



* 新增说明2:增加文本分类程序,目的是找出自己领域相关的文本,然后再从这个领域相关的文本中判断正负面。
```
+ 测试语料:data/text_classification.zip 解压缩即可
+ 运行程序:LingPipeClassier 即可。
```
* 新增说明1:2015-04-10测试了不用中文分词器,分词之后 LingPipe 情感分类的准确率,同时测试了去除停用词之后的情感分类的准确率。
```
+ 1) 发现用HanLP的NLPTokenizer分词器,准确率最高,但是速度有点慢。
+ 2) 如果用HanLP的标准分词器就会准确率低一点点,但是速度快。
+ 3) 分词之后去除停用词效果更加差。
+ 4) 结巴分词效果不好,而且速度慢。
```
###1、基于词典和贝叶斯模型的情感分析
主程序:eshore.cn.it.sentiment.Sentiment 此类通过
data/Sentiment_Dictionary中的正负面词语建立模型。


+ 2
- 1
data/.gitignore View File

@@ -3,4 +3,5 @@ nerws_corpus/
output/
polarity_corpus/
Sentiment_Dictionary/
dictionary/
dictionary/
text_classification/

BIN
data/text_classification.zip View File


+ 123
- 0
src/main/java/eshore/cn/it/classification/LingPipeClassier.java View File

@@ -0,0 +1,123 @@
package eshore.cn.it.classification;

import java.io.File;
import java.io.IOException;
import java.util.List;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classified;
import com.aliasi.classify.ConfusionMatrix;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.classify.JointClassification;
import com.aliasi.classify.JointClassifier;
import com.aliasi.classify.JointClassifierEvaluator;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Files;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

/**
 * Text classifier built on LingPipe that sorts documents into two categories:
 * "government" (domain-relevant) and "others". The intent (per the commit
 * notes) is to first find domain-relevant texts, then run sentiment analysis
 * on that subset.
 *
 * Training and evaluation texts are segmented with HanLP before being fed to
 * a character/token n-gram language-model classifier
 * ({@link DynamicLMClassifier} over {@link NGramProcessLM}).
 *
 * @author clebeg
 * @time 2015-04-13
 * */
public class LingPipeClassier {
    /** The two target categories; also the names of the per-category corpus subdirectories. */
    private static final String[] CATEGORIES = {
            "government",
            "others"
    };
    /** N-gram size for the language-model classifier. */
    private static final int NGRAM_SIZE = 2;
    /** Root of the training corpus: one subdirectory per category. */
    private static final String TEXT_CLASSIFICATION_TRAINING = "data/text_classification/training";
    /** Root of the testing corpus: one subdirectory per category. */
    private static final String TEXT_CLASSIFICATION_TESTING = "data/text_classification/testing";

    private static final DynamicLMClassifier<NGramProcessLM> classifier =
            DynamicLMClassifier.createNGramProcess(CATEGORIES, NGRAM_SIZE);

    public static void main(String[] args) throws IOException, ClassNotFoundException {
        trainModel();
        evaluate();
    }

    /**
     * Trains the n-gram LM classifier on every file under
     * {@code TEXT_CLASSIFICATION_TRAINING/<category>}. Files are read as GBK
     * and segmented with HanLP before being handed to the classifier.
     *
     * @throws IOException if a training file cannot be read
     * @throws IllegalArgumentException if a category directory is missing
     */
    public static void trainModel() throws IOException {
        for (int i = 0; i < CATEGORIES.length; ++i) {
            File classDir = new File(TEXT_CLASSIFICATION_TRAINING, CATEGORIES[i]);
            if (!classDir.isDirectory()) {
                String msg = "Could not find training directory="
                        + classDir
                        + "\nHave you unpacked "
                        + CATEGORIES.length
                        + " groups?"; // fixed: original was missing the space before "groups"
                System.out.println(msg); // in case exception gets lost in shell
                throw new IllegalArgumentException(msg);
            }

            // File.list() returns null on I/O error; guard against NPE.
            String[] trainingFiles = classDir.list();
            if (trainingFiles == null) {
                throw new IOException("Could not list training directory=" + classDir);
            }
            for (int j = 0; j < trainingFiles.length; ++j) {
                File file = new File(classDir, trainingFiles[j]);
                String text = Files.readFromFile(file, "GBK");
                System.out.println("Training on " + CATEGORIES[i] + "/" + trainingFiles[j]);
                String segWords = segmentWords(text);
                Classification classification = new Classification(CATEGORIES[i]);
                Classified<CharSequence> classified =
                        new Classified<CharSequence>(segWords, classification);
                classifier.handle(classified);
            }
        }
    }

    /**
     * Compiles the trained classifier and evaluates it on every file under
     * {@code TEXT_CLASSIFICATION_TESTING/<category>}, printing per-file
     * results and the overall confusion-matrix accuracy.
     *
     * @throws IOException if a testing file cannot be read
     * @throws ClassNotFoundException if compiling the classifier fails on deserialization
     */
    public static void evaluate() throws IOException, ClassNotFoundException {
        //compiling
        System.out.println("Compiling");
        @SuppressWarnings("unchecked") // we created object so know it's safe
        JointClassifier<CharSequence> compiledClassifier =
                (JointClassifier<CharSequence>) AbstractExternalizable.compile(classifier);

        boolean storeCategories = true;
        JointClassifierEvaluator<CharSequence> evaluator =
                new JointClassifierEvaluator<CharSequence>(compiledClassifier,
                        CATEGORIES,
                        storeCategories);
        for (int i = 0; i < CATEGORIES.length; ++i) {
            File classDir = new File(TEXT_CLASSIFICATION_TESTING, CATEGORIES[i]);
            String[] testingFiles = classDir.list();
            if (testingFiles == null) {
                throw new IOException("Could not list testing directory=" + classDir);
            }
            for (int j = 0; j < testingFiles.length; ++j) {
                // fixed: testing files were read as ISO-8859-1 while training
                // used GBK, which mangles Chinese text; both must use GBK.
                String text = Files.readFromFile(new File(classDir, testingFiles[j]), "GBK");
                System.out.print("Testing on " + CATEGORIES[i] + "/" + testingFiles[j] + " ");
                Classification classification = new Classification(CATEGORIES[i]);
                String segWords = segmentWords(text);
                Classified<CharSequence> classified =
                        new Classified<CharSequence>(segWords, classification);
                evaluator.handle(classified);
                // fixed: classify the segmented text, consistent with training
                // and with what the evaluator was just handed; the original
                // classified the raw unsegmented text here, so the printed
                // best category could disagree with the evaluator's tally.
                JointClassification jc = compiledClassifier.classify(segWords);
                String bestCategory = jc.bestCategory();
                System.out.println("Got best category of: " + bestCategory);
                System.out.println(jc.toString());
                System.out.println("---------------");
            }
        }
        ConfusionMatrix confMatrix = evaluator.confusionMatrix();
        System.out.println("Total Accuracy: " + confMatrix.totalAccuracy());
        System.out.println("\nFULL EVAL");
        System.out.println(evaluator);
    }

    /**
     * Segments {@code text} with HanLP's default segmenter and joins the
     * resulting words with single spaces (trailing space included, matching
     * the original inline concatenation).
     */
    private static String segmentWords(String text) {
        StringBuilder sb = new StringBuilder();
        List<Term> terms = HanLP.segment(text);
        for (Term term : terms) {
            sb.append(term.word).append(' ');
        }
        return sb.toString();
    }
}

Loading…
Cancel
Save