diff --git a/README.md b/README.md index 34963d6..1627125 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,18 @@ #本程序依赖data目录下面的data.zip和dictionary.zip先解压缩 data 目录下面的 data.zip到当前目录。 - +* 新增说明2:增加文本分类程序,目的是找出自己领域相关的文本,然后再从这个领域相关的文本中判断正负面。 +``` ++ 测试语料:data/text_classification.zip 解压缩即可 ++ 运行程序:LingPipeClassier 即可。 +``` * 新增说明1:2015-04-10测试了不用中文分词器,分词之后 LingPipe 情感分类的准确率,同时测试了去除停用词之后的情感分类的准确率。 +``` + 1) 发现用HanLP的NLPTokenizer分词器,准确率最高,但是速度有点慢。 + 2) 如果用HanLP的标准分词器就会准确率低一点点,但是速度快。 + 3) 分词之后去除停用词效果更加差。 + 4) 结巴分词效果不好,而且速度慢。 - +``` ###1、基于词典和贝叶斯模型的情感分析 主程序:eshore.cn.it.sentiment.Sentiment 此类通过 data/Sentiment_Dictionary中的正负面词语建立模型。 diff --git a/data/.gitignore b/data/.gitignore index f1e70dd..b9a67c0 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -3,4 +3,5 @@ nerws_corpus/ output/ polarity_corpus/ Sentiment_Dictionary/ -dictionary/ \ No newline at end of file +dictionary/ +text_classification/ \ No newline at end of file diff --git a/data/text_classification.zip b/data/text_classification.zip new file mode 100644 index 0000000..400ece0 Binary files /dev/null and b/data/text_classification.zip differ diff --git a/src/main/java/eshore/cn/it/classification/LingPipeClassier.java b/src/main/java/eshore/cn/it/classification/LingPipeClassier.java new file mode 100644 index 0000000..b7bfaae --- /dev/null +++ b/src/main/java/eshore/cn/it/classification/LingPipeClassier.java @@ -0,0 +1,123 @@ +package eshore.cn.it.classification; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import com.aliasi.classify.Classification; +import com.aliasi.classify.Classified; +import com.aliasi.classify.ConfusionMatrix; +import com.aliasi.classify.DynamicLMClassifier; +import com.aliasi.classify.JointClassification; +import com.aliasi.classify.JointClassifier; +import com.aliasi.classify.JointClassifierEvaluator; +import com.aliasi.lm.NGramProcessLM; +import com.aliasi.util.AbstractExternalizable; +import com.aliasi.util.Files; +import com.hankcs.hanlp.HanLP; +import com.hankcs.hanlp.seg.common.Term; + +/** + * 基于LingPipe的文本分类器,主要分类成两类 + * 一类: 关于政务的 + * 另一类: 非政务的 + * 采用的算法有: + * @author clebeg + * @time 2015-04-13 + * */ +public class LingPipeClassier { + private static String[] CATEGORIES = { + "government", + "others" + }; + private static int NGRAM_SIZE = 2; + + private static String TEXT_CLASSIFICATION_TRAINING = "data/text_classification/training"; + private static String TEXT_CLASSIFICATION_TESTING = "data/text_classification/testing"; + + private static DynamicLMClassifier classifier + = DynamicLMClassifier.createNGramProcess(CATEGORIES, NGRAM_SIZE); + + public static void main(String[] args) throws IOException, ClassNotFoundException { + trainModel(); + evaluate(); + } + public static void trainModel() throws IOException { + for(int i = 0; i < CATEGORIES.length; ++i) { + File classDir = new File(TEXT_CLASSIFICATION_TRAINING, CATEGORIES[i]); + if (!classDir.isDirectory()) { + String msg = "Could not find training directory=" + + classDir + + "\nHave you unpacked " + + CATEGORIES.length + + "groups?"; + System.out.println(msg); // in case exception gets lost in shell + throw new IllegalArgumentException(msg); + } + + String[] trainingFiles = classDir.list(); + for (int j = 0; j < trainingFiles.length; ++j) { + File file = new File(classDir, trainingFiles[j]); + String text = Files.readFromFile(file, "GBK"); + System.out.println("Training on " + CATEGORIES[i] + "/" + trainingFiles[j]); + + String segWords = ""; + List terms = HanLP.segment(text); + for (Term term : terms) + segWords += term.word + " "; + + Classification classification + = new Classification(CATEGORIES[i]); + Classified classified + = new Classified(segWords, classification); + classifier.handle(classified); + } + } + } + + public static void evaluate() throws IOException, ClassNotFoundException { + //compiling + System.out.println("Compiling"); + @SuppressWarnings("unchecked") // we created object so know it's safe + JointClassifier compiledClassifier + = (JointClassifier) + AbstractExternalizable.compile(classifier); + + boolean storeCategories = true; + JointClassifierEvaluator evaluator + = new JointClassifierEvaluator(compiledClassifier, + CATEGORIES, + storeCategories); + for(int i = 0; i < CATEGORIES.length; ++i) { + File classDir = new File(TEXT_CLASSIFICATION_TESTING, CATEGORIES[i]); + String[] testingFiles = classDir.list(); + for (int j=0; j < testingFiles.length; ++j) { + String text + = Files.readFromFile(new File(classDir,testingFiles[j]),"ISO-8859-1"); + System.out.print("Testing on " + CATEGORIES[i] + "/" + testingFiles[j] + " "); + Classification classification + = new Classification(CATEGORIES[i]); + + String segWords = ""; + List terms = HanLP.segment(text); + for (Term term : terms) + segWords += term.word + " "; + + Classified classified + = new Classified(segWords, classification); + evaluator.handle(classified); + JointClassification jc = + compiledClassifier.classify(text); + String bestCategory = jc.bestCategory(); + //String details = jc.toString(); + System.out.println("Got best category of: " + bestCategory); + System.out.println(jc.toString()); + System.out.println("---------------"); + } + } + ConfusionMatrix confMatrix = evaluator.confusionMatrix(); + System.out.println("Total Accuracy: " + confMatrix.totalAccuracy()); + System.out.println("\nFULL EVAL"); + System.out.println(evaluator); + } +}