|
@@ -0,0 +1,123 @@ |
|
|
|
|
|
package eshore.cn.it.classification; |
|
|
|
|
|
|
|
|
|
|
|
import java.io.File; |
|
|
|
|
|
import java.io.IOException; |
|
|
|
|
|
import java.util.List; |
|
|
|
|
|
|
|
|
|
|
|
import com.aliasi.classify.Classification; |
|
|
|
|
|
import com.aliasi.classify.Classified; |
|
|
|
|
|
import com.aliasi.classify.ConfusionMatrix; |
|
|
|
|
|
import com.aliasi.classify.DynamicLMClassifier; |
|
|
|
|
|
import com.aliasi.classify.JointClassification; |
|
|
|
|
|
import com.aliasi.classify.JointClassifier; |
|
|
|
|
|
import com.aliasi.classify.JointClassifierEvaluator; |
|
|
|
|
|
import com.aliasi.lm.NGramProcessLM; |
|
|
|
|
|
import com.aliasi.util.AbstractExternalizable; |
|
|
|
|
|
import com.aliasi.util.Files; |
|
|
|
|
|
import com.hankcs.hanlp.HanLP; |
|
|
|
|
|
import com.hankcs.hanlp.seg.common.Term; |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
|
* 基于LingPipe的文本分类器,主要分类成两类 |
|
|
|
|
|
* 一类: 关于政务的 |
|
|
|
|
|
* 另一类: 非政务的 |
|
|
|
|
|
* 采用的算法有: |
|
|
|
|
|
* @author clebeg |
|
|
|
|
|
* @time 2015-04-13 |
|
|
|
|
|
* */ |
|
|
|
|
|
public class LingPipeClassier { |
|
|
|
|
|
private static String[] CATEGORIES = { |
|
|
|
|
|
"government", |
|
|
|
|
|
"others" |
|
|
|
|
|
}; |
|
|
|
|
|
private static int NGRAM_SIZE = 2; |
|
|
|
|
|
|
|
|
|
|
|
private static String TEXT_CLASSIFICATION_TRAINING = "data/text_classification/training"; |
|
|
|
|
|
private static String TEXT_CLASSIFICATION_TESTING = "data/text_classification/testing"; |
|
|
|
|
|
|
|
|
|
|
|
private static DynamicLMClassifier<NGramProcessLM> classifier |
|
|
|
|
|
= DynamicLMClassifier.createNGramProcess(CATEGORIES, NGRAM_SIZE); |
|
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) throws IOException, ClassNotFoundException { |
|
|
|
|
|
trainModel(); |
|
|
|
|
|
evaluate(); |
|
|
|
|
|
} |
|
|
|
|
|
public static void trainModel() throws IOException { |
|
|
|
|
|
for(int i = 0; i < CATEGORIES.length; ++i) { |
|
|
|
|
|
File classDir = new File(TEXT_CLASSIFICATION_TRAINING, CATEGORIES[i]); |
|
|
|
|
|
if (!classDir.isDirectory()) { |
|
|
|
|
|
String msg = "Could not find training directory=" |
|
|
|
|
|
+ classDir |
|
|
|
|
|
+ "\nHave you unpacked " |
|
|
|
|
|
+ CATEGORIES.length |
|
|
|
|
|
+ "groups?"; |
|
|
|
|
|
System.out.println(msg); // in case exception gets lost in shell |
|
|
|
|
|
throw new IllegalArgumentException(msg); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
String[] trainingFiles = classDir.list(); |
|
|
|
|
|
for (int j = 0; j < trainingFiles.length; ++j) { |
|
|
|
|
|
File file = new File(classDir, trainingFiles[j]); |
|
|
|
|
|
String text = Files.readFromFile(file, "GBK"); |
|
|
|
|
|
System.out.println("Training on " + CATEGORIES[i] + "/" + trainingFiles[j]); |
|
|
|
|
|
|
|
|
|
|
|
String segWords = ""; |
|
|
|
|
|
List<Term> terms = HanLP.segment(text); |
|
|
|
|
|
for (Term term : terms) |
|
|
|
|
|
segWords += term.word + " "; |
|
|
|
|
|
|
|
|
|
|
|
Classification classification |
|
|
|
|
|
= new Classification(CATEGORIES[i]); |
|
|
|
|
|
Classified<CharSequence> classified |
|
|
|
|
|
= new Classified<CharSequence>(segWords, classification); |
|
|
|
|
|
classifier.handle(classified); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
public static void evaluate() throws IOException, ClassNotFoundException { |
|
|
|
|
|
//compiling |
|
|
|
|
|
System.out.println("Compiling"); |
|
|
|
|
|
@SuppressWarnings("unchecked") // we created object so know it's safe |
|
|
|
|
|
JointClassifier<CharSequence> compiledClassifier |
|
|
|
|
|
= (JointClassifier<CharSequence>) |
|
|
|
|
|
AbstractExternalizable.compile(classifier); |
|
|
|
|
|
|
|
|
|
|
|
boolean storeCategories = true; |
|
|
|
|
|
JointClassifierEvaluator<CharSequence> evaluator |
|
|
|
|
|
= new JointClassifierEvaluator<CharSequence>(compiledClassifier, |
|
|
|
|
|
CATEGORIES, |
|
|
|
|
|
storeCategories); |
|
|
|
|
|
for(int i = 0; i < CATEGORIES.length; ++i) { |
|
|
|
|
|
File classDir = new File(TEXT_CLASSIFICATION_TESTING, CATEGORIES[i]); |
|
|
|
|
|
String[] testingFiles = classDir.list(); |
|
|
|
|
|
for (int j=0; j < testingFiles.length; ++j) { |
|
|
|
|
|
String text |
|
|
|
|
|
= Files.readFromFile(new File(classDir,testingFiles[j]),"ISO-8859-1"); |
|
|
|
|
|
System.out.print("Testing on " + CATEGORIES[i] + "/" + testingFiles[j] + " "); |
|
|
|
|
|
Classification classification |
|
|
|
|
|
= new Classification(CATEGORIES[i]); |
|
|
|
|
|
|
|
|
|
|
|
String segWords = ""; |
|
|
|
|
|
List<Term> terms = HanLP.segment(text); |
|
|
|
|
|
for (Term term : terms) |
|
|
|
|
|
segWords += term.word + " "; |
|
|
|
|
|
|
|
|
|
|
|
Classified<CharSequence> classified |
|
|
|
|
|
= new Classified<CharSequence>(segWords, classification); |
|
|
|
|
|
evaluator.handle(classified); |
|
|
|
|
|
JointClassification jc = |
|
|
|
|
|
compiledClassifier.classify(text); |
|
|
|
|
|
String bestCategory = jc.bestCategory(); |
|
|
|
|
|
//String details = jc.toString(); |
|
|
|
|
|
System.out.println("Got best category of: " + bestCategory); |
|
|
|
|
|
System.out.println(jc.toString()); |
|
|
|
|
|
System.out.println("---------------"); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
ConfusionMatrix confMatrix = evaluator.confusionMatrix(); |
|
|
|
|
|
System.out.println("Total Accuracy: " + confMatrix.totalAccuracy()); |
|
|
|
|
|
System.out.println("\nFULL EVAL"); |
|
|
|
|
|
System.out.println(evaluator); |
|
|
|
|
|
} |
|
|
|
|
|
} |