From 698b998b1597aebc21b2a07c9544bee548cf5225 Mon Sep 17 00:00:00 2001 From: gitclebeg Date: Mon, 13 Apr 2015 17:24:02 +0800 Subject: [PATCH] add new text classification base on tf-idf --- README.md | 7 +- .../cn/it/classification/DfIdfClassier.java | 157 +++++++ ...ngPipeClassier.java => NGramClassier.java} | 7 +- .../cn/it/classification/StringTools.java | 393 ++++++++++++++++++ 4 files changed, 558 insertions(+), 6 deletions(-) create mode 100644 src/main/java/eshore/cn/it/classification/DfIdfClassier.java rename src/main/java/eshore/cn/it/classification/{LingPipeClassier.java => NGramClassier.java} (98%) create mode 100644 src/main/java/eshore/cn/it/classification/StringTools.java diff --git a/README.md b/README.md index a66227e..9c9a5c3 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ ##基于自然语言处理的情感分析工具 ###本程序依赖data目录下面的data.zip和dictionary.zip先解压缩 data 目录下面的 data.zip到当前目录。 +* 新增说明3:增加基于 TF-IDF(词向量) 特征的文本分类程序。 +1. 主程序:DfIdfClassifier -* 新增说明2:增加文本分类程序,目的是找出自己领域相关的文本,然后再从这个领域相关的文本中判断正负面。 - +* 新增说明2:增加基于 N-Gram(词向量) 特征的文本分类程序,目的是找出自己领域相关的文本,然后再从这个领域相关的文本中判断正负面。 1. 测试语料:data/text_classification.zip 解压缩即可 -2. 运行程序:LingPipeClassier 即可。 +2. 运行程序:NGramClassifier 即可。 * 新增说明1:2015-04-10测试了不用中文分词器,分词之后 LingPipe 情感分类的准确率,同时测试了去除停用词之后的情感分类的准确率。 diff --git a/src/main/java/eshore/cn/it/classification/DfIdfClassier.java b/src/main/java/eshore/cn/it/classification/DfIdfClassier.java new file mode 100644 index 0000000..125d6b6 --- /dev/null +++ b/src/main/java/eshore/cn/it/classification/DfIdfClassier.java @@ -0,0 +1,157 @@ +package eshore.cn.it.classification; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.text.NumberFormat; +import java.util.List; + +import com.aliasi.classify.Classification; +import com.aliasi.classify.Classified; +import com.aliasi.classify.ConfusionMatrix; +import com.aliasi.classify.ScoredClassification; +import com.aliasi.classify.ScoredClassifier; +import com.aliasi.classify.TfIdfClassifierTrainer; +import com.aliasi.tokenizer.CharacterTokenizerFactory; +import com.aliasi.tokenizer.TokenFeatureExtractor; +import com.aliasi.util.Files; +import com.hankcs.hanlp.HanLP; +import com.hankcs.hanlp.seg.common.Term; + +/** + * 基于LingPipe的文本分类器,主要分类成两类 + * 一类: 关于政务的 + * 另一类: 非政务的 + * 采用的算法有: + * @author clebeg + * @time 2015-04-13 + * */ +public class DfIdfClassier { + private static String[] CATEGORIES = { + "government", + "others" + }; + + private static String TEXT_CLASSIFICATION_TRAINING = "data/text_classification/training"; + private static String TEXT_CLASSIFICATION_TESTING = "data/text_classification/testing"; + private static String MODEL_FILE = "Model/tfidf_classifier"; + + private static TfIdfClassifierTrainer classifier = new TfIdfClassifierTrainer( + new TokenFeatureExtractor(CharacterTokenizerFactory.INSTANCE)); + + public static void main(String[] args) throws IOException, ClassNotFoundException { + trainModel(true); + evaluate(); + } + public static void trainModel(boolean needStoreModle) throws IOException { + for(int i = 0; i < CATEGORIES.length; ++i) { + File classDir = new File(TEXT_CLASSIFICATION_TRAINING, CATEGORIES[i]); + if (!classDir.isDirectory()) { + String msg = "Could not find training directory=" + + classDir + + "\nHave you unpacked " + + CATEGORIES.length + + "groups?"; + System.out.println(msg); // in case exception gets lost in shell + throw new IllegalArgumentException(msg); + } + + String[] 
trainingFiles = classDir.list(); + for (int j = 0; j < trainingFiles.length; ++j) { + File file = new File(classDir, trainingFiles[j]); + String text = Files.readFromFile(file, "GBK"); + System.out.println("Training on " + CATEGORIES[i] + "/" + trainingFiles[j]); + + String segWords = ""; + List terms = HanLP.segment(text); + for (Term term : terms) + segWords += term.word + " "; + + Classification classification + = new Classification(CATEGORIES[i]); + Classified classified + = new Classified(segWords, classification); + classifier.handle(classified); + } + } + if (needStoreModle) { + System.out.println("开始保存分类器到 " + MODEL_FILE); + ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream( + MODEL_FILE)); + classifier.compileTo(os); + os.close(); + System.out.println("分类器保存完成"); + } + } + + @SuppressWarnings("unchecked") + public static void evaluate() throws IOException, ClassNotFoundException { + ScoredClassifier compiledClassifier = null; + try { + ObjectInputStream oi = new ObjectInputStream(new FileInputStream( + MODEL_FILE)); + compiledClassifier = (ScoredClassifier) oi + .readObject(); + oi.close(); + } catch (IOException ie) { + System.out.println("IO Error: Model file " + MODEL_FILE + " missing"); + } + // 遍历分类目录中的文件测试分类准确度 + ConfusionMatrix confMatrix = new ConfusionMatrix(CATEGORIES); + NumberFormat nf = NumberFormat.getInstance(); + nf.setMaximumIntegerDigits(1); + nf.setMaximumFractionDigits(3); + for(int i = 0; i < CATEGORIES.length; ++i) { + File classDir = new File(TEXT_CLASSIFICATION_TESTING, CATEGORIES[i]); + String[] testingFiles = classDir.list(); + for (int j = 0; j < testingFiles.length; ++j) { + String text + = Files.readFromFile(new File(classDir, testingFiles[j]),"GBK"); + String segWords = ""; + List terms = HanLP.segment(text); + for (Term term : terms) + segWords += term.word + " "; + + System.out.println("测试 " + CATEGORIES[i] + + File.separator + testingFiles[j]); + + ScoredClassification classification = compiledClassifier + .classify(segWords.subSequence(0, text.length())); + confMatrix.increment(CATEGORIES[i], + classification.bestCategory()); + System.out.println("最适合的分类: " + + classification.bestCategory()); + } + } + + System.out.println("--------------------------------------------"); + System.out.println("- 结果 "); + System.out.println("--------------------------------------------"); + int[][] imatrix = confMatrix.matrix(); + StringBuffer sb = new StringBuffer(); + sb.append(StringTools.fillin("CATEGORY", 10, true, ' ')); + for (int i = 0; i < CATEGORIES.length; i++) + sb.append(StringTools.fillin(CATEGORIES[i], 8, false, ' ')); + System.out.println(sb.toString()); + + for (int i = 0; i < imatrix.length; i++) { + sb = new StringBuffer(); + sb.append(StringTools.fillin(CATEGORIES[i], 10, true, ' ', + 10 - CATEGORIES[i].length())); + for (int j = 0; j < imatrix.length; j++) { + String out = "" + imatrix[i][j]; + sb.append(StringTools.fillin(out, 8, false, ' ', + 8 - out.length())); + } + System.out.println(sb.toString()); + } + + System.out.println("准确度: " + + nf.format(confMatrix.totalAccuracy())); + System.out.println("总共正确数 : " + confMatrix.totalCorrect()); + System.out.println("总数:" + confMatrix.totalCount()); + } +} diff --git a/src/main/java/eshore/cn/it/classification/LingPipeClassier.java b/src/main/java/eshore/cn/it/classification/NGramClassier.java similarity index 98% rename from src/main/java/eshore/cn/it/classification/LingPipeClassier.java rename to src/main/java/eshore/cn/it/classification/NGramClassier.java index b7bfaae..44e54e6 
100644 --- a/src/main/java/eshore/cn/it/classification/LingPipeClassier.java +++ b/src/main/java/eshore/cn/it/classification/NGramClassier.java @@ -25,12 +25,12 @@ import com.hankcs.hanlp.seg.common.Term; * @author clebeg * @time 2015-04-13 * */ -public class LingPipeClassier { +public class NGramClassier { private static String[] CATEGORIES = { "government", "others" }; - private static int NGRAM_SIZE = 2; + private static int NGRAM_SIZE = 3; private static String TEXT_CLASSIFICATION_TRAINING = "data/text_classification/training"; private static String TEXT_CLASSIFICATION_TESTING = "data/text_classification/testing"; @@ -38,6 +38,7 @@ public class LingPipeClassier { private static DynamicLMClassifier classifier = DynamicLMClassifier.createNGramProcess(CATEGORIES, NGRAM_SIZE); + public static void main(String[] args) throws IOException, ClassNotFoundException { trainModel(); evaluate(); @@ -93,7 +94,7 @@ public class LingPipeClassier { String[] testingFiles = classDir.list(); for (int j=0; j < testingFiles.length; ++j) { String text - = Files.readFromFile(new File(classDir,testingFiles[j]),"ISO-8859-1"); + = Files.readFromFile(new File(classDir,testingFiles[j]),"GBK"); System.out.print("Testing on " + CATEGORIES[i] + "/" + testingFiles[j] + " "); Classification classification = new Classification(CATEGORIES[i]); diff --git a/src/main/java/eshore/cn/it/classification/StringTools.java b/src/main/java/eshore/cn/it/classification/StringTools.java new file mode 100644 index 0000000..0184ec5 --- /dev/null +++ b/src/main/java/eshore/cn/it/classification/StringTools.java @@ -0,0 +1,393 @@ +package eshore.cn.it.classification; +import java.io.IOException; +import java.io.Reader; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.aliasi.spell.EditDistance; + +/** + * A class containing a bunch of string utilities -
+ * a. filterChars: Remove extraneous characters from a string and return a
+ * "clean" string.
+ * b. getSuffix: Given a file name, return its extension.
+ * c. fillin: Pad or truncate a string to a fixed number of characters.
+ * d. removeAmpersandStrings: Remove HTML entity strings that start with an ampersand.
+ * e. shaDigest: Compute the 40-character hex SHA-1 digest of a string.
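+ * Note that in the five-argument form of fillin it is numFills, not len, that
+ * controls the amount of padding: fillin("cat", 8, true, ' ', 5) appends exactly
+ * five spaces, giving "cat     ".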
+ */ +public class StringTools { + public static final Locale LOCALE = new Locale("en"); + // * -- String limit for StringTools + private static int STRING_TOOLS_LIMIT = 1000000; + // *-- pre-compiled RE patterns + private static Pattern extPattern = Pattern.compile("^.*[.](.*?){1}quot;"); + private static Pattern spacesPattern = Pattern.compile("\\s+"); + private static Pattern removeAmpersandPattern = Pattern.compile("&[^;]*?;"); + + /** + * Removes non-printable spaces and replaces with a single space + * + * @param in + * String with mixed characters + * @return String with collapsed spaces and printable characters + */ + public static String filterChars(String in) { + return (filterChars(in, "", ' ', true)); + } + + public static String filterChars(String in, boolean newLine) { + return (filterChars(in, "", ' ', newLine)); + } + + public static String filterChars(String in, String badChars) { + return (filterChars(in, badChars, ' ', true)); + } + + public static String filterChars(String in, char replaceChar) { + return (filterChars(in, "", replaceChar, true)); + } + + public static String filterChars(String in, String badChars, + char replaceChar, boolean newLine) { + if (in == null) + return ""; + int inLen = in.length(); + if (inLen > STRING_TOOLS_LIMIT) + return in; + try { + // **-- replace non-recognizable characters with spaces + StringBuffer out = new StringBuffer(); + int badLen = badChars.length(); + for (int i = 0; i < inLen; i++) { + char ch = in.charAt(i); + if ((badLen != 0) && removeChar(ch, badChars)) { + ch = replaceChar; + } else if (!Character.isDefined(ch) && !Character.isSpaceChar(ch)) { + ch = replaceChar; + } + out.append(ch); + } + + // *-- replace new lines with space + Matcher matcher = null; + in = out.toString(); + + // *-- replace consecutive spaces with single space and remove + // leading/trailing spaces + in = in.trim(); + matcher = spacesPattern.matcher(in); + in = matcher.replaceAll(" "); + } catch (OutOfMemoryError e) { + return in; + } + + return in; + } + + // *-- remove any chars found in the badChars string + private static boolean removeChar(char ch, String badChars) { + if (badChars.length() == 0) + return false; + for (int i = 0; i < badChars.length(); i++) { + if (ch == badChars.charAt(i)) + return true; + } + return false; + } + + /** + * Return the extension of a file, if possible. 
+ * + * @param filename + * @return string + */ + public static String getSuffix(String filename) { + if (filename.length() > STRING_TOOLS_LIMIT) + return (""); + Matcher matcher = extPattern.matcher(filename); + if (!matcher.matches()) + return ""; + return (matcher.group(1).toLowerCase(LOCALE)); + } + + public static String fillin(String in, int len) { + return fillin(in, len, true, ' ', 3); + } + + public static String fillin(String in, int len, char fillinChar) { + return fillin(in, len, true, fillinChar, 3); + } + + public static String fillin(String in, int len, boolean right) { + return fillin(in, len, right, ' ', 3); + } + + public static String fillin(String in, int len, boolean right, char fillinChar) { + return fillin(in, len, right, fillinChar, 3); + } + + /** + * Return a string concatenated or padded to the specified length + * + * @param in + * string to be truncated or padded + * @param len + * int length for string + * @param right + * boolean fillin from the left or right + * @param fillinChar + * char to pad the string + * @param numFills + * int number of characters to pad + * @return String of specified length + */ + public static String fillin(String in, int len, boolean right, + char fillinChar, int numFills) { + // *-- return if string is of required length + int slen = in.length(); + if ((slen == len) || (slen > STRING_TOOLS_LIMIT)) + return (in); + + // *-- build the fillin string + StringBuffer fillinStb = new StringBuffer(); + for (int i = 0; i < numFills; i++) + fillinStb.append(fillinChar); + String fillinString = fillinStb.toString(); + + // *-- truncate and pad string if length exceeds required length + if (slen > len) { + if (right) + return (in.substring(0, len - numFills) + fillinString); + else + return (fillinString + in.substring(slen - len + numFills, slen)); + } + + // *-- pad string if length is less than required length DatabaseEntry + // dbe = dbt.getNextKey(); String dbkey = new String (dbe.getData()); + StringBuffer sb = new StringBuffer(); + if (right) + sb.append(in); + sb.append(fillinString); + if (!right) + sb.append(in); + return (sb.toString()); + } + + /** + * Remove ampersand strings such as \  + * + * @param in + * Text string extracted from Web pages + * @return String Text string without ampersand strings + */ + public static String removeAmpersandStrings(String in) { + if (in.length() > STRING_TOOLS_LIMIT) + return (in); + Matcher matcher = removeAmpersandPattern.matcher(in); + return (matcher.replaceAll("")); + } + + /** + * Escape back slashes + * + * @param in + * Text to be escaped + * @return String Escaped test + */ + public static String escapeText(String in) { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < in.length(); i++) { + char ch = in.charAt(i); + if (ch == '\\') + sb.append("\\\\"); + else + sb.append(ch); + } + return (sb.toString()); + } + + /** + * Get the SHA signature of a string + * + * @param in + * String + * @return String SHA signature of in + */ + public static String shaDigest(String in) { + StringBuffer out = new StringBuffer(); + if ((in == null) || (in.length() == 0)) + return (""); + try { + // *-- create a message digest instance and compute the hash + // byte array + MessageDigest md = MessageDigest.getInstance("SHA-1"); + md.reset(); + md.update(in.getBytes()); + byte[] hash = md.digest(); + + // *--- Convert the hash byte array to hexadecimal format, pad + // hex chars with leading zeroes + // *--- to get a signature of consistent length (40) for all + // strings. 
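+ // *--- For example, a hash byte of 0x0a converts to the single hex digit "a";
+ // fillin("a", 2, false, '0', 1) prepends one '0' to give "0a", so each of the
+ // 20 SHA-1 bytes contributes exactly two hex characters to the 40-character digest.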
+ for (int i = 0; i < hash.length; i++) { + out.append(fillin(Integer.toString(0xFF & hash[i], 16), 2, false, '0', + 1)); + } + } catch (OutOfMemoryError e) { + return ("<-------------OUT_OF_MEMORY------------>"); + } catch (NoSuchAlgorithmException e) { + return ("<------SHA digest algorithm not found--->"); + } + + return (out.toString()); + } + + /** + * Return the string with the first letter upper cased + * + * @param in + * @return String + */ + public static String firstLetterUC(String in) { + if ((in == null) || (in.length() == 0)) + return (""); + String out = in.toLowerCase(LOCALE); + String part1 = out.substring(0, 1); + String part2 = out.substring(1, in.length()); + return (part1.toUpperCase(LOCALE) + part2.toLowerCase(LOCALE)); + } + + /** + * Return a pattern that can be used to collapse consecutive patterns of the + * same type + * + * @param entityTypes + * A list of entity types + * @return Regex pattern for the entity types + */ + public static Pattern getCollapsePattern(String[] entityTypes) { + Pattern collapsePattern = null; + StringBuffer collapseStr = new StringBuffer(); + for (int i = 0; i < entityTypes.length; i++) { + collapseStr.append("(<\\/"); + collapseStr.append(entityTypes[i]); + collapseStr.append(">\\s+"); + collapseStr.append("<"); + collapseStr.append(entityTypes[i]); + collapseStr.append(">)|"); + } + collapsePattern = Pattern.compile(collapseStr.toString().substring(0, + collapseStr.length() - 1)); + return (collapsePattern); + } + + /** + * return a double that indicates the degree of similarity between two strings + * Use the Jaccard similarity, i.e. the ratio of A intersection B to A union B + * + * @param first + * string + * @param second + * string + * @return double degreee of similarity + */ + public static double stringSimilarity(String first, String second) { + if ((first == null) || (second == null)) + return (0.0); + String[] a = first.split("\\s+"); + String[] b = second.split("\\s+"); + + // *-- compute a union b + HashSet aUnionb = new HashSet(); + HashSet aTokens = new HashSet(); + HashSet bTokens = new HashSet(); + for (int i = 0; i < a.length; i++) { + aUnionb.add(a[i]); + aTokens.add(a[i]); + } + for (int i = 0; i < b.length; i++) { + aUnionb.add(b[i]); + bTokens.add(b[i]); + } + int sizeAunionB = aUnionb.size(); + + // *-- compute a intersect b + Iterator iter = aUnionb.iterator(); + int sizeAinterB = 0; + while (iter != null && iter.hasNext()) { + String token = (String) iter.next(); + if (aTokens.contains(token) && bTokens.contains(token)) + sizeAinterB++; + } + return ((sizeAunionB > 0) ? 
(sizeAinterB + 0.0) / sizeAunionB : 0.0); + } + + /** + * Return the edit distance between the two strings + * + * @param s1 + * @param s2 + * @return double + */ + public static double editDistance(String s1, String s2) { + if ((s1.length() == 0) || (s2.length() == 0)) + return (0.0); + return EditDistance.editDistance(s1.subSequence(0, s1.length()), s2 + .subSequence(0, s2.length()), false); + } + + /** + * Return a string with the contents from the passed reader + * + * @param r Reader + * @return String + */ + public static String readerToString(Reader r) { + int charValue; + StringBuffer sb = new StringBuffer(1024); + try { + while ((charValue = r.read()) != -1) + sb.append((char) charValue); + } catch (IOException ie) { + sb.setLength(0); + } + return (sb.toString()); + } + + /** + * Clean up a sentence by consecutive non-alphanumeric chars with a single + * non-alphanumeric char + * + * @param in Array of chars + * @return String + */ + public static String cleanString(char[] in) { + int len = in.length; + boolean prevOK = true; + for (int i = 0; i < len; i++) { + if (Character.isLetterOrDigit(in[i]) || Character.isWhitespace(in[i])) + prevOK = true; + else { + if (!prevOK) + in[i] = ' '; + prevOK = false; + } + } + return (new String(in)); + } + + /** + * Return a clean file name + * + * @param filename + * @return String + */ + public static String parseFile(String filename) { + return (filterChars(filename, "\\/_:.")); + } +} \ No newline at end of file
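
Note on the evaluation output: the accuracy printed at the end of evaluate() is simply the diagonal of the confusion matrix divided by the total document count. A minimal, self-contained Java sketch of that relationship (the class name AccuracySketch and the example counts are illustrative only, not part of this patch):

    // Illustrative only: shows how ConfusionMatrix.totalAccuracy() relates to the
    // raw counts printed by evaluate(): accuracy = diagonal sum / total count.
    public class AccuracySketch {
        static double accuracy(int[][] m) {
            int correct = 0, total = 0;
            for (int i = 0; i < m.length; i++) {
                for (int j = 0; j < m[i].length; j++) {
                    total += m[i][j];               // every classified document
                    if (i == j) correct += m[i][j]; // diagonal = correctly classified
                }
            }
            return total == 0 ? 0.0 : (double) correct / total;
        }

        public static void main(String[] args) {
            // Rows are the true categories, columns the predicted ones,
            // e.g. {government, others}; the counts below are made up.
            int[][] matrix = { { 40, 10 }, { 5, 45 } };
            System.out.println(accuracy(matrix)); // 0.85 (85 correct of 100)
        }
    }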