From 698b998b1597aebc21b2a07c9544bee548cf5225 Mon Sep 17 00:00:00 2001 From: gitclebeg Date: Mon, 13 Apr 2015 17:24:02 +0800 Subject: [PATCH] add new text classification base on tf-idf --- README.md | 7 +- .../cn/it/classification/DfIdfClassier.java | 157 +++++++ ...ngPipeClassier.java => NGramClassier.java} | 7 +- .../cn/it/classification/StringTools.java | 393 ++++++++++++++++++ 4 files changed, 558 insertions(+), 6 deletions(-) create mode 100644 src/main/java/eshore/cn/it/classification/DfIdfClassier.java rename src/main/java/eshore/cn/it/classification/{LingPipeClassier.java => NGramClassier.java} (98%) create mode 100644 src/main/java/eshore/cn/it/classification/StringTools.java diff --git a/README.md b/README.md index a66227e..9c9a5c3 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ ##基于自然语言处理的情感分析工具 ###本程序依赖data目录下面的data.zip和dictionary.zip先解压缩 data 目录下面的 data.zip到当前目录。 +* 新增说明3:增加基于 TF-IDF(词向量) 特征的文本分类程序。 +1. 主程序:DfIdfClassifier -* 新增说明2:增加文本分类程序,目的是找出自己领域相关的文本,然后再从这个领域相关的文本中判断正负面。 - +* 新增说明2:增加基于 N-Gram(词向量) 特征的文本分类程序,目的是找出自己领域相关的文本,然后再从这个领域相关的文本中判断正负面。 1. 测试语料:data/text_classification.zip 解压缩即可 -2. 运行程序:LingPipeClassier 即可。 +2. 运行程序:NGramClassifier 即可。 * 新增说明1:2015-04-10测试了不用中文分词器,分词之后 LingPipe 情感分类的准确率,同时测试了去除停用词之后的情感分类的准确率。 diff --git a/src/main/java/eshore/cn/it/classification/DfIdfClassier.java b/src/main/java/eshore/cn/it/classification/DfIdfClassier.java new file mode 100644 index 0000000..125d6b6 --- /dev/null +++ b/src/main/java/eshore/cn/it/classification/DfIdfClassier.java @@ -0,0 +1,157 @@ +package eshore.cn.it.classification; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.text.NumberFormat; +import java.util.List; + +import com.aliasi.classify.Classification; +import com.aliasi.classify.Classified; +import com.aliasi.classify.ConfusionMatrix; +import com.aliasi.classify.ScoredClassification; +import com.aliasi.classify.ScoredClassifier; +import com.aliasi.classify.TfIdfClassifierTrainer; +import com.aliasi.tokenizer.CharacterTokenizerFactory; +import com.aliasi.tokenizer.TokenFeatureExtractor; +import com.aliasi.util.Files; +import com.hankcs.hanlp.HanLP; +import com.hankcs.hanlp.seg.common.Term; + +/** + * 基于LingPipe的文本分类器,主要分类成两类 + * 一类: 关于政务的 + * 另一类: 非政务的 + * 采用的算法有: + * @author clebeg + * @time 2015-04-13 + * */ +public class DfIdfClassier { + private static String[] CATEGORIES = { + "government", + "others" + }; + + private static String TEXT_CLASSIFICATION_TRAINING = "data/text_classification/training"; + private static String TEXT_CLASSIFICATION_TESTING = "data/text_classification/testing"; + private static String MODEL_FILE = "Model/tfidf_classifier"; + + private static TfIdfClassifierTrainer classifier = new TfIdfClassifierTrainer( + new TokenFeatureExtractor(CharacterTokenizerFactory.INSTANCE)); + + public static void main(String[] args) throws IOException, ClassNotFoundException { + trainModel(true); + evaluate(); + } + public static void trainModel(boolean needStoreModle) throws IOException { + for(int i = 0; i < CATEGORIES.length; ++i) { + File classDir = new File(TEXT_CLASSIFICATION_TRAINING, CATEGORIES[i]); + if (!classDir.isDirectory()) { + String msg = "Could not find training directory=" + + classDir + + "\nHave you unpacked " + + CATEGORIES.length + + "groups?"; + System.out.println(msg); // in case exception gets lost in shell + throw new IllegalArgumentException(msg); + } + + String[] 
trainingFiles = classDir.list(); + for (int j = 0; j < trainingFiles.length; ++j) { + File file = new File(classDir, trainingFiles[j]); + String text = Files.readFromFile(file, "GBK"); + System.out.println("Training on " + CATEGORIES[i] + "/" + trainingFiles[j]); + + String segWords = ""; + List terms = HanLP.segment(text); + for (Term term : terms) + segWords += term.word + " "; + + Classification classification + = new Classification(CATEGORIES[i]); + Classified classified + = new Classified(segWords, classification); + classifier.handle(classified); + } + } + if (needStoreModle) { + System.out.println("开始保存分类器到 " + MODEL_FILE); + ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream( + MODEL_FILE)); + classifier.compileTo(os); + os.close(); + System.out.println("分类器保存完成"); + } + } + + @SuppressWarnings("unchecked") + public static void evaluate() throws IOException, ClassNotFoundException { + ScoredClassifier compiledClassifier = null; + try { + ObjectInputStream oi = new ObjectInputStream(new FileInputStream( + MODEL_FILE)); + compiledClassifier = (ScoredClassifier) oi + .readObject(); + oi.close(); + } catch (IOException ie) { + System.out.println("IO Error: Model file " + MODEL_FILE + " missing"); + } + // 遍历分类目录中的文件测试分类准确度 + ConfusionMatrix confMatrix = new ConfusionMatrix(CATEGORIES); + NumberFormat nf = NumberFormat.getInstance(); + nf.setMaximumIntegerDigits(1); + nf.setMaximumFractionDigits(3); + for(int i = 0; i < CATEGORIES.length; ++i) { + File classDir = new File(TEXT_CLASSIFICATION_TESTING, CATEGORIES[i]); + String[] testingFiles = classDir.list(); + for (int j = 0; j < testingFiles.length; ++j) { + String text + = Files.readFromFile(new File(classDir, testingFiles[j]),"GBK"); + String segWords = ""; + List terms = HanLP.segment(text); + for (Term term : terms) + segWords += term.word + " "; + + System.out.println("测试 " + CATEGORIES[i] + + File.separator + testingFiles[j]); + + ScoredClassification classification = compiledClassifier + .classify(segWords.subSequence(0, text.length())); + confMatrix.increment(CATEGORIES[i], + classification.bestCategory()); + System.out.println("最适合的分类: " + + classification.bestCategory()); + } + } + + System.out.println("--------------------------------------------"); + System.out.println("- 结果 "); + System.out.println("--------------------------------------------"); + int[][] imatrix = confMatrix.matrix(); + StringBuffer sb = new StringBuffer(); + sb.append(StringTools.fillin("CATEGORY", 10, true, ' ')); + for (int i = 0; i < CATEGORIES.length; i++) + sb.append(StringTools.fillin(CATEGORIES[i], 8, false, ' ')); + System.out.println(sb.toString()); + + for (int i = 0; i < imatrix.length; i++) { + sb = new StringBuffer(); + sb.append(StringTools.fillin(CATEGORIES[i], 10, true, ' ', + 10 - CATEGORIES[i].length())); + for (int j = 0; j < imatrix.length; j++) { + String out = "" + imatrix[i][j]; + sb.append(StringTools.fillin(out, 8, false, ' ', + 8 - out.length())); + } + System.out.println(sb.toString()); + } + + System.out.println("准确度: " + + nf.format(confMatrix.totalAccuracy())); + System.out.println("总共正确数 : " + confMatrix.totalCorrect()); + System.out.println("总数:" + confMatrix.totalCount()); + } +} diff --git a/src/main/java/eshore/cn/it/classification/LingPipeClassier.java b/src/main/java/eshore/cn/it/classification/NGramClassier.java similarity index 98% rename from src/main/java/eshore/cn/it/classification/LingPipeClassier.java rename to src/main/java/eshore/cn/it/classification/NGramClassier.java index b7bfaae..44e54e6 
100644 --- a/src/main/java/eshore/cn/it/classification/LingPipeClassier.java +++ b/src/main/java/eshore/cn/it/classification/NGramClassier.java @@ -25,12 +25,12 @@ import com.hankcs.hanlp.seg.common.Term; * @author clebeg * @time 2015-04-13 * */ -public class LingPipeClassier { +public class NGramClassier { private static String[] CATEGORIES = { "government", "others" }; - private static int NGRAM_SIZE = 2; + private static int NGRAM_SIZE = 3; private static String TEXT_CLASSIFICATION_TRAINING = "data/text_classification/training"; private static String TEXT_CLASSIFICATION_TESTING = "data/text_classification/testing"; @@ -38,6 +38,7 @@ public class LingPipeClassier { private static DynamicLMClassifier classifier = DynamicLMClassifier.createNGramProcess(CATEGORIES, NGRAM_SIZE); + public static void main(String[] args) throws IOException, ClassNotFoundException { trainModel(); evaluate(); @@ -93,7 +94,7 @@ public class LingPipeClassier { String[] testingFiles = classDir.list(); for (int j=0; j < testingFiles.length; ++j) { String text - = Files.readFromFile(new File(classDir,testingFiles[j]),"ISO-8859-1"); + = Files.readFromFile(new File(classDir,testingFiles[j]),"GBK"); System.out.print("Testing on " + CATEGORIES[i] + "/" + testingFiles[j] + " "); Classification classification = new Classification(CATEGORIES[i]); diff --git a/src/main/java/eshore/cn/it/classification/StringTools.java b/src/main/java/eshore/cn/it/classification/StringTools.java new file mode 100644 index 0000000..0184ec5 --- /dev/null +++ b/src/main/java/eshore/cn/it/classification/StringTools.java @@ -0,0 +1,393 @@ +package eshore.cn.it.classification; +import java.io.IOException; +import java.io.Reader; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.aliasi.spell.EditDistance; + +/** + * A class containing a bunch of string utilities -
+ * a. filterChars: Remove extraneous characters from a string and return a
+ * "clean" string.
+ * b. getSuffix: Given a file name, return its extension.
+ * c. fillin: Pad or truncate a string to a fixed number of characters.
+ * d. removeAmpersandStrings: Remove HTML entity strings that start with an ampersand.
+ * e. shaDigest: Compute the 40-character hex SHA-1 digest of a string.
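+ * Note that in the five-argument form of fillin it is numFills, not len, that
+ * controls the amount of padding: fillin("cat", 8, true, ' ', 5) appends exactly
+ * five spaces, giving "cat     ".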
+ */ +public class StringTools { + public static final Locale LOCALE = new Locale("en"); + // * -- String limit for StringTools + private static int STRING_TOOLS_LIMIT = 1000000; + // *-- pre-compiled RE patterns + private static Pattern extPattern = Pattern.compile("^.*[.](.*?){1}quot;"); + private static Pattern spacesPattern = Pattern.compile("\\s+"); + private static Pattern removeAmpersandPattern = Pattern.compile("&[^;]*?;"); + + /** + * Removes non-printable spaces and replaces with a single space + * + * @param in + * String with mixed characters + * @return String with collapsed spaces and printable characters + */ + public static String filterChars(String in) { + return (filterChars(in, "", ' ', true)); + } + + public static String filterChars(String in, boolean newLine) { + return (filterChars(in, "", ' ', newLine)); + } + + public static String filterChars(String in, String badChars) { + return (filterChars(in, badChars, ' ', true)); + } + + public static String filterChars(String in, char replaceChar) { + return (filterChars(in, "", replaceChar, true)); + } + + public static String filterChars(String in, String badChars, + char replaceChar, boolean newLine) { + if (in == null) + return ""; + int inLen = in.length(); + if (inLen > STRING_TOOLS_LIMIT) + return in; + try { + // **-- replace non-recognizable characters with spaces + StringBuffer out = new StringBuffer(); + int badLen = badChars.length(); + for (int i = 0; i < inLen; i++) { + char ch = in.charAt(i); + if ((badLen != 0) && removeChar(ch, badChars)) { + ch = replaceChar; + } else if (!Character.isDefined(ch) && !Character.isSpaceChar(ch)) { + ch = replaceChar; + } + out.append(ch); + } + + // *-- replace new lines with space + Matcher matcher = null; + in = out.toString(); + + // *-- replace consecutive spaces with single space and remove + // leading/trailing spaces + in = in.trim(); + matcher = spacesPattern.matcher(in); + in = matcher.replaceAll(" "); + } catch (OutOfMemoryError e) { + return in; + } + + return in; + } + + // *-- remove any chars found in the badChars string + private static boolean removeChar(char ch, String badChars) { + if (badChars.length() == 0) + return false; + for (int i = 0; i < badChars.length(); i++) { + if (ch == badChars.charAt(i)) + return true; + } + return false; + } + + /** + * Return the extension of a file, if possible. 
+ * + * @param filename + * @return string + */ + public static String getSuffix(String filename) { + if (filename.length() > STRING_TOOLS_LIMIT) + return (""); + Matcher matcher = extPattern.matcher(filename); + if (!matcher.matches()) + return ""; + return (matcher.group(1).toLowerCase(LOCALE)); + } + + public static String fillin(String in, int len) { + return fillin(in, len, true, ' ', 3); + } + + public static String fillin(String in, int len, char fillinChar) { + return fillin(in, len, true, fillinChar, 3); + } + + public static String fillin(String in, int len, boolean right) { + return fillin(in, len, right, ' ', 3); + } + + public static String fillin(String in, int len, boolean right, char fillinChar) { + return fillin(in, len, right, fillinChar, 3); + } + + /** + * Return a string concatenated or padded to the specified length + * + * @param in + * string to be truncated or padded + * @param len + * int length for string + * @param right + * boolean fillin from the left or right + * @param fillinChar + * char to pad the string + * @param numFills + * int number of characters to pad + * @return String of specified length + */ + public static String fillin(String in, int len, boolean right, + char fillinChar, int numFills) { + // *-- return if string is of required length + int slen = in.length(); + if ((slen == len) || (slen > STRING_TOOLS_LIMIT)) + return (in); + + // *-- build the fillin string + StringBuffer fillinStb = new StringBuffer(); + for (int i = 0; i < numFills; i++) + fillinStb.append(fillinChar); + String fillinString = fillinStb.toString(); + + // *-- truncate and pad string if length exceeds required length + if (slen > len) { + if (right) + return (in.substring(0, len - numFills) + fillinString); + else + return (fillinString + in.substring(slen - len + numFills, slen)); + } + + // *-- pad string if length is less than required length DatabaseEntry + // dbe = dbt.getNextKey(); String dbkey = new String (dbe.getData()); + StringBuffer sb = new StringBuffer(); + if (right) + sb.append(in); + sb.append(fillinString); + if (!right) + sb.append(in); + return (sb.toString()); + } + + /** + * Remove ampersand strings such as \  + * + * @param in + * Text string extracted from Web pages + * @return String Text string without ampersand strings + */ + public static String removeAmpersandStrings(String in) { + if (in.length() > STRING_TOOLS_LIMIT) + return (in); + Matcher matcher = removeAmpersandPattern.matcher(in); + return (matcher.replaceAll("")); + } + + /** + * Escape back slashes + * + * @param in + * Text to be escaped + * @return String Escaped test + */ + public static String escapeText(String in) { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < in.length(); i++) { + char ch = in.charAt(i); + if (ch == '\\') + sb.append("\\\\"); + else + sb.append(ch); + } + return (sb.toString()); + } + + /** + * Get the SHA signature of a string + * + * @param in + * String + * @return String SHA signature of in + */ + public static String shaDigest(String in) { + StringBuffer out = new StringBuffer(); + if ((in == null) || (in.length() == 0)) + return (""); + try { + // *-- create a message digest instance and compute the hash + // byte array + MessageDigest md = MessageDigest.getInstance("SHA-1"); + md.reset(); + md.update(in.getBytes()); + byte[] hash = md.digest(); + + // *--- Convert the hash byte array to hexadecimal format, pad + // hex chars with leading zeroes + // *--- to get a signature of consistent length (40) for all + // strings. 
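+ // *--- For example, a hash byte of 0x0a converts to the single hex digit "a";
+ // fillin("a", 2, false, '0', 1) prepends one '0' to give "0a", so each of the
+ // 20 SHA-1 bytes contributes exactly two hex characters to the 40-character digest.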
+ for (int i = 0; i < hash.length; i++) { + out.append(fillin(Integer.toString(0xFF & hash[i], 16), 2, false, '0', + 1)); + } + } catch (OutOfMemoryError e) { + return ("<-------------OUT_OF_MEMORY------------>"); + } catch (NoSuchAlgorithmException e) { + return ("<------SHA digest algorithm not found--->"); + } + + return (out.toString()); + } + + /** + * Return the string with the first letter upper cased + * + * @param in + * @return String + */ + public static String firstLetterUC(String in) { + if ((in == null) || (in.length() == 0)) + return (""); + String out = in.toLowerCase(LOCALE); + String part1 = out.substring(0, 1); + String part2 = out.substring(1, in.length()); + return (part1.toUpperCase(LOCALE) + part2.toLowerCase(LOCALE)); + } + + /** + * Return a pattern that can be used to collapse consecutive patterns of the + * same type + * + * @param entityTypes + * A list of entity types + * @return Regex pattern for the entity types + */ + public static Pattern getCollapsePattern(String[] entityTypes) { + Pattern collapsePattern = null; + StringBuffer collapseStr = new StringBuffer(); + for (int i = 0; i < entityTypes.length; i++) { + collapseStr.append("(<\\/"); + collapseStr.append(entityTypes[i]); + collapseStr.append(">\\s+"); + collapseStr.append("<"); + collapseStr.append(entityTypes[i]); + collapseStr.append(">)|"); + } + collapsePattern = Pattern.compile(collapseStr.toString().substring(0, + collapseStr.length() - 1)); + return (collapsePattern); + } + + /** + * return a double that indicates the degree of similarity between two strings + * Use the Jaccard similarity, i.e. the ratio of A intersection B to A union B + * + * @param first + * string + * @param second + * string + * @return double degreee of similarity + */ + public static double stringSimilarity(String first, String second) { + if ((first == null) || (second == null)) + return (0.0); + String[] a = first.split("\\s+"); + String[] b = second.split("\\s+"); + + // *-- compute a union b + HashSet aUnionb = new HashSet(); + HashSet aTokens = new HashSet(); + HashSet bTokens = new HashSet(); + for (int i = 0; i < a.length; i++) { + aUnionb.add(a[i]); + aTokens.add(a[i]); + } + for (int i = 0; i < b.length; i++) { + aUnionb.add(b[i]); + bTokens.add(b[i]); + } + int sizeAunionB = aUnionb.size(); + + // *-- compute a intersect b + Iterator iter = aUnionb.iterator(); + int sizeAinterB = 0; + while (iter != null && iter.hasNext()) { + String token = (String) iter.next(); + if (aTokens.contains(token) && bTokens.contains(token)) + sizeAinterB++; + } + return ((sizeAunionB > 0) ? 
(sizeAinterB + 0.0) / sizeAunionB : 0.0); + } + + /** + * Return the edit distance between the two strings + * + * @param s1 + * @param s2 + * @return double + */ + public static double editDistance(String s1, String s2) { + if ((s1.length() == 0) || (s2.length() == 0)) + return (0.0); + return EditDistance.editDistance(s1.subSequence(0, s1.length()), s2 + .subSequence(0, s2.length()), false); + } + + /** + * Return a string with the contents from the passed reader + * + * @param r Reader + * @return String + */ + public static String readerToString(Reader r) { + int charValue; + StringBuffer sb = new StringBuffer(1024); + try { + while ((charValue = r.read()) != -1) + sb.append((char) charValue); + } catch (IOException ie) { + sb.setLength(0); + } + return (sb.toString()); + } + + /** + * Clean up a sentence by consecutive non-alphanumeric chars with a single + * non-alphanumeric char + * + * @param in Array of chars + * @return String + */ + public static String cleanString(char[] in) { + int len = in.length; + boolean prevOK = true; + for (int i = 0; i < len; i++) { + if (Character.isLetterOrDigit(in[i]) || Character.isWhitespace(in[i])) + prevOK = true; + else { + if (!prevOK) + in[i] = ' '; + prevOK = false; + } + } + return (new String(in)); + } + + /** + * Return a clean file name + * + * @param filename + * @return String + */ + public static String parseFile(String filename) { + return (filterChars(filename, "\\/_:.")); + } +} \ No newline at end of file
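
Note on the evaluation output: the accuracy printed at the end of evaluate() is simply the diagonal of the confusion matrix divided by the total document count. A minimal, self-contained Java sketch of that relationship (the class name AccuracySketch and the example counts are illustrative only, not part of this patch):

    // Illustrative only: shows how ConfusionMatrix.totalAccuracy() relates to the
    // raw counts printed by evaluate(): accuracy = diagonal sum / total count.
    public class AccuracySketch {
        static double accuracy(int[][] m) {
            int correct = 0, total = 0;
            for (int i = 0; i < m.length; i++) {
                for (int j = 0; j < m[i].length; j++) {
                    total += m[i][j];               // every classified document
                    if (i == j) correct += m[i][j]; // diagonal = correctly classified
                }
            }
            return total == 0 ? 0.0 : (double) correct / total;
        }

        public static void main(String[] args) {
            // Rows are the true categories, columns the predicted ones,
            // e.g. {government, others}; the counts below are made up.
            int[][] matrix = { { 40, 10 }, { 5, 45 } };
            System.out.println(accuracy(matrix)); // 0.85 (85 correct of 100)
        }
    }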