@@ -0,0 +1,7 @@
.project
.classpath
target/
Result/
Model/
.settings/
@@ -0,0 +1,15 @@
A sentiment-analysis tool based on natural language processing

This program depends on data/data.zip: first unzip data.zip inside the data directory.

1. Sentiment analysis based on a dictionary and a Bayes model
   Main program: eshore.cn.it.sentiment.Sentiment. This class builds a model from the
   positive and negative words in data/Sentiment_Dictionary.
   Test: eshore.cn.it.sentiment.SentimentTest. This class classifies the sentiment of the
   blog posts in one of the folders under data/500trainblogxml.
2. Sentiment analysis using LingPipe's polarity module directly
   Run eshore.cn.it.sentiment.ChinesePolarityBasic. The program trains on
   data/polarity_corpus/hotel_reviews/train2, then automatically tests on
   data/polarity_corpus/hotel_reviews/test2 and finally prints the test results.
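
To run the LingPipe demo from the command line (a sketch, not committed configuration: the
pom does not declare the exec-maven-plugin, and classpathScope=compile is needed because
the LingPipe jar is a system-scoped dependency):

mvn compile
mvn exec:java -Dexec.mainClass=eshore.cn.it.sentiment.ChinesePolarityBasic -Dexec.classpathScope=compile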
@@ -0,0 +1,5 @@
500trainblogxml/
nerws_corpus/
output/
polarity_corpus/
Sentiment_Dictionary/
| @@ -0,0 +1,58 @@ | |||
| <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |||
| xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |||
| <modelVersion>4.0.0</modelVersion> | |||
| <groupId>eshore.cn.it</groupId> | |||
| <artifactId>nlp-sentiment</artifactId> | |||
| <version>0.0.1-SNAPSHOT</version> | |||
| <packaging>jar</packaging> | |||
| <name>nlp-sentiment</name> | |||
| <url>http://maven.apache.org</url> | |||
| <properties> | |||
| <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |||
| <commons.io.version>2.4</commons.io.version> | |||
| <dom4j.version>1.6.1</dom4j.version> | |||
| <lingpipe.version>4.1.0</lingpipe.version> | |||
| <jieba.version>1.0.0</jieba.version> | |||
| <!-- 无法到中央仓库下载的jar包就集中存放到这个位置 --> | |||
| <maven.libs.home>F:/java_git_projects/nlp-sentiment/libs</maven.libs.home> | |||
| </properties> | |||
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>${commons.io.version}</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>${dom4j.version}</version>
        </dependency>
        <!-- the LingPipe core jar must be downloaded manually from the LingPipe website -->
        <dependency>
            <groupId>com.aliasi</groupId>
            <artifactId>lingpipe</artifactId>
            <version>${lingpipe.version}</version>
            <scope>system</scope>
            <systemPath>${maven.libs.home}/lingpipe-4.1.0.jar</systemPath>
        </dependency>
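        <!-- note: with system scope Maven never downloads this jar; the build fails unless
             lingpipe-4.1.0.jar already sits at the systemPath above -->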
        <!-- the jieba Chinese word segmenter -->
        <dependency>
            <groupId>com.huaban</groupId>
            <artifactId>jieba-analysis</artifactId>
            <version>${jieba.version}</version>
        </dependency>
    </dependencies>
</project>
@@ -0,0 +1,146 @@
package eshore.cn.it.sentiment;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;

import org.apache.commons.io.IOUtils;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classified;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.Files;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

/**
 * ChinesePolarityBasic demonstrates Chinese sentiment prediction with LingPipe.
 * LingPipe's dynamic classifiers lend themselves to incremental training.
 * @author clebeg 2015-03-13
 * @version 0.0.1
 * */
public class ChinesePolarityBasic {
    private String[] mCategories = new String[]{"+1", "-1"};
    // the classification model
    private DynamicLMClassifier<NGramProcessLM> mClassifier;
    private int numTests = 0;
    private int numCorrect = 0;
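    // each line of a .rlabelclass file is assumed to hold "<filename> <label>" with the
    // label being "+1" or "-1" (inferred from the split(" ") parsing in train()/evaluate())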
    private static final String TRAINFILES_INFO =
            "data/polarity_corpus/hotel_reviews/train2.rlabelclass";
    private static final String TRAINFILES_DIR =
            "data/polarity_corpus/hotel_reviews/train2";
    private static final String TESTFILES_DIR =
            "data/polarity_corpus/hotel_reviews/test2";
    private static final String TESTFILES_INFO =
            "data/polarity_corpus/hotel_reviews/test2.rlabelclass";
    private static final String ENCODING = "GBK";
    private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter();
    private final SegMode segMode = SegMode.INDEX;
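    // per the jieba-analysis docs, SegMode.INDEX segments for document indexing
    // (finer-grained, overlapping tokens) while SegMode.SEARCH targets query segmentation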
    public static void main(String[] args) {
        try {
            new ChinesePolarityBasic().run();
        } catch (Throwable t) {
            System.out.println("Thrown: " + t);
            t.printStackTrace(System.out);
        }
    }

    public ChinesePolarityBasic() {
        super();
        int nGram = 8;
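        // one 8-character n-gram language model per category; the same order the
        // LingPipe polarity demo uses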
        mClassifier
            = DynamicLMClassifier
                .createNGramProcess(mCategories, nGram);
    }

    private void run() throws ClassNotFoundException,
                              IOException {
        train();
        evaluate();
    }
    private void train() throws IOException {
        FileReader input = new FileReader(new File(TRAINFILES_INFO));
        List<String> trainInfos = IOUtils.readLines(input);
        input.close();
        for (String str : trainInfos) {
            String[] train = str.split(" ");
            train(train[1], new File(TRAINFILES_DIR, train[0]), ENCODING);
        }
    }
    private void evaluate() throws IOException {
        FileReader input = new FileReader(new File(TESTFILES_INFO));
        List<String> testInfos = IOUtils.readLines(input);
        input.close();
        for (String str : testInfos) {
            String[] test = str.split(" ");
            evaluate(test[1], new File(TESTFILES_DIR, test[0]), ENCODING);
        }
        System.out.println("  # Test Cases=" + numTests);
        System.out.println("  # Correct=" + numCorrect);
        System.out.println("  % Correct="
                + ((double) numCorrect) / (double) numTests);
    }
    /**
     * Given a category label, a training file and its encoding, train the classifier;
     * the example is folded into the model as soon as it is handled.
     * @throws IOException
     * */
    private void train(String category, File trainFile, String fileEncoding)
            throws IOException {
        Classification classification = new Classification(category);
        String review = Files.readFromFile(trainFile, fileEncoding);
        // run the Chinese segmenter and rebuild the text as space-separated tokens
        List<SegToken> segTokens = jiebaSegmenter.process(review, segMode);
        StringBuilder tokenized = new StringBuilder();
        for (SegToken seg : segTokens) {
            tokenized.append(seg.word).append(' '); // SegToken.word holds the token text
        }
        review = tokenized.toString();
        Classified<CharSequence> classified
            = new Classified<CharSequence>(review, classification);
        mClassifier.handle(classified);
    }
    /**
     * Given the expected category, a test file and its encoding, evaluate the model.
     * @throws IOException
     * */
    private void evaluate(String category, File testFile, String fileEncoding)
            throws IOException {
        String review
            = Files.readFromFile(testFile, fileEncoding);
        // segment the test text the same way, so training and testing share one representation
        List<SegToken> segTokens = jiebaSegmenter.process(review, segMode);
        StringBuilder tokenized = new StringBuilder();
        for (SegToken seg : segTokens) {
            tokenized.append(seg.word).append(' ');
        }
        review = tokenized.toString();
        ++numTests;
        Classification classification
            = mClassifier.classify(review);
        // read off the predicted category
        String resultCategory
            = classification.bestCategory();
        if (resultCategory.equals(category))
            ++numCorrect;
    }
}
@@ -0,0 +1,94 @@
package eshore.cn.it.sentiment;

import java.io.File;
import java.io.IOException;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classified;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.Files;

public class PolarityBasic {
    File mPolarityDir;
    String[] mCategories;
    DynamicLMClassifier<NGramProcessLM> mClassifier;

    public PolarityBasic(String[] args) {
        mPolarityDir = new File("data/polarity_corpus", "txt_sentoken");
        mCategories = mPolarityDir.list();
        int nGram = 8;
        mClassifier
            = DynamicLMClassifier
                .createNGramProcess(mCategories, nGram);
    }

    public static void main(String[] args) {
        try {
            new PolarityBasic(args).run();
        } catch (Throwable t) {
            System.out.println("Thrown: " + t);
            t.printStackTrace(System.out);
        }
    }

    private void run() throws ClassNotFoundException,
                              IOException {
        train();
        evaluate();
    }

    private void train() throws IOException {
        for (int i = 0; i < mCategories.length; ++i) {
            String category = mCategories[i];
            Classification classification
                = new Classification(category);
            File dir = new File(mPolarityDir, mCategories[i]);
            File[] trainFiles = dir.listFiles();
            for (int j = 0; j < trainFiles.length; ++j) {
                File trainFile = trainFiles[j];
                if (isTrainingFile(trainFile)) {
                    String review
                        = Files.readFromFile(trainFile, "ISO-8859-1");
                    Classified<CharSequence> classified
                        = new Classified<CharSequence>(review, classification);
                    mClassifier.handle(classified);
                }
            }
        }
    }
    boolean isTrainingFile(File file) {
        // corpus files are named cvNNN_*; hold fold 9 (cv9xx) out for testing
        return file.getName().charAt(2) != '9';
    }
    void evaluate() throws IOException {
        int numTests = 0;
        int numCorrect = 0;
        for (int i = 0; i < mCategories.length; ++i) {
            String category = mCategories[i];
            File file = new File(mPolarityDir, mCategories[i]);
            File[] testFiles = file.listFiles();
            for (int j = 0; j < testFiles.length; ++j) {
                File testFile = testFiles[j];
                if (!isTrainingFile(testFile)) {
                    String review
                        = Files.readFromFile(testFile, "ISO-8859-1");
                    ++numTests;
                    Classification classification
                        = mClassifier.classify(review);
                    String resultCategory
                        = classification.bestCategory();
                    if (resultCategory.equals(category))
                        ++numCorrect;
                }
            }
        }
        System.out.println("  # Test Cases=" + numTests);
        System.out.println("  # Correct=" + numCorrect);
        System.out.println("  % Correct="
                + ((double) numCorrect) / (double) numTests);
    }
}
@@ -0,0 +1,211 @@
package eshore.cn.it.sentiment;

import java.io.*;
import java.util.*;

import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class Sentiment {
    static private HashSet<String> Negative, Positive; // the two sentiment dictionaries
    static private Integer NegativeDoc, PositiveDoc, UnsureDoc; // document count per class - persisted with the model
    static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; // word counts per class - persisted with the model
    static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/";
    static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt";
    static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt";
    static final String FILE_ENCODING = "UTF-8";
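    // the training directory is expected to contain the subfolders negativeout/,
    // positiveout/ and unsureout/ (see Sentiment_Doc_Weight below)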
    public static void main(String[] args) throws Exception {
        Sentiment sentiment = new Sentiment();
        sentiment.Model();
        sentiment.Save_Model();
    }

    public void Model() throws Exception {
        this.Read_Sentiment_Dictionary();
        this.Sentiment_Doc_Weight(SENTIMENT_DOC_WEIGHT_PATH);
    }
| @SuppressWarnings("resource") | |||
| public void Read_Sentiment_Dictionary() throws Exception { | |||
| BufferedReader buf; | |||
| String str; | |||
| //集合,里面元素不允许重复 | |||
| Negative = new HashSet<String>(); | |||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||
| while( (str = buf.readLine()) != null ) { | |||
| Negative.add(str); | |||
| } | |||
| Positive = new HashSet<String>(); | |||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||
| while( (str = buf.readLine()) != null ) { | |||
| Positive.add(str); | |||
| } | |||
| } | |||
    public void Sentiment_Doc_Weight(String DirPath) throws Exception {
        File NegativeDir = new File(DirPath + "negativeout");
        String[] NegativeFiles = NegativeDir.list();
        NegativeDoc = NegativeFiles.length;
        ArrayList<String> NegativeCurrentList = new ArrayList<String>();
        for (int i = 0; i < NegativeFiles.length; i++) {
            System.out.println("NegativeFiles No." + (i + 1) + " " + DirPath + "negativeout/" + NegativeFiles[i]);
            this.ReadXML(DirPath + "negativeout/" + NegativeFiles[i], NegativeCurrentList);
        }
        NegativeWeight = HashTable(NegativeCurrentList);
        /*****************************************************************/
        File PositiveDir = new File(DirPath + "positiveout");
        String[] PositiveFiles = PositiveDir.list();
        PositiveDoc = PositiveFiles.length;
        ArrayList<String> PositiveCurrentList = new ArrayList<String>();
        for (int i = 0; i < PositiveFiles.length; i++) {
            System.out.println("PositiveFiles No." + (i + 1) + " " + DirPath + "positiveout/" + PositiveFiles[i]);
            this.ReadXML(DirPath + "positiveout/" + PositiveFiles[i], PositiveCurrentList);
        }
        PositiveWeight = HashTable(PositiveCurrentList);
        /*****************************************************************/
        File UnsureDir = new File(DirPath + "unsureout");
        String[] UnsureFiles = UnsureDir.list();
        UnsureDoc = UnsureFiles.length;
        ArrayList<String> UnsureCurrentList = new ArrayList<String>();
        for (int i = 0; i < UnsureFiles.length; i++) {
            System.out.println("UnsureFiles No." + (i + 1) + " " + DirPath + "unsureout/" + UnsureFiles[i]);
            this.ReadXML(DirPath + "unsureout/" + UnsureFiles[i], UnsureCurrentList);
        }
        UnsureWeight = HashTable(UnsureCurrentList);
        /*****************************************************************/
        System.out.println("UnsureCurrent = " + UnsureCurrentList.size() + " UnsureHashTable = " + UnsureWeight.size());
        System.out.println("PositiveCurrent = " + PositiveCurrentList.size() + " PositiveHashTable = " + PositiveWeight.size());
        System.out.println("NegativeCurrent = " + NegativeCurrentList.size() + " NegativeHashTable = " + NegativeWeight.size());
        System.out.println("NegativeDoc = " + NegativeDoc + " PositiveDoc = " + PositiveDoc + " UnsureDoc = " + UnsureDoc);
    }
    // read the XML file at the given path and collect its sentiment words into currentList
    public void ReadXML(String FilePath, ArrayList<String> currentList) throws Exception {
        SAXReader SaxReader = new SAXReader();
        Document Doc = SaxReader.read(new File(FilePath));
        Element root = Doc.getRootElement();
        Element content = root.element("content");
        List<?> sentenses = content.elements("sentence"); // one element per sentence
        for (Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) {
            Element sentense = (Element) iter.next();
            List<?> toks = sentense.elements();
            for (Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) {
                Element tok = (Element) iter1.next();
                String Type = tok.attributeValue("type");
                if (Type.equals("group")) { // a top-level "atom" can never be a sentiment word
                    GetWord(tok, currentList); // extract the words inside the "group"
                }
            }
        }
    }
    // collect the sentiment words inside an XML "group" element
    public void GetWord(Element root, ArrayList<String> currentList) {
        String Word = "";
        List<?> elements = root.elements("tok");
        for (Iterator<?> iter = elements.iterator(); iter.hasNext(); ) {
            Element tok = (Element) iter.next();
            String Type = tok.attributeValue("type");
            if (Type.compareTo("atom") == 0) {
                Word += tok.getText().trim();
            }
            else {
                GetWord(tok, currentList);
            }
        }
        if (Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word))) { // keep only dictionary words
            currentList.add(Word);
        }
    }
    // build a word-frequency table from the sentiment words of a text
    public Hashtable<String, Integer> HashTable(ArrayList<String> currentList) {
        Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>();
        for (Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) {
            String Word = (String) iter.next();
            if (HashTable.containsKey(Word)) {
                Integer Weight = HashTable.get(Word);
                HashTable.put(Word, Weight + 1);
            }
            else {
                HashTable.put(Word, 1);
            }
        }
        return HashTable;
    }
| @SuppressWarnings("resource") | |||
| public void Save_Model( ) throws Exception { | |||
| ObjectOutputStream OOS; | |||
| File ModelPath = new File("Model"); | |||
| File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||
| File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||
| File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||
| if ( !ModelPath.exists() ) { ModelPath.mkdir(); } | |||
| System.out.println("Saving NegativeModel..."); | |||
| OOS = new ObjectOutputStream( new FileOutputStream( NegativeModel ) ); //对象流直接写入 | |||
| OOS.writeObject(NegativeDoc); | |||
| OOS.writeObject(NegativeWeight); | |||
| System.out.println("Saving PositiveModel..."); | |||
| OOS = new ObjectOutputStream( new FileOutputStream( PositiveModel ) ); | |||
| OOS.writeObject(PositiveDoc); | |||
| OOS.writeObject(PositiveWeight); | |||
| System.out.println("Saving UnsureModel..."); | |||
| OOS = new ObjectOutputStream( new FileOutputStream( UnsureModel ) ); | |||
| OOS.writeObject(UnsureDoc); | |||
| OOS.writeObject(UnsureWeight); | |||
| Enumeration<String> Keys; | |||
| System.out.println("Saving NegativeWeight..."); | |||
| Keys = NegativeWeight.keys(); | |||
| while( Keys.hasMoreElements() ) { | |||
| String Key = Keys.nextElement(); | |||
| FileUtils.writeStringToFile(new File("Model", "NegativeWeight.txt"), Key+"\t\t\t"+NegativeWeight.get(Key)+"\r\n", "UTF-8", true); | |||
| } | |||
| System.out.println("Saving PositiveWeight..."); | |||
| Keys = PositiveWeight.keys(); | |||
| while( Keys.hasMoreElements() ) { | |||
| String Key = Keys.nextElement(); | |||
| FileUtils.writeStringToFile(new File("Model", "PositiveWeight.txt"), Key+"\t\t\t"+PositiveWeight.get(Key)+"\r\n", "UTF-8", true); | |||
| } | |||
| System.out.println("Saving UnsureWeight..."); | |||
| Keys = UnsureWeight.keys(); | |||
| while( Keys.hasMoreElements() ) { | |||
| String Key = Keys.nextElement(); | |||
| FileUtils.writeStringToFile(new File("Model", "UnsureWeight.txt"), Key+"\t\t\t"+UnsureWeight.get(Key)+"\r\n", "UTF-8", true); | |||
| } | |||
| System.out.println("Save Success!"); | |||
| } | |||
| } | |||
@@ -0,0 +1,244 @@
package eshore.cn.it.sentiment;

import java.io.*;
import java.util.*;

import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class SentimentTest {
    static private HashSet<String> Negative, Positive; // the two sentiment dictionaries
    static private Integer NegativeDoc, PositiveDoc, UnsureDoc; // document count per class - read back from the saved model
    static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; // word counts per class - read back from the saved model
    static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/negativeout/";
    static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt";
    static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt";
    static final String FILE_ENCODING = "UTF-8";
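    // SENTIMENT_DOC_WEIGHT_PATH points at one sample input folder; classified copies land
    // in Result/Positive, Result/Negative and Result/Unsure (see Classify_Directory below)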
    public static void main(String[] args) throws Exception {
        SentimentTest sentimentTest = new SentimentTest();
        sentimentTest.Read_Model(); // load the model saved by Sentiment
        sentimentTest.Classify_Directory(SENTIMENT_DOC_WEIGHT_PATH);
    }
| @SuppressWarnings({ "resource", "unchecked" }) | |||
| public void Read_Model() throws Exception { | |||
| this.Read_Sentiment_Dictionary(); | |||
| ObjectInputStream OIS; //对象流直接读入 | |||
| File ModelPath = new File("Model"); | |||
| File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||
| File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||
| File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||
| System.out.println("Reading NegativeModel..."); | |||
| OIS = new ObjectInputStream( new FileInputStream( NegativeModel ) ); | |||
| NegativeDoc = (Integer) OIS.readObject(); | |||
| NegativeWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||
| System.out.println("Reading PositiveModel..."); | |||
| OIS = new ObjectInputStream( new FileInputStream( PositiveModel ) ); | |||
| PositiveDoc = (Integer) OIS.readObject(); | |||
| PositiveWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||
| System.out.println("Reading UnsureModel..."); | |||
| OIS = new ObjectInputStream( new FileInputStream( UnsureModel ) ); | |||
| UnsureDoc = (Integer) OIS.readObject(); | |||
| UnsureWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||
| System.out.println("Read Success."); | |||
| } | |||
| @SuppressWarnings("resource") | |||
| public void Read_Sentiment_Dictionary( ) throws Exception { //读入情感词典 | |||
| BufferedReader buf; | |||
| String str; | |||
| Negative = new HashSet<String>(); | |||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||
| while( (str = buf.readLine()) != null ) { | |||
| Negative.add(str); | |||
| } | |||
| Positive = new HashSet<String>(); | |||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||
| while( (str = buf.readLine()) != null ) { | |||
| Positive.add(str); | |||
| } | |||
| } | |||
    public void Classify_Directory(String DirectoryPath) throws Exception {
        int PositiveNum = 0, NegativeNum = 0, UnsureNum = 0;
        String[] Text_Path = new File(DirectoryPath).list();
        for (int i = 0; i < Text_Path.length; i++) {
            double Ans = Classify(DirectoryPath + Text_Path[i]); // score each file under the directory (scoring once is enough)
            if (Ans < 0) { // file the text into a Result subfolder according to its score
                FileUtils.copyFile(new File(DirectoryPath + Text_Path[i]), new File(new File("Result", "Positive"), Text_Path[i]));
                PositiveNum++;
            }
            else if (Ans > 0) {
                FileUtils.copyFile(new File(DirectoryPath + Text_Path[i]), new File(new File("Result", "Negative"), Text_Path[i]));
                NegativeNum++;
            }
            else {
                FileUtils.copyFile(new File(DirectoryPath + Text_Path[i]), new File(new File("Result", "Unsure"), Text_Path[i]));
                UnsureNum++;
            }
            System.out.print("No." + (i + 1) + " " + Text_Path[i] + ": ");
            if (Ans < 0) { System.out.println("Positive"); }
            else if (Ans > 0) { System.out.println("Negative"); }
            else { System.out.println("Unsure"); }
        }
        System.out.println("End.");
        System.out.println("NegativeNum = " + NegativeNum + " PositiveNum = " + PositiveNum + " UnsureNum = " + UnsureNum);
    }
    public double Classify(String FilePath) throws Exception {
        Hashtable<String, Integer> FileHashTable = Read_TestFile(FilePath);
        Enumeration<String> Keys;
        double NegativeAns = 1, PositiveAns = 1;
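        // note: multiplying many per-word probabilities can underflow to 0 on long texts;
        // summing logarithms would be the numerically safer formulation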
        Keys = FileHashTable.keys();
        while (Keys.hasMoreElements()) {
            String Word = Keys.nextElement();
            NegativeAns *= (Math.pow(this.PostProbability(Word, NegativeWeight), FileHashTable.get(Word)));
        }
        NegativeAns *= this.PriorProbability(NegativeDoc);
        Keys = FileHashTable.keys();
        while (Keys.hasMoreElements()) {
            String Word = Keys.nextElement();
            PositiveAns *= (Math.pow(this.PostProbability(Word, PositiveWeight), FileHashTable.get(Word)));
        }
        PositiveAns *= this.PriorProbability(PositiveDoc);
        return (NegativeAns - PositiveAns); // < 0 means positive wins, > 0 means negative wins
    }
    public Hashtable<String, Integer> Read_TestFile(String FilePath) throws Exception {
        ArrayList<String> FileCurrentList = new ArrayList<String>();
        ReadXML(FilePath, FileCurrentList);
        Hashtable<String, Integer> FileHashTable = HashTable(FileCurrentList);
        return FileHashTable;
    }
    // read the XML file at the given path and collect its sentiment words into currentList
    public void ReadXML(String FilePath, ArrayList<String> currentList) throws Exception {
        SAXReader SaxReader = new SAXReader();
        Document Doc = SaxReader.read(new File(FilePath));
        Element root = Doc.getRootElement();
        Element content = root.element("content");
        List<?> sentenses = content.elements("sentence"); // one element per sentence
        for (Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) {
            Element sentense = (Element) iter.next();
            List<?> toks = sentense.elements();
            for (Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) {
                Element tok = (Element) iter1.next();
                String Type = tok.attributeValue("type");
                if (Type.equals("group")) { // a top-level "atom" can never be a sentiment word
                    GetWord(tok, currentList); // extract the words inside the "group"
                }
            }
        }
    }
    // collect the sentiment words inside an XML "group" element
    public void GetWord(Element root, ArrayList<String> currentList) {
        String Word = "";
        List<?> elements = root.elements("tok");
        for (Iterator<?> iter = elements.iterator(); iter.hasNext(); ) {
            Element tok = (Element) iter.next();
            String Type = tok.attributeValue("type");
            if (Type.compareTo("atom") == 0) {
                Word += tok.getText().trim();
            }
            else {
                GetWord(tok, currentList);
            }
        }
        if (Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word))) { // keep only dictionary words
            currentList.add(Word);
        }
    }
    // build a word-frequency table from the sentiment words of a text
    public Hashtable<String, Integer> HashTable(ArrayList<String> currentList) {
        Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>();
        for (Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) {
            String Word = (String) iter.next();
            if (HashTable.containsKey(Word)) {
                Integer Weight = HashTable.get(Word);
                HashTable.put(Word, Weight + 1);
            }
            else {
                HashTable.put(Word, 1);
            }
        }
        return HashTable;
    }
    public double PriorProbability(Integer SentimentDoc) { // P(class) = docs in class / total docs
        return (double) SentimentDoc / ((double) NegativeDoc + (double) PositiveDoc + (double) UnsureDoc);
    }
    public double PostProbability(String Word, Hashtable<String, Integer> SentimentWeight) {
        double Ans, V, E;
        double Weight = 0, Weights = 0;
        if (SentimentWeight.containsKey(Word))
            Weight = (double) SentimentWeight.get(Word);
        Weights = PostWeights(SentimentWeight);
        // V is the total word mass over all three classes; with E = 1/V the formula below
        // reduces to (count + 1/V) / (total + 1), a Laplace-style smoothed P(word|class)
        V = PostWeights(NegativeWeight) + PostWeights(PositiveWeight) + PostWeights(UnsureWeight);
        E = 1 / Math.abs(V);
        Ans = (Weight + E) / (Weights + E * Math.abs(V));
        return Ans;
    }
    public double PostWeights(Hashtable<String, Integer> SentimentWeight) { // total word count of one class
        double Weights = 0;
        Enumeration<String> Keys;
        Keys = SentimentWeight.keys();
        while (Keys.hasMoreElements()) {
            String Key = Keys.nextElement();
            Weights += (double) SentimentWeight.get(Key);
        }
        return Weights;
    }
}