| @@ -0,0 +1,7 @@ | |||||
| .project | |||||
| .classpath | |||||
| target/ | |||||
| Result/ | |||||
| Model/ | |||||
| .settings/ | |||||
| @@ -0,0 +1,15 @@ | |||||
| 基于自然语言处理的情感分析工具 | |||||
| 本程序依赖data目录下面的data.zip,先解压缩 data 目录下面的 data.zip到当前目录。 | |||||
| 1、基于词典和贝叶斯模型的情感分析 | |||||
| 主程序:eshore.cn.it.sentiment.Sentiment 此类通过 | |||||
| data/Sentiment_Dictionary中的正负面词语建立模型。 | |||||
| 测试: eshore.cn.it.sentiment.SentimentTest | |||||
| 通过这个类就可以测试 data/500trainblogxml中的某个文件夹下面的博客的情感。 | |||||
| 2、直接利用lingpipe的情感分析模块测试情感分析 | |||||
| 直接运行程序: eshore.cn.it.sentiment.ChinesePolarityBasic | |||||
| 程序就会通过: data/polarity_corpus/hotel_reviews/train2训练 | |||||
| 然后自动测试: data/polarity_corpus/hotel_reviews/test2 | |||||
| 最后给出程序测试结果。 | |||||
| @@ -0,0 +1,5 @@ | |||||
| 500trainblogxml/ | |||||
| nerws_corpus/ | |||||
| output/ | |||||
| polarity_corpus/ | |||||
| Sentiment_Dictionary/ | |||||
| @@ -0,0 +1,58 @@ | |||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>eshore.cn.it</groupId>
	<artifactId>nlp-sentiment</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>
	<name>nlp-sentiment</name>
	<url>http://maven.apache.org</url>
	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<commons.io.version>2.4</commons.io.version>
		<dom4j.version>1.6.1</dom4j.version>
		<lingpipe.version>4.1.0</lingpipe.version>
		<jieba.version>1.0.0</jieba.version>
		<!-- Jars that cannot be fetched from Maven Central are kept together in this local directory -->
		<maven.libs.home>F:/java_git_projects/nlp-sentiment/libs</maven.libs.home>
	</properties>
	<dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>${commons.io.version}</version>
		</dependency>
		<dependency>
			<groupId>dom4j</groupId>
			<artifactId>dom4j</artifactId>
			<version>${dom4j.version}</version>
		</dependency>
		<!-- The LingPipe core jar must be downloaded manually from the LingPipe website
		     and placed under ${maven.libs.home} (system-scope dependency below) -->
		<dependency>
			<groupId>com.aliasi</groupId>
			<artifactId>lingpipe</artifactId>
			<version>${lingpipe.version}</version>
			<scope>system</scope>
			<systemPath>${maven.libs.home}/lingpipe-4.1.0.jar</systemPath>
		</dependency>
		<!-- jieba Chinese word segmenter -->
		<dependency>
			<groupId>com.huaban</groupId>
			<artifactId>jieba-analysis</artifactId>
			<version>${jieba.version}</version>
		</dependency>
	</dependencies>
</project>
| @@ -0,0 +1,146 @@ | |||||
| package eshore.cn.it.sentiment; | |||||
| import java.io.File; | |||||
| import java.io.FileReader; | |||||
| import java.io.IOException; | |||||
| import java.util.List; | |||||
| import org.apache.commons.io.IOUtils; | |||||
| import com.aliasi.classify.Classification; | |||||
| import com.aliasi.classify.Classified; | |||||
| import com.aliasi.classify.DynamicLMClassifier; | |||||
| import com.aliasi.lm.NGramProcessLM; | |||||
| import com.aliasi.util.Files; | |||||
| import com.huaban.analysis.jieba.JiebaSegmenter; | |||||
| import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; | |||||
| import com.huaban.analysis.jieba.SegToken; | |||||
| /** | |||||
| * ChinesePolarityBasic 此类是利用lingpipe作中文情感预测的示例类 | |||||
| * lingpipe适合做增量分析 | |||||
| * @clebeg 2015-03-13 | |||||
| * @version 0.0.1 | |||||
| * */ | |||||
| public class ChinesePolarityBasic { | |||||
| private String[] mCategories = new String[]{"+1", "-1"}; | |||||
| //这就是分类模型 | |||||
| private DynamicLMClassifier<NGramProcessLM> mClassifier; | |||||
| private int numTests = 0; | |||||
| private int numCorrect = 0; | |||||
| private static final String TRAINFILES_INFO = | |||||
| "data/polarity_corpus/hotel_reviews/train2.rlabelclass"; | |||||
| private static final String TRAINFILES_DIR = | |||||
| "data/polarity_corpus/hotel_reviews/train2"; | |||||
| private static final String TESTFILES_DIR = | |||||
| "data/polarity_corpus/hotel_reviews/test2"; | |||||
| private static final String TESTFILES_INFO = | |||||
| "data/polarity_corpus/hotel_reviews/test2.rlabelclass"; | |||||
| private static final String ENCODING = "GBK"; | |||||
| private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter(); | |||||
| private final SegMode segMode = SegMode.INDEX; | |||||
| public static void main(String[] args) { | |||||
| try { | |||||
| new ChinesePolarityBasic().run(); | |||||
| } catch (Throwable t) { | |||||
| System.out.println("Thrown: " + t); | |||||
| t.printStackTrace(System.out); | |||||
| } | |||||
| } | |||||
| public ChinesePolarityBasic() { | |||||
| super(); | |||||
| int nGram = 8; | |||||
| mClassifier | |||||
| = DynamicLMClassifier | |||||
| .createNGramProcess(mCategories,nGram); | |||||
| } | |||||
| private void run() throws ClassNotFoundException, | |||||
| IOException { | |||||
| train(); | |||||
| evaluate(); | |||||
| } | |||||
| private void train() throws IOException { | |||||
| FileReader input = new FileReader(new File(TRAINFILES_INFO)); | |||||
| List<String> trainInfos = IOUtils.readLines(input); | |||||
| for (String str : trainInfos){ | |||||
| String[] train = str.split(" "); | |||||
| train(train[1], new File(TRAINFILES_DIR, train[0]), ENCODING); | |||||
| } | |||||
| } | |||||
| private void evaluate() throws IOException { | |||||
| FileReader input = new FileReader(new File(TESTFILES_INFO)); | |||||
| List<String> trainInfos = IOUtils.readLines(input); | |||||
| for (String str : trainInfos){ | |||||
| String[] train = str.split(" "); | |||||
| evaluate(train[1], new File(TESTFILES_DIR, train[0]), ENCODING); | |||||
| } | |||||
| System.out.println(" # Test Cases=" | |||||
| + numTests); | |||||
| System.out.println(" # Correct=" | |||||
| + numCorrect); | |||||
| System.out.println(" % Correct=" | |||||
| + ((double)numCorrect) | |||||
| /(double)numTests); | |||||
| } | |||||
| /** | |||||
| * 给定分类标识,给定训练文本,给定文本的编码,即可作分类训练 | |||||
| * 分类完成之后就会加入到分类模型中 | |||||
| * @throws IOException | |||||
| * */ | |||||
| private void train(String category, File trainFile, String fileEncoding) | |||||
| throws IOException { | |||||
| Classification classification = new Classification(category); | |||||
| String review = Files.readFromFile(trainFile, fileEncoding); | |||||
| //此处加入中文分词器,得到分词之后的字符串 | |||||
| List<SegToken> segTokens = jiebaSegmenter.process(review, segMode); | |||||
| review = ""; | |||||
| for (SegToken seg : segTokens) { | |||||
| review += seg.word.getToken() + " "; | |||||
| } | |||||
| Classified<CharSequence> classified | |||||
| = new Classified<CharSequence>(review,classification); | |||||
| mClassifier.handle(classified); | |||||
| } | |||||
| /** | |||||
| * 给定分类标识,给定测试文本,给定文本的编码,即可作测试模型 | |||||
| * @throws IOException | |||||
| * */ | |||||
| private void evaluate(String category, File testFile, String fileEncoding) | |||||
| throws IOException { | |||||
| String review | |||||
| = Files.readFromFile(testFile, fileEncoding); | |||||
| //同理,这里可以加入分词器,这样可以试试效果如何。 | |||||
| List<SegToken> segTokens = jiebaSegmenter.process(review, segMode); | |||||
| review = ""; | |||||
| for (SegToken seg : segTokens) { | |||||
| review += seg.word.getToken() + " "; | |||||
| } | |||||
| ++numTests; | |||||
| Classification classification | |||||
| = mClassifier.classify(review); | |||||
| //得到训练结果 | |||||
| String resultCategory | |||||
| = classification.bestCategory(); | |||||
| if (resultCategory.equals(category)) | |||||
| ++numCorrect; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,94 @@ | |||||
| package eshore.cn.it.sentiment; | |||||
| import java.io.File; | |||||
| import java.io.IOException; | |||||
| import com.aliasi.classify.Classification; | |||||
| import com.aliasi.classify.Classified; | |||||
| import com.aliasi.classify.DynamicLMClassifier; | |||||
| import com.aliasi.lm.NGramProcessLM; | |||||
| import com.aliasi.util.Files; | |||||
| public class PolarityBasic { | |||||
| File mPolarityDir; | |||||
| String[] mCategories; | |||||
| DynamicLMClassifier<NGramProcessLM> mClassifier; | |||||
| public PolarityBasic(String[] args) { | |||||
| mPolarityDir = new File("data/polarity_corpus","txt_sentoken"); | |||||
| mCategories = mPolarityDir.list(); | |||||
| int nGram = 8; | |||||
| mClassifier | |||||
| = DynamicLMClassifier | |||||
| .createNGramProcess(mCategories,nGram); | |||||
| } | |||||
| public static void main(String[] args) { | |||||
| try { | |||||
| new PolarityBasic(args).run(); | |||||
| } catch (Throwable t) { | |||||
| System.out.println("Thrown: " + t); | |||||
| t.printStackTrace(System.out); | |||||
| } | |||||
| } | |||||
| private void run() throws ClassNotFoundException, | |||||
| IOException { | |||||
| train(); | |||||
| evaluate(); | |||||
| } | |||||
| private void train() throws IOException { | |||||
| for (int i = 0; i < mCategories.length; ++i) { | |||||
| String category = mCategories[i]; | |||||
| Classification classification | |||||
| = new Classification(category); | |||||
| File dir = new File(mPolarityDir, mCategories[i]); | |||||
| File[] trainFiles = dir.listFiles(); | |||||
| for (int j = 0; j < trainFiles.length; ++j) { | |||||
| File trainFile = trainFiles[j]; | |||||
| if (isTrainingFile(trainFile)) { | |||||
| String review | |||||
| = Files.readFromFile(trainFile,"ISO-8859-1"); | |||||
| Classified<CharSequence> classified | |||||
| = new Classified<CharSequence>(review,classification); | |||||
| mClassifier.handle(classified); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| boolean isTrainingFile(File file) { | |||||
| return file.getName().charAt(2) != '9'; // test on fold 9 | |||||
| } | |||||
| void evaluate() throws IOException { | |||||
| int numTests = 0; | |||||
| int numCorrect = 0; | |||||
| for (int i = 0; i < mCategories.length; ++i) { | |||||
| String category = mCategories[i]; | |||||
| File file = new File(mPolarityDir,mCategories[i]); | |||||
| File[] testFiles = file.listFiles(); | |||||
| for (int j = 0; j < testFiles.length; ++j) { | |||||
| File testFile = testFiles[j]; | |||||
| if (!isTrainingFile(testFile)) { | |||||
| String review | |||||
| = Files.readFromFile(testFile,"ISO-8859-1"); | |||||
| ++numTests; | |||||
| Classification classification | |||||
| = mClassifier.classify(review); | |||||
| String resultCategory | |||||
| = classification.bestCategory(); | |||||
| if (resultCategory.equals(category)) | |||||
| ++numCorrect; | |||||
| } | |||||
| } | |||||
| } | |||||
| System.out.println(" # Test Cases=" | |||||
| + numTests); | |||||
| System.out.println(" # Correct=" | |||||
| + numCorrect); | |||||
| System.out.println(" % Correct=" | |||||
| + ((double)numCorrect) | |||||
| /(double)numTests); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,211 @@ | |||||
| package eshore.cn.it.sentiment; | |||||
| import java.io.*; | |||||
| import java.util.*; | |||||
| import org.apache.commons.io.FileUtils; | |||||
| import org.dom4j.Document; | |||||
| import org.dom4j.Element; | |||||
| import org.dom4j.io.SAXReader; | |||||
| public class Sentiment { | |||||
| static private HashSet<String> Negative, Positive; //两种情感词典 | |||||
| static private Integer NegativeDoc, PositiveDoc, UnsureDoc; //属于两种情感的文本数 - 所构建模型需要保存下的值 | |||||
| static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; //两种情感中所有词与他的权值 - 所构建模型需要保存下的值 | |||||
| static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/"; | |||||
| static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt"; | |||||
| static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt"; | |||||
| static final String FILE_ENCODING = "UTF-8"; | |||||
| public static void main(String[] args) throws Exception { | |||||
| // TODO 自动生成的方法存根 | |||||
| Sentiment Sentiment = new Sentiment(); | |||||
| Sentiment.Model( ); | |||||
| Sentiment.Save_Model(); | |||||
| } | |||||
| public void Model( ) throws Exception { | |||||
| this.Read_Sentiment_Dictionary(); | |||||
| this.Sentiment_Doc_Weight(SENTIMENT_DOC_WEIGHT_PATH); | |||||
| } | |||||
| @SuppressWarnings("resource") | |||||
| public void Read_Sentiment_Dictionary() throws Exception { | |||||
| BufferedReader buf; | |||||
| String str; | |||||
| //集合,里面元素不允许重复 | |||||
| Negative = new HashSet<String>(); | |||||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||||
| while( (str = buf.readLine()) != null ) { | |||||
| Negative.add(str); | |||||
| } | |||||
| Positive = new HashSet<String>(); | |||||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||||
| while( (str = buf.readLine()) != null ) { | |||||
| Positive.add(str); | |||||
| } | |||||
| } | |||||
| public void Sentiment_Doc_Weight( String DirPath ) throws Exception { | |||||
| File NegativeDir = new File( DirPath + "negativeout" ); | |||||
| String[] NegativeFiles = NegativeDir.list(); | |||||
| NegativeDoc = NegativeFiles.length; | |||||
| ArrayList<String> NegativeCurrentList = new ArrayList<String>(); | |||||
| for ( int i = 0; i < NegativeFiles.length; i ++ ) { | |||||
| System.out.println("NegativeFiles No."+(i+1)+" "+DirPath+"negativeout/"+NegativeFiles[i]); | |||||
| this.ReadXML(DirPath+"negativeout/"+NegativeFiles[i], NegativeCurrentList); | |||||
| } | |||||
| NegativeWeight = HashTable( NegativeCurrentList ); | |||||
| /**********************************************************************************************************/ | |||||
| File PositiveDir = new File( DirPath + "positiveout" ); | |||||
| String[] PositiveFiles = PositiveDir.list(); | |||||
| PositiveDoc = PositiveFiles.length; | |||||
| ArrayList<String> PositiveCurrentList = new ArrayList<String>(); | |||||
| for ( int i = 0; i < PositiveFiles.length; i ++ ) { | |||||
| System.out.println("PositiveFiles No."+(i+1)+" "+DirPath+"positiveout/"+PositiveFiles[i]); | |||||
| this.ReadXML(DirPath+"positiveout/"+PositiveFiles[i], PositiveCurrentList); | |||||
| } | |||||
| PositiveWeight = HashTable( PositiveCurrentList ); | |||||
| /*********************************************************************************************************/ | |||||
| File UnsureDir = new File( DirPath + "unsureout" ); | |||||
| String[] UnsureFiles = UnsureDir.list(); | |||||
| UnsureDoc = UnsureFiles.length; | |||||
| ArrayList<String> UnsureCurrentList = new ArrayList<String>(); | |||||
| for ( int i = 0; i < UnsureFiles.length; i ++ ) { | |||||
| System.out.println("UnsureFiles No."+(i+1)+" "+DirPath+"unsureout/"+UnsureFiles[i]); | |||||
| this.ReadXML(DirPath+"unsureout/"+UnsureFiles[i], UnsureCurrentList); | |||||
| } | |||||
| UnsureWeight = HashTable( UnsureCurrentList ); | |||||
| /********************************************************************************************************/ | |||||
| System.out.println("UnsureCurrent = " + UnsureCurrentList.size() + " UnsureHashTable = " + UnsureWeight.size()); | |||||
| System.out.println("PositiveCurrent = " + PositiveCurrentList.size() + " PositiveHashTable = " + PositiveWeight.size()); | |||||
| System.out.println("NegativeCurrent = " + NegativeCurrentList.size() + " NegativeHashTable = " + NegativeWeight.size()); | |||||
| System.out.println("NegativeDoc = " + NegativeDoc + " PositiveDoc = " + PositiveDoc + " UnsureDoc = " + UnsureDoc); | |||||
| } | |||||
| public void ReadXML( String FilePath, ArrayList<String> currentList ) throws Exception { //从指定路径读取XML文件并提取出其情感词返回 | |||||
| SAXReader SaxReader = new SAXReader(); | |||||
| Document Doc = SaxReader.read(new File(FilePath)); | |||||
| Element root = Doc.getRootElement(); | |||||
| Element content = root.element("content"); | |||||
| List<?> sentenses = content.elements("sentence"); //每一句话作为一项 | |||||
| for ( Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) { | |||||
| Element sentense = (Element)iter.next(); | |||||
| List<?> toks = sentense.elements(); | |||||
| for ( Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) { | |||||
| Element tok = (Element)iter1.next(); | |||||
| String Type = tok.attributeValue("type"); | |||||
| if ( Type.equals("group") ) { //如果是"atom"一定不存在于情感词中 | |||||
| GetWord( tok, currentList ); //从"group"中获取词 | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| public void GetWord( Element root, ArrayList<String> currentList ) { //获取XML中的情感词 | |||||
| String Word = ""; | |||||
| List<?> elements = root.elements("tok"); | |||||
| for ( Iterator<?> iter = elements.iterator(); iter.hasNext(); ) { | |||||
| Element tok = (Element)iter.next(); | |||||
| String Type = tok.attributeValue("type"); | |||||
| if ( Type.compareTo("atom") == 0 ) { | |||||
| Word += tok.getText().trim(); | |||||
| } | |||||
| else { | |||||
| GetWord( tok, currentList ); | |||||
| } | |||||
| } | |||||
| if ( Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word)) ) { //筛选出情感词 | |||||
| currentList.add(Word); | |||||
| } | |||||
| } | |||||
| public Hashtable<String, Integer> HashTable( ArrayList<String> currentList ) { //根据文本中的情感词构建哈希表 | |||||
| Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>(); | |||||
| for ( Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) { | |||||
| String Word = (String)iter.next(); | |||||
| if ( HashTable.containsKey(Word) ) { | |||||
| Integer Weight = HashTable.get(Word); | |||||
| HashTable.put(Word, Weight+1); | |||||
| } | |||||
| else { | |||||
| HashTable.put(Word, 1); | |||||
| } | |||||
| } | |||||
| return HashTable; | |||||
| } | |||||
| @SuppressWarnings("resource") | |||||
| public void Save_Model( ) throws Exception { | |||||
| ObjectOutputStream OOS; | |||||
| File ModelPath = new File("Model"); | |||||
| File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||||
| File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||||
| File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||||
| if ( !ModelPath.exists() ) { ModelPath.mkdir(); } | |||||
| System.out.println("Saving NegativeModel..."); | |||||
| OOS = new ObjectOutputStream( new FileOutputStream( NegativeModel ) ); //对象流直接写入 | |||||
| OOS.writeObject(NegativeDoc); | |||||
| OOS.writeObject(NegativeWeight); | |||||
| System.out.println("Saving PositiveModel..."); | |||||
| OOS = new ObjectOutputStream( new FileOutputStream( PositiveModel ) ); | |||||
| OOS.writeObject(PositiveDoc); | |||||
| OOS.writeObject(PositiveWeight); | |||||
| System.out.println("Saving UnsureModel..."); | |||||
| OOS = new ObjectOutputStream( new FileOutputStream( UnsureModel ) ); | |||||
| OOS.writeObject(UnsureDoc); | |||||
| OOS.writeObject(UnsureWeight); | |||||
| Enumeration<String> Keys; | |||||
| System.out.println("Saving NegativeWeight..."); | |||||
| Keys = NegativeWeight.keys(); | |||||
| while( Keys.hasMoreElements() ) { | |||||
| String Key = Keys.nextElement(); | |||||
| FileUtils.writeStringToFile(new File("Model", "NegativeWeight.txt"), Key+"\t\t\t"+NegativeWeight.get(Key)+"\r\n", "UTF-8", true); | |||||
| } | |||||
| System.out.println("Saving PositiveWeight..."); | |||||
| Keys = PositiveWeight.keys(); | |||||
| while( Keys.hasMoreElements() ) { | |||||
| String Key = Keys.nextElement(); | |||||
| FileUtils.writeStringToFile(new File("Model", "PositiveWeight.txt"), Key+"\t\t\t"+PositiveWeight.get(Key)+"\r\n", "UTF-8", true); | |||||
| } | |||||
| System.out.println("Saving UnsureWeight..."); | |||||
| Keys = UnsureWeight.keys(); | |||||
| while( Keys.hasMoreElements() ) { | |||||
| String Key = Keys.nextElement(); | |||||
| FileUtils.writeStringToFile(new File("Model", "UnsureWeight.txt"), Key+"\t\t\t"+UnsureWeight.get(Key)+"\r\n", "UTF-8", true); | |||||
| } | |||||
| System.out.println("Save Success!"); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,244 @@ | |||||
| package eshore.cn.it.sentiment; | |||||
| import java.io.*; | |||||
| import java.util.*; | |||||
| import org.apache.commons.io.FileUtils; | |||||
| import org.dom4j.Document; | |||||
| import org.dom4j.Element; | |||||
| import org.dom4j.io.SAXReader; | |||||
| public class SentimentTest { | |||||
| static private HashSet<String> Negative, Positive; //两种情感词典 | |||||
| static private Integer NegativeDoc, PositiveDoc, UnsureDoc; //属于两种情感的文本数 - 所构建模型需要保存下的值 | |||||
| static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight; //两种情感中所有词与他的权值 - 所构建模型需要保存下的值 | |||||
| static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/negativeout/"; | |||||
| static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt"; | |||||
| static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt"; | |||||
| static final String FILE_ENCODING = "UTF-8"; | |||||
| public static void main(String[] args) throws Exception { | |||||
| // TODO 自动生成的方法存根 | |||||
| SentimentTest Sentiment_Test = new SentimentTest(); | |||||
| Sentiment_Test.Read_Model(); //读取模型 | |||||
| Sentiment_Test.Classify_Directory(SENTIMENT_DOC_WEIGHT_PATH); | |||||
| } | |||||
| @SuppressWarnings({ "resource", "unchecked" }) | |||||
| public void Read_Model() throws Exception { | |||||
| this.Read_Sentiment_Dictionary(); | |||||
| ObjectInputStream OIS; //对象流直接读入 | |||||
| File ModelPath = new File("Model"); | |||||
| File NegativeModel = new File(ModelPath, "NegativeModel.txt"); | |||||
| File PositiveModel = new File(ModelPath, "PositiveModel.txt"); | |||||
| File UnsureModel = new File(ModelPath, "UnsureModel.txt"); | |||||
| System.out.println("Reading NegativeModel..."); | |||||
| OIS = new ObjectInputStream( new FileInputStream( NegativeModel ) ); | |||||
| NegativeDoc = (Integer) OIS.readObject(); | |||||
| NegativeWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||||
| System.out.println("Reading PositiveModel..."); | |||||
| OIS = new ObjectInputStream( new FileInputStream( PositiveModel ) ); | |||||
| PositiveDoc = (Integer) OIS.readObject(); | |||||
| PositiveWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||||
| System.out.println("Reading UnsureModel..."); | |||||
| OIS = new ObjectInputStream( new FileInputStream( UnsureModel ) ); | |||||
| UnsureDoc = (Integer) OIS.readObject(); | |||||
| UnsureWeight = (Hashtable<String, Integer>) OIS.readObject(); | |||||
| System.out.println("Read Success."); | |||||
| } | |||||
| @SuppressWarnings("resource") | |||||
| public void Read_Sentiment_Dictionary( ) throws Exception { //读入情感词典 | |||||
| BufferedReader buf; | |||||
| String str; | |||||
| Negative = new HashSet<String>(); | |||||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); | |||||
| while( (str = buf.readLine()) != null ) { | |||||
| Negative.add(str); | |||||
| } | |||||
| Positive = new HashSet<String>(); | |||||
| buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); | |||||
| while( (str = buf.readLine()) != null ) { | |||||
| Positive.add(str); | |||||
| } | |||||
| } | |||||
| public void Classify_Directory( String DirectoryPath ) throws Exception { | |||||
| int PositiveNum = 0, NegativeNum = 0, UnsureNum = 0; | |||||
| String[] Text_Path = new File( DirectoryPath ).list(); | |||||
| for ( int i = 0; i < Text_Path.length; i ++ ) { | |||||
| Classify( DirectoryPath+Text_Path[i] ); | |||||
| double Ans = Classify( DirectoryPath+Text_Path[i] ); //对当前目录下的每一个文件进行测试 | |||||
| if ( Ans < 0 ) { //根据测试结果将测试文本进行分类 | |||||
| FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Positive"), Text_Path[i])); | |||||
| PositiveNum ++; | |||||
| } | |||||
| else if ( Ans > 0 ) { | |||||
| FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Negative"), Text_Path[i])); | |||||
| NegativeNum ++; | |||||
| } | |||||
| else { | |||||
| FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Unsure"), Text_Path[i])); | |||||
| UnsureNum ++; | |||||
| } | |||||
| System.out.print( "No." + (i+1) + " " + Text_Path[i] + ": " ); | |||||
| if ( Ans < 0 ) { System.out.println("Positive"); } | |||||
| else if ( Ans > 0 ) { System.out.println("Negative"); } | |||||
| else { System.out.println("Unsure"); } | |||||
| } | |||||
| System.out.println("End."); | |||||
| System.out.println("NegativeNum = " + NegativeNum + " PositiveNum = " + PositiveNum + " UnsureNum = " + UnsureNum); | |||||
| } | |||||
| public double Classify( String FilePath ) throws Exception { | |||||
| Hashtable<String, Integer> FileHashTable = Read_TestFile( FilePath ); | |||||
| Enumeration<String> Keys; | |||||
| double NegativeAns = 1, PositiveAns = 1; | |||||
| Keys = FileHashTable.keys(); | |||||
| while( Keys.hasMoreElements() ) { | |||||
| String Word = Keys.nextElement(); | |||||
| NegativeAns *= ( Math.pow(this.PostProbability(Word, NegativeWeight), FileHashTable.get(Word)) ); | |||||
| } | |||||
| NegativeAns *= this.PriorProbability(NegativeDoc); | |||||
| Keys = FileHashTable.keys(); | |||||
| while( Keys.hasMoreElements() ) { | |||||
| String Word = Keys.nextElement(); | |||||
| PositiveAns *= ( Math.pow(this.PostProbability(Word, PositiveWeight), FileHashTable.get(Word)) ); | |||||
| } | |||||
| PositiveAns *= this.PriorProbability(PositiveDoc); | |||||
| return ( NegativeAns-PositiveAns ); | |||||
| } | |||||
| public Hashtable<String, Integer> Read_TestFile( String FilePath ) throws Exception { | |||||
| ArrayList<String> FileCurrentList = new ArrayList<String>(); | |||||
| ReadXML( FilePath, FileCurrentList ); | |||||
| Hashtable<String, Integer> FileHashTable = HashTable( FileCurrentList ); | |||||
| return FileHashTable; | |||||
| } | |||||
| public void ReadXML( String FilePath, ArrayList<String> currentList ) throws Exception { //从指定路径读取XML文件并提取出其情感词返回 | |||||
| SAXReader SaxReader = new SAXReader(); | |||||
| Document Doc = SaxReader.read(new File(FilePath)); | |||||
| Element root = Doc.getRootElement(); | |||||
| Element content = root.element("content"); | |||||
| List<?> sentenses = content.elements("sentence"); //每一句话作为一项 | |||||
| for ( Iterator<?> iter = sentenses.iterator(); iter.hasNext(); ) { | |||||
| Element sentense = (Element)iter.next(); | |||||
| List<?> toks = sentense.elements(); | |||||
| for ( Iterator<?> iter1 = toks.iterator(); iter1.hasNext(); ) { | |||||
| Element tok = (Element)iter1.next(); | |||||
| String Type = tok.attributeValue("type"); | |||||
| if ( Type.equals("group") ) { //如果是"atom"一定不存在于情感词中 | |||||
| GetWord( tok, currentList ); //从"group"中获取词 | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| public void GetWord( Element root, ArrayList<String> currentList ) { //获取XML中的情感词 | |||||
| String Word = ""; | |||||
| List<?> elements = root.elements("tok"); | |||||
| for ( Iterator<?> iter = elements.iterator(); iter.hasNext(); ) { | |||||
| Element tok = (Element)iter.next(); | |||||
| String Type = tok.attributeValue("type"); | |||||
| if ( Type.compareTo("atom") == 0 ) { | |||||
| Word += tok.getText().trim(); | |||||
| } | |||||
| else { | |||||
| GetWord( tok, currentList ); | |||||
| } | |||||
| } | |||||
| if ( Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word)) ) { //筛选出情感词 | |||||
| currentList.add(Word); | |||||
| } | |||||
| } | |||||
| public Hashtable<String, Integer> HashTable( ArrayList<String> currentList ) { //根据文本中的情感词构建哈希表 | |||||
| Hashtable<String, Integer> HashTable = new Hashtable<String, Integer>(); | |||||
| for ( Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) { | |||||
| String Word = (String)iter.next(); | |||||
| if ( HashTable.containsKey(Word) ) { | |||||
| Integer Weight = HashTable.get(Word); | |||||
| HashTable.put(Word, Weight+1); | |||||
| } | |||||
| else { | |||||
| HashTable.put(Word, 1); | |||||
| } | |||||
| } | |||||
| return HashTable; | |||||
| } | |||||
| public double PriorProbability( Integer SentimentDoc ) { | |||||
| double Ans = 1; | |||||
| Ans = ( (double)SentimentDoc/( (double)NegativeDoc+(double)PositiveDoc+(double)UnsureDoc ) ); | |||||
| return Ans; | |||||
| } | |||||
| public double PostProbability( String Word, Hashtable<String, Integer> SentimentWeight ) { | |||||
| double Ans, V, E; | |||||
| double Weight = 0, Weights = 0; | |||||
| if ( SentimentWeight.containsKey(Word) ) | |||||
| Weight = (double)SentimentWeight.get(Word); | |||||
| Weights = PostWeights( SentimentWeight ); | |||||
| V = PostWeights( NegativeWeight ) + PostWeights( PositiveWeight ) + PostWeights( UnsureWeight ); | |||||
| E = 1/Math.abs(V); | |||||
| Ans = ( Weight + E )/( Weights + E*Math.abs(V) ); | |||||
| return Ans; | |||||
| } | |||||
| public double PostWeights( Hashtable<String, Integer> SentimentWeight ) { | |||||
| double Weights = 0; | |||||
| Enumeration<String> Keys; | |||||
| Keys = SentimentWeight.keys(); | |||||
| while( Keys.hasMoreElements() ) { | |||||
| String Key = Keys.nextElement(); | |||||
| Weights += (double)SentimentWeight.get(Key); | |||||
| } | |||||
| return Weights; | |||||
| } | |||||
| } | |||||