commit 941597f3ddb382c877bda4426ff56473d5eb41d9 Author: gitclebeg Date: Thu Apr 9 08:45:05 2015 +0800 initial the project nlp-sentiment diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..806dab5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.project +.classpath +target/ +Result/ +Model/ + +.settings/ \ No newline at end of file diff --git a/README b/README new file mode 100644 index 0000000..6340f9a --- /dev/null +++ b/README @@ -0,0 +1,15 @@ +基于自然语言处理的情感分析工具 +本程序依赖data目录下面的data.zip,先解压缩 data 目录下面的 data.zip到当前目录。 + +1、基于词典和贝叶斯模型的情感分析 +主程序:eshore.cn.it.sentiment.Sentiment 此类通过 +data/Sentiment_Dictionary中的正负面词语建立模型。 + +测试: eshore.cn.it.sentiment.SentimentTest +通过这个类就可以测试 data/500trainblogxml中的某个文件夹下面的博客的情感。 + +2、直接利用lingpipe的情感分析模块测试情感分析 +直接运行程序: eshore.cn.it.sentiment.ChinesePolarityBasic +程序就会通过: data/polarity_corpus/hotel_reviews/train2训练 +然后自动测试: data/polarity_corpus/hotel_reviews/test2 +最后给出程序测试结果。 \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..31367df --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,5 @@ +500trainblogxml/ +nerws_corpus/ +output/ +polarity_corpus/ +Sentiment_Dictionary/ \ No newline at end of file diff --git a/data/data.zip b/data/data.zip new file mode 100644 index 0000000..3e4c5d7 Binary files /dev/null and b/data/data.zip differ diff --git a/libs/lingpipe-4.1.0.jar b/libs/lingpipe-4.1.0.jar new file mode 100644 index 0000000..654e653 Binary files /dev/null and b/libs/lingpipe-4.1.0.jar differ diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..66850e2 --- /dev/null +++ b/pom.xml @@ -0,0 +1,58 @@ + + 4.0.0 + + eshore.cn.it + nlp-sentiment + 0.0.1-SNAPSHOT + jar + + nlp-sentiment + http://maven.apache.org + + + UTF-8 + 2.4 + 1.6.1 + 4.1.0 + 1.0.0 + + F:/java_git_projects/nlp-sentiment/libs + + + + + junit + junit + 3.8.1 + test + + + commons-io + commons-io + ${commons.io.version} + + + dom4j + dom4j + ${dom4j.version} + + + + + + com.aliasi + 
lingpipe + ${lingpipe.version} + system + ${maven.libs.home}/lingpipe-4.1.0.jar + + + + + com.huaban + jieba-analysis + ${jieba.version} + + + diff --git a/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java b/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java new file mode 100644 index 0000000..d1100a9 --- /dev/null +++ b/src/main/java/eshore/cn/it/sentiment/ChinesePolarityBasic.java @@ -0,0 +1,146 @@ +package eshore.cn.it.sentiment; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.List; + + + +import org.apache.commons.io.IOUtils; + +import com.aliasi.classify.Classification; +import com.aliasi.classify.Classified; +import com.aliasi.classify.DynamicLMClassifier; +import com.aliasi.lm.NGramProcessLM; +import com.aliasi.util.Files; +import com.huaban.analysis.jieba.JiebaSegmenter; +import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; +import com.huaban.analysis.jieba.SegToken; + +/** + * ChinesePolarityBasic 此类是利用lingpipe作中文情感预测的示例类 + * lingpipe适合做增量分析 + * @clebeg 2015-03-13 + * @version 0.0.1 + * */ +public class ChinesePolarityBasic { + private String[] mCategories = new String[]{"+1", "-1"}; + //这就是分类模型 + private DynamicLMClassifier mClassifier; + + private int numTests = 0; + private int numCorrect = 0; + private static final String TRAINFILES_INFO = + "data/polarity_corpus/hotel_reviews/train2.rlabelclass"; + private static final String TRAINFILES_DIR = + "data/polarity_corpus/hotel_reviews/train2"; + private static final String TESTFILES_DIR = + "data/polarity_corpus/hotel_reviews/test2"; + private static final String TESTFILES_INFO = + "data/polarity_corpus/hotel_reviews/test2.rlabelclass"; + private static final String ENCODING = "GBK"; + + private final JiebaSegmenter jiebaSegmenter = new JiebaSegmenter(); + private final SegMode segMode = SegMode.INDEX; + + public static void main(String[] args) { + try { + new ChinesePolarityBasic().run(); + } catch (Throwable t) { + 
System.out.println("Thrown: " + t); + t.printStackTrace(System.out); + } + + } + + + public ChinesePolarityBasic() { + super(); + int nGram = 8; + mClassifier + = DynamicLMClassifier + .createNGramProcess(mCategories,nGram); + } + + private void run() throws ClassNotFoundException, + IOException { + train(); + evaluate(); + } + + private void train() throws IOException { + FileReader input = new FileReader(new File(TRAINFILES_INFO)); + List trainInfos = IOUtils.readLines(input); + for (String str : trainInfos){ + String[] train = str.split(" "); + train(train[1], new File(TRAINFILES_DIR, train[0]), ENCODING); + } + } + + private void evaluate() throws IOException { + FileReader input = new FileReader(new File(TESTFILES_INFO)); + List trainInfos = IOUtils.readLines(input); + for (String str : trainInfos){ + String[] train = str.split(" "); + evaluate(train[1], new File(TESTFILES_DIR, train[0]), ENCODING); + } + System.out.println(" # Test Cases=" + + numTests); + System.out.println(" # Correct=" + + numCorrect); + System.out.println(" % Correct=" + + ((double)numCorrect) + /(double)numTests); + } + + /** + * 给定分类标识,给定训练文本,给定文本的编码,即可作分类训练 + * 分类完成之后就会加入到分类模型中 + * @throws IOException + * */ + private void train(String category, File trainFile, String fileEncoding) + throws IOException { + Classification classification = new Classification(category); + String review = Files.readFromFile(trainFile, fileEncoding); + + //此处加入中文分词器,得到分词之后的字符串 + List segTokens = jiebaSegmenter.process(review, segMode); + review = ""; + for (SegToken seg : segTokens) { + review += seg.word.getToken() + " "; + } + + Classified classified + = new Classified(review,classification); + mClassifier.handle(classified); + } + + /** + * 给定分类标识,给定测试文本,给定文本的编码,即可作测试模型 + * @throws IOException + * */ + private void evaluate(String category, File testFile, String fileEncoding) + throws IOException { + + String review + = Files.readFromFile(testFile, fileEncoding); + //同理,这里可以加入分词器,这样可以试试效果如何。 + List 
segTokens = jiebaSegmenter.process(review, segMode); + review = ""; + for (SegToken seg : segTokens) { + review += seg.word.getToken() + " "; + } + + ++numTests; + Classification classification + = mClassifier.classify(review); + //得到训练结果 + String resultCategory + = classification.bestCategory(); + if (resultCategory.equals(category)) + ++numCorrect; + + } + +} diff --git a/src/main/java/eshore/cn/it/sentiment/PolarityBasic.java b/src/main/java/eshore/cn/it/sentiment/PolarityBasic.java new file mode 100644 index 0000000..36fad6b --- /dev/null +++ b/src/main/java/eshore/cn/it/sentiment/PolarityBasic.java @@ -0,0 +1,94 @@ +package eshore.cn.it.sentiment; + +import java.io.File; +import java.io.IOException; + +import com.aliasi.classify.Classification; +import com.aliasi.classify.Classified; +import com.aliasi.classify.DynamicLMClassifier; +import com.aliasi.lm.NGramProcessLM; +import com.aliasi.util.Files; + +public class PolarityBasic { + File mPolarityDir; + String[] mCategories; + DynamicLMClassifier mClassifier; + + public PolarityBasic(String[] args) { + mPolarityDir = new File("data/polarity_corpus","txt_sentoken"); + mCategories = mPolarityDir.list(); + int nGram = 8; + mClassifier + = DynamicLMClassifier + .createNGramProcess(mCategories,nGram); + } + + + public static void main(String[] args) { + try { + new PolarityBasic(args).run(); + } catch (Throwable t) { + System.out.println("Thrown: " + t); + t.printStackTrace(System.out); + } + } + + private void run() throws ClassNotFoundException, + IOException { + train(); + evaluate(); + } + private void train() throws IOException { + for (int i = 0; i < mCategories.length; ++i) { + String category = mCategories[i]; + Classification classification + = new Classification(category); + File dir = new File(mPolarityDir, mCategories[i]); + File[] trainFiles = dir.listFiles(); + for (int j = 0; j < trainFiles.length; ++j) { + File trainFile = trainFiles[j]; + if (isTrainingFile(trainFile)) { + String review + = 
package eshore.cn.it.sentiment;

import java.io.File;
import java.io.IOException;

import com.aliasi.classify.Classification;
import com.aliasi.classify.Classified;
import com.aliasi.classify.DynamicLMClassifier;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.util.Files;

/**
 * PolarityBasic: LingPipe's classic movie-review polarity demo. It trains an
 * 8-gram character language-model classifier on folds 0-8 of
 * data/polarity_corpus/txt_sentoken (one category per sub-directory) and
 * evaluates on fold 9.
 */
public class PolarityBasic {
    File mPolarityDir;
    String[] mCategories;                              // one category per sub-directory
    DynamicLMClassifier<NGramProcessLM> mClassifier;

    public PolarityBasic(String[] args) {
        mPolarityDir = new File("data/polarity_corpus", "txt_sentoken");
        mCategories = mPolarityDir.list();
        int nGram = 8;
        mClassifier = DynamicLMClassifier.createNGramProcess(mCategories, nGram);
    }

    public static void main(String[] args) {
        try {
            new PolarityBasic(args).run();
        } catch (Throwable t) {
            System.out.println("Thrown: " + t);
            t.printStackTrace(System.out);
        }
    }

    private void run() throws ClassNotFoundException, IOException {
        train();
        evaluate();
    }

    /** Trains on every file of every category except the held-out fold. */
    private void train() throws IOException {
        for (String category : mCategories) {
            Classification classification = new Classification(category);
            File dir = new File(mPolarityDir, category);
            for (File trainFile : dir.listFiles()) {
                if (isTrainingFile(trainFile)) {
                    String review = Files.readFromFile(trainFile, "ISO-8859-1");
                    Classified<CharSequence> classified =
                            new Classified<CharSequence>(review, classification);
                    mClassifier.handle(classified);
                }
            }
        }
    }

    /** Fold is encoded in the third character of the file name; fold 9 is held out. */
    boolean isTrainingFile(File file) {
        return file.getName().charAt(2) != '9'; // test on fold 9
    }

    /** Classifies every held-out file and prints simple accuracy. */
    void evaluate() throws IOException {
        int numTests = 0;
        int numCorrect = 0;
        for (String category : mCategories) {
            File dir = new File(mPolarityDir, category);
            for (File testFile : dir.listFiles()) {
                if (!isTrainingFile(testFile)) {
                    String review = Files.readFromFile(testFile, "ISO-8859-1");
                    ++numTests;
                    Classification classification = mClassifier.classify(review);
                    String resultCategory = classification.bestCategory();
                    if (resultCategory.equals(category))
                        ++numCorrect;
                }
            }
        }
        System.out.println(" # Test Cases=" + numTests);
        System.out.println(" # Correct=" + numCorrect);
        System.out.println(" % Correct=" + ((double) numCorrect) / (double) numTests);
    }
}

package eshore.cn.it.sentiment;

import java.io.*;
import java.util.*;

import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

/**
 * Sentiment builds a naive-Bayes style polarity model from dictionary-matched
 * sentiment words found in segmented blog XML (data/500trainblogxml). Per
 * class (negative/positive/unsure) it records the document count and a
 * word-frequency table, then serializes both plus human-readable weight files
 * under Model/. SentimentTest later reads this model back.
 */
public class Sentiment {

    // Sentiment lexica loaded from disk (positive / negative word lists).
    static private HashSet<String> Negative, Positive;
    // Document count per sentiment class - part of the persisted model.
    static private Integer NegativeDoc, PositiveDoc, UnsureDoc;
    // Word -> frequency table per sentiment class - part of the persisted model.
    static private Hashtable<String, Integer> NegativeWeight, PositiveWeight, UnsureWeight;

    static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/";
    static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt";
    static final String NEGATIVE_DIC_PATH = "data/Sentiment_Dictionary/negative_submit.txt";

    static final String FILE_ENCODING = "UTF-8";

    public static void main(String[] args) throws Exception {
        Sentiment sentiment = new Sentiment();
        sentiment.Model();
        sentiment.Save_Model();
    }

    /** Loads the dictionaries, then scans the corpus to build the model tables. */
    public void Model() throws Exception {
        this.Read_Sentiment_Dictionary();
        this.Sentiment_Doc_Weight(SENTIMENT_DOC_WEIGHT_PATH);
    }

    /** Loads the positive and negative word lists (one word per line, UTF-8). */
    public void Read_Sentiment_Dictionary() throws Exception {
        Negative = readDictionary(NEGATIVE_DIC_PATH);
        Positive = readDictionary(POSITIVE_DIC_PATH);
    }

    /** Reads one dictionary file into a set; the reader is always closed. */
    private static HashSet<String> readDictionary(String path) throws IOException {
        HashSet<String> words = new HashSet<String>();
        try (BufferedReader buf = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), FILE_ENCODING))) {
            String str;
            while ((str = buf.readLine()) != null) {
                words.add(str);
            }
        }
        return words;
    }

    /**
     * Walks the three category sub-directories (negativeout / positiveout /
     * unsureout) under DirPath, extracting dictionary-matched words from every
     * XML file, and builds the per-class frequency tables and document counts.
     */
    public void Sentiment_Doc_Weight(String DirPath) throws Exception {

        ArrayList<String> NegativeCurrentList = new ArrayList<String>();
        NegativeDoc = collectCategory(DirPath, "negativeout", "NegativeFiles", NegativeCurrentList);
        NegativeWeight = HashTable(NegativeCurrentList);

        ArrayList<String> PositiveCurrentList = new ArrayList<String>();
        PositiveDoc = collectCategory(DirPath, "positiveout", "PositiveFiles", PositiveCurrentList);
        PositiveWeight = HashTable(PositiveCurrentList);

        ArrayList<String> UnsureCurrentList = new ArrayList<String>();
        UnsureDoc = collectCategory(DirPath, "unsureout", "UnsureFiles", UnsureCurrentList);
        UnsureWeight = HashTable(UnsureCurrentList);

        System.out.println("UnsureCurrent = " + UnsureCurrentList.size() + "  UnsureHashTable = " + UnsureWeight.size());
        System.out.println("PositiveCurrent = " + PositiveCurrentList.size() + "  PositiveHashTable = " + PositiveWeight.size());
        System.out.println("NegativeCurrent = " + NegativeCurrentList.size() + "  NegativeHashTable = " + NegativeWeight.size());
        System.out.println("NegativeDoc = " + NegativeDoc + "  PositiveDoc = " + PositiveDoc + "  UnsureDoc = " + UnsureDoc);
    }

    /**
     * Parses every XML file of one category sub-directory into currentList and
     * returns the number of documents seen (shared by all three categories).
     */
    private int collectCategory(String dirPath, String subDir, String label,
                                ArrayList<String> currentList) throws Exception {
        File dir = new File(dirPath + subDir);
        String[] files = dir.list();
        for (int i = 0; i < files.length; i++) {
            System.out.println(label + " No." + (i + 1) + " " + dirPath + subDir + "/" + files[i]);
            this.ReadXML(dirPath + subDir + "/" + files[i], currentList);
        }
        return files.length;
    }

    /**
     * Reads one segmented-blog XML file and appends its sentiment words to
     * currentList. Only "group" tokens can contain dictionary words; a lone
     * "atom" is never a sentiment word in this corpus.
     */
    public void ReadXML(String FilePath, ArrayList<String> currentList) throws Exception {

        SAXReader SaxReader = new SAXReader();
        Document Doc = SaxReader.read(new File(FilePath));
        Element root = Doc.getRootElement();

        Element content = root.element("content");
        List sentenses = content.elements("sentence"); // one entry per sentence

        for (Iterator iter = sentenses.iterator(); iter.hasNext(); ) {
            Element sentense = (Element) iter.next();

            List toks = sentense.elements();
            for (Iterator iter1 = toks.iterator(); iter1.hasNext(); ) {
                Element tok = (Element) iter1.next();
                String Type = tok.attributeValue("type");

                if (Type.equals("group")) {
                    GetWord(tok, currentList); // recurse into the "group" node
                }
            }
        }
    }

    /**
     * Recursively concatenates the "atom" children of a "group" token and keeps
     * the resulting word if it appears in either sentiment dictionary.
     */
    public void GetWord(Element root, ArrayList<String> currentList) {

        String Word = "";
        List elements = root.elements("tok");
        for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
            Element tok = (Element) iter.next();
            String Type = tok.attributeValue("type");

            if (Type.compareTo("atom") == 0) {
                Word += tok.getText().trim();
            } else {
                GetWord(tok, currentList); // nested group: recurse
            }
        }
        // Keep only multi-character words that the dictionaries recognize.
        if (Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word))) {
            currentList.add(Word);
        }
    }

    /** Builds a word -> occurrence-count table from the extracted word list. */
    public Hashtable<String, Integer> HashTable(ArrayList<String> currentList) {

        Hashtable<String, Integer> table = new Hashtable<String, Integer>();
        for (Iterator<String> iter = currentList.iterator(); iter.hasNext(); ) {
            String word = iter.next();
            Integer weight = table.get(word);
            table.put(word, weight == null ? 1 : weight + 1);
        }
        return table;
    }

    /**
     * Serializes the three (docCount, weightTable) pairs under Model/ and also
     * writes human-readable weight listings. All streams are closed via
     * try-with-resources; the original leaked every ObjectOutputStream, which
     * risks truncated model files (no flush/close).
     */
    public void Save_Model() throws Exception {

        File ModelPath = new File("Model");
        if (!ModelPath.exists()) { ModelPath.mkdir(); }

        System.out.println("Saving NegativeModel...");
        saveModel(new File(ModelPath, "NegativeModel.txt"), NegativeDoc, NegativeWeight);

        System.out.println("Saving PositiveModel...");
        saveModel(new File(ModelPath, "PositiveModel.txt"), PositiveDoc, PositiveWeight);

        System.out.println("Saving UnsureModel...");
        saveModel(new File(ModelPath, "UnsureModel.txt"), UnsureDoc, UnsureWeight);

        System.out.println("Saving NegativeWeight...");
        saveWeights("NegativeWeight.txt", NegativeWeight);
        System.out.println("Saving PositiveWeight...");
        saveWeights("PositiveWeight.txt", PositiveWeight);
        System.out.println("Saving UnsureWeight...");
        saveWeights("UnsureWeight.txt", UnsureWeight);

        System.out.println("Save Success!");
    }

    /** Writes one (docCount, weightTable) pair with plain Java serialization. */
    private static void saveModel(File target, Integer docCount,
                                  Hashtable<String, Integer> weights) throws IOException {
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(target))) {
            oos.writeObject(docCount);
            oos.writeObject(weights);
        }
    }

    /**
     * Appends a "word\t\t\tcount\r\n" listing to Model/fileName. Builds the
     * whole text first and appends once, instead of reopening the file for
     * every key as the original did.
     */
    private static void saveWeights(String fileName,
                                    Hashtable<String, Integer> weights) throws IOException {
        StringBuilder sb = new StringBuilder();
        Enumeration<String> keys = weights.keys();
        while (keys.hasMoreElements()) {
            String key = keys.nextElement();
            sb.append(key).append("\t\t\t").append(weights.get(key)).append("\r\n");
        }
        FileUtils.writeStringToFile(new File("Model", fileName), sb.toString(), "UTF-8", true);
    }
}
OOS.writeObject(UnsureDoc); + OOS.writeObject(UnsureWeight); + + Enumeration Keys; + System.out.println("Saving NegativeWeight..."); + Keys = NegativeWeight.keys(); + while( Keys.hasMoreElements() ) { + String Key = Keys.nextElement(); + FileUtils.writeStringToFile(new File("Model", "NegativeWeight.txt"), Key+"\t\t\t"+NegativeWeight.get(Key)+"\r\n", "UTF-8", true); + } + System.out.println("Saving PositiveWeight..."); + Keys = PositiveWeight.keys(); + while( Keys.hasMoreElements() ) { + String Key = Keys.nextElement(); + FileUtils.writeStringToFile(new File("Model", "PositiveWeight.txt"), Key+"\t\t\t"+PositiveWeight.get(Key)+"\r\n", "UTF-8", true); + } + System.out.println("Saving UnsureWeight..."); + Keys = UnsureWeight.keys(); + while( Keys.hasMoreElements() ) { + String Key = Keys.nextElement(); + FileUtils.writeStringToFile(new File("Model", "UnsureWeight.txt"), Key+"\t\t\t"+UnsureWeight.get(Key)+"\r\n", "UTF-8", true); + } + + System.out.println("Save Success!"); + } +} \ No newline at end of file diff --git a/src/test/java/eshore/cn/it/sentiment/SentimentTest.java b/src/test/java/eshore/cn/it/sentiment/SentimentTest.java new file mode 100644 index 0000000..283d91f --- /dev/null +++ b/src/test/java/eshore/cn/it/sentiment/SentimentTest.java @@ -0,0 +1,244 @@ +package eshore.cn.it.sentiment; + + +import java.io.*; +import java.util.*; + +import org.apache.commons.io.FileUtils; +import org.dom4j.Document; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; + + +public class SentimentTest { + + static private HashSet Negative, Positive; //两种情感词典 + static private Integer NegativeDoc, PositiveDoc, UnsureDoc; //属于两种情感的文本数 - 所构建模型需要保存下的值 + static private Hashtable NegativeWeight, PositiveWeight, UnsureWeight; //两种情感中所有词与他的权值 - 所构建模型需要保存下的值 + + static final String SENTIMENT_DOC_WEIGHT_PATH = "data/500trainblogxml/negativeout/"; + static final String POSITIVE_DIC_PATH = "data/Sentiment_Dictionary/positive_submit.txt"; + static final String NEGATIVE_DIC_PATH = 
"data/Sentiment_Dictionary/negative_submit.txt"; + + static final String FILE_ENCODING = "UTF-8"; + + public static void main(String[] args) throws Exception { + // TODO 自动生成的方法存根 + SentimentTest Sentiment_Test = new SentimentTest(); + + Sentiment_Test.Read_Model(); //读取模型 + Sentiment_Test.Classify_Directory(SENTIMENT_DOC_WEIGHT_PATH); + } + + @SuppressWarnings({ "resource", "unchecked" }) + public void Read_Model() throws Exception { + + this.Read_Sentiment_Dictionary(); + + ObjectInputStream OIS; //对象流直接读入 + File ModelPath = new File("Model"); + File NegativeModel = new File(ModelPath, "NegativeModel.txt"); + File PositiveModel = new File(ModelPath, "PositiveModel.txt"); + File UnsureModel = new File(ModelPath, "UnsureModel.txt"); + + System.out.println("Reading NegativeModel..."); + OIS = new ObjectInputStream( new FileInputStream( NegativeModel ) ); + NegativeDoc = (Integer) OIS.readObject(); + NegativeWeight = (Hashtable) OIS.readObject(); + + System.out.println("Reading PositiveModel..."); + OIS = new ObjectInputStream( new FileInputStream( PositiveModel ) ); + PositiveDoc = (Integer) OIS.readObject(); + PositiveWeight = (Hashtable) OIS.readObject(); + + System.out.println("Reading UnsureModel..."); + OIS = new ObjectInputStream( new FileInputStream( UnsureModel ) ); + UnsureDoc = (Integer) OIS.readObject(); + UnsureWeight = (Hashtable) OIS.readObject(); + + System.out.println("Read Success."); + } + + @SuppressWarnings("resource") + public void Read_Sentiment_Dictionary( ) throws Exception { //读入情感词典 + BufferedReader buf; + String str; + + Negative = new HashSet(); + buf = new BufferedReader( new InputStreamReader(new FileInputStream(NEGATIVE_DIC_PATH), FILE_ENCODING) ); + while( (str = buf.readLine()) != null ) { + Negative.add(str); + } + + Positive = new HashSet(); + buf = new BufferedReader( new InputStreamReader(new FileInputStream(POSITIVE_DIC_PATH), FILE_ENCODING) ); + while( (str = buf.readLine()) != null ) { + Positive.add(str); + } + } + + public 
void Classify_Directory( String DirectoryPath ) throws Exception { + + int PositiveNum = 0, NegativeNum = 0, UnsureNum = 0; + String[] Text_Path = new File( DirectoryPath ).list(); + + for ( int i = 0; i < Text_Path.length; i ++ ) { + + Classify( DirectoryPath+Text_Path[i] ); + double Ans = Classify( DirectoryPath+Text_Path[i] ); //对当前目录下的每一个文件进行测试 + if ( Ans < 0 ) { //根据测试结果将测试文本进行分类 + FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Positive"), Text_Path[i])); + PositiveNum ++; + } + else if ( Ans > 0 ) { + FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Negative"), Text_Path[i])); + NegativeNum ++; + } + else { + FileUtils.copyFile(new File(DirectoryPath+Text_Path[i]), new File( new File("Result", "Unsure"), Text_Path[i])); + UnsureNum ++; + } + System.out.print( "No." + (i+1) + " " + Text_Path[i] + ": " ); + if ( Ans < 0 ) { System.out.println("Positive"); } + else if ( Ans > 0 ) { System.out.println("Negative"); } + else { System.out.println("Unsure"); } + } + System.out.println("End."); + System.out.println("NegativeNum = " + NegativeNum + " PositiveNum = " + PositiveNum + " UnsureNum = " + UnsureNum); + } + + public double Classify( String FilePath ) throws Exception { + + Hashtable FileHashTable = Read_TestFile( FilePath ); + + Enumeration Keys; + double NegativeAns = 1, PositiveAns = 1; + + Keys = FileHashTable.keys(); + while( Keys.hasMoreElements() ) { + String Word = Keys.nextElement(); + NegativeAns *= ( Math.pow(this.PostProbability(Word, NegativeWeight), FileHashTable.get(Word)) ); + } + NegativeAns *= this.PriorProbability(NegativeDoc); + + Keys = FileHashTable.keys(); + while( Keys.hasMoreElements() ) { + String Word = Keys.nextElement(); + PositiveAns *= ( Math.pow(this.PostProbability(Word, PositiveWeight), FileHashTable.get(Word)) ); + } + PositiveAns *= this.PriorProbability(PositiveDoc); + + return ( NegativeAns-PositiveAns ); + } + + public Hashtable Read_TestFile( 
String FilePath ) throws Exception { + + ArrayList FileCurrentList = new ArrayList(); + ReadXML( FilePath, FileCurrentList ); + Hashtable FileHashTable = HashTable( FileCurrentList ); + + return FileHashTable; + } + + public void ReadXML( String FilePath, ArrayList currentList ) throws Exception { //从指定路径读取XML文件并提取出其情感词返回 + + SAXReader SaxReader = new SAXReader(); + Document Doc = SaxReader.read(new File(FilePath)); + Element root = Doc.getRootElement(); + + Element content = root.element("content"); + List sentenses = content.elements("sentence"); //每一句话作为一项 + + for ( Iterator iter = sentenses.iterator(); iter.hasNext(); ) { + Element sentense = (Element)iter.next(); + + List toks = sentense.elements(); + for ( Iterator iter1 = toks.iterator(); iter1.hasNext(); ) { + Element tok = (Element)iter1.next(); + String Type = tok.attributeValue("type"); + + if ( Type.equals("group") ) { //如果是"atom"一定不存在于情感词中 + GetWord( tok, currentList ); //从"group"中获取词 + } + } + } + } + + public void GetWord( Element root, ArrayList currentList ) { //获取XML中的情感词 + + String Word = ""; + List elements = root.elements("tok"); + for ( Iterator iter = elements.iterator(); iter.hasNext(); ) { + Element tok = (Element)iter.next(); + String Type = tok.attributeValue("type"); + + if ( Type.compareTo("atom") == 0 ) { + Word += tok.getText().trim(); + } + else { + GetWord( tok, currentList ); + } + } + if ( Word.length() > 1 && (Positive.contains(Word) || Negative.contains(Word)) ) { //筛选出情感词 + currentList.add(Word); + } + } + + public Hashtable HashTable( ArrayList currentList ) { //根据文本中的情感词构建哈希表 + + Hashtable HashTable = new Hashtable(); + + for ( Iterator iter = currentList.iterator(); iter.hasNext(); ) { + String Word = (String)iter.next(); + if ( HashTable.containsKey(Word) ) { + Integer Weight = HashTable.get(Word); + HashTable.put(Word, Weight+1); + } + else { + HashTable.put(Word, 1); + } + } + return HashTable; + } + + public double PriorProbability( Integer SentimentDoc ) { + + double 
Ans = 1; + + Ans = ( (double)SentimentDoc/( (double)NegativeDoc+(double)PositiveDoc+(double)UnsureDoc ) ); + + return Ans; + } + + public double PostProbability( String Word, Hashtable SentimentWeight ) { + + double Ans, V, E; + double Weight = 0, Weights = 0; + + if ( SentimentWeight.containsKey(Word) ) + Weight = (double)SentimentWeight.get(Word); + + Weights = PostWeights( SentimentWeight ); + + V = PostWeights( NegativeWeight ) + PostWeights( PositiveWeight ) + PostWeights( UnsureWeight ); + E = 1/Math.abs(V); + + Ans = ( Weight + E )/( Weights + E*Math.abs(V) ); + + return Ans; + } + + public double PostWeights( Hashtable SentimentWeight ) { + + double Weights = 0; + + Enumeration Keys; + Keys = SentimentWeight.keys(); + while( Keys.hasMoreElements() ) { + String Key = Keys.nextElement(); + Weights += (double)SentimentWeight.get(Key); + } + + return Weights; + } +} \ No newline at end of file