@@ -1,5 +1,5 @@
 # Jiagu自然语言处理工具
->>> Jiagu以BiLSTM等模型为基础,使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家。
+>>> Jiagu以BiLSTM等模型为基础,使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家。
 
 ## 目录
 * [安装方式](#安装方式)
@@ -17,6 +17,8 @@
 * 关键词提取
 * 文本摘要
 * 新词发现
+* 情感分析
+* 文本聚类
 * 等等。。。。
 
 ---
@@ -42,7 +44,7 @@ import jiagu
 text = '厦门明天会不会下雨'
-words = jiagu.seg(text) # 分词
+words = jiagu.cut(text) # 分词
 print(words)
 pos = jiagu.pos(words) # 词性标注
@@ -54,35 +56,38 @@ print(ner)
 2. 中文分词
-分词各种模式使用方式
+自定义分词模型(将单独提供msr、pku、cnc等分词标准)
 ```python3
 import jiagu
-text = '汉服和服装'
+# 独立标准模型路径
+# msr:test/extra_data/model/msr.model
+# pku:test/extra_data/model/pku.model
+# cnc:test/extra_data/model/cnc.model
-words = jiagu.seg(text) # 默认分词
-print(words)
+jiagu.load_model('test/extra_data/model/cnc.model') # 使用国家语委分词标准
-words = jiagu.seg([text, text, text], input='batch') # 批量分词,加快速度。
-print(words)
+words = jiagu.cut('结婚的和尚未结婚的')
-words = jiagu.seg(text, model='mmseg') # 使用mmseg算法进行分词
-print(list(words))
+print(words)
 ```
-自定义分词模型(将单独提供msr、pku、cnc等分词标准)
+分词各种模式使用方式
 ```python3
 import jiagu
-# 独立标准模型路径
-# msr:test/extra_data/model/msr.model
-# pku:test/extra_data/model/pku.model
-# cnc:test/extra_data/model/cnc.model
+text = '汉服和服装、知识图谱机器人'
-jiagu.load_model('test/extra_data/model/cnc.model') # 使用国家语委分词标准
+words = jiagu.cut(text) # 默认分词
+print(words)
+words = jiagu.seg(text) # 字典分词
+print(words)
-words = jiagu.seg('结婚的和尚未结婚的')
+# jiagu.load_userdict('dict/user.dict') # 加载自定义字典,支持字典路径、字典列表形式。
+jiagu.load_userdict(['知识图谱'])
+words = jiagu.seg(text) # 自定义分词,字典分词模式有效
 print(words)
 ```
@@ -132,8 +137,31 @@
 jiagu.findword('input.txt', 'output.txt') # 根据文本,利用信息熵做新词发现。
 ```
+
+7. 情感分析
+```python3
+text = '很讨厌还是个懒鬼'
+sentiment = jiagu.sentiment(text)
+print(sentiment)
+```
+
+8. 文本聚类
+```python3
+docs = [
+    "百度深度学习中文情感分析工具Senta试用及在线测试",
+    "情感分析是自然语言处理里面一个热门话题",
+    "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
+    "深度学习实践:从零开始做电影评论文本情感分析",
+    "BERT相关论文、文章和代码资源汇总",
+    "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
+    "自然语言处理工具包spaCy介绍",
+    "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
+]
+cluster = jiagu.text_cluster(docs)
+print(cluster)
+```
 
 ### 评价标准
-1. msr测试结果
+1. msr测试结果(旧版本)
 
 
@@ -5,18 +5,29 @@ import jiagu
 text = '厦门明天会不会下雨'
-words = jiagu.seg(text) # 分词,可以用model选择分词模式,不填则默认,mmseg则使用mmseg算法。
+words = jiagu.cut(text) # 分词
 print(words)
-# words = jiagu.seg(text, model="mmseg") # mmseg 分词得到generator,需要用list进行转换
-# print(list(words))
 pos = jiagu.pos(words) # 词性标注
 print(pos)
 ner = jiagu.ner(text) # 命名实体识别
 print(ner)
+# 字典模式分词
+text = '知识图谱机器人'
+words = jiagu.seg(text)
+print(words)
+
+# jiagu.load_userdict('dict/user.dict') # 加载自定义字典,支持字典路径、字典列表形式。
+jiagu.load_userdict(['知识图谱'])
+
+words = jiagu.seg(text)
+print(words)
 text = '''
 该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”
 NASA埃姆斯研究中心的科学家拉玛·内曼尼(Rama Nemani)说,“这一长期数据能让我们深入分析地表绿化背后的影响因素。我们一开始以为,植被增加是由于更多二氧化碳排放,导致气候更加温暖、潮湿,适宜生长。”
@@ -25,17 +36,38 @@ NASA文章介绍,在中国为全球绿化进程做出的贡献中,有42%来
 据观察者网过往报道,2017年我国全国共完成造林736.2万公顷、森林抚育830.2万公顷。其中,天然林资源保护工程完成造林26万公顷,退耕还林工程完成造林91.2万公顷。京津风沙源治理工程完成造林18.5万公顷。三北及长江流域等重点防护林体系工程完成造林99.1万公顷。完成国家储备林建设任务68万公顷。
 '''
-keywords = jiagu.keywords(text, 5) # 关键词
+keywords = jiagu.keywords(text, 5) # 关键词抽取
 print(keywords)
-summarize = jiagu.summarize(text, 3) # 摘要
+summarize = jiagu.summarize(text, 3) # 文本摘要
 print(summarize)
-# iagu.findword('input.txt', 'output.txt') # 根据大规模语料,利用信息熵做新词发现。
+# jiagu.findword('input.txt', 'output.txt') # 根据大规模语料,利用信息熵做新词发现。
+
+# 知识图谱关系抽取
 text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
 knowledge = jiagu.knowledge(text)
 print(knowledge)
+
+# 情感分析
+text = '很讨厌还是个懒鬼'
+sentiment = jiagu.sentiment(text)
+print(sentiment)
+
+# 文本聚类(需要调参)
+docs = [
+    "百度深度学习中文情感分析工具Senta试用及在线测试",
+    "情感分析是自然语言处理里面一个热门话题",
+    "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
+    "深度学习实践:从零开始做电影评论文本情感分析",
+    "BERT相关论文、文章和代码资源汇总",
+    "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
+    "自然语言处理工具包spaCy介绍",
+    "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
+]
+cluster = jiagu.text_cluster(docs)
+print(cluster)
@@ -15,7 +15,7 @@ any = analyze.Analyze()
 init = any.init
 
 # 分词
-seg = any.cws
+seg = any.seg
 cws = any.cws
 cut = any.cws
@@ -29,7 +29,7 @@ ner = any.ner
 # parser
 
 # 加载用户字典
-# load_userdict
+load_userdict = any.load_userdict
 
 # 自定义分词模型
 load_model = any.load_model
@@ -44,4 +44,10 @@ summarize = any.summarize
 findword = any.findword
 
 # 知识图谱
-knowledge = any.knowledge
+knowledge = any.knowledge
+
+# 情感分析
+sentiment = any.sentiment
+
+# 文本聚类
+text_cluster = any.text_cluster
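After this aliasing change, `jiagu.cut` and `jiagu.cws` both point at the BiLSTM model segmenter, while `jiagu.seg` now routes to the new dictionary-based `Segment` (nroute), and `load_userdict` only affects that dictionary mode (per the README comment "自定义分词,字典分词模式有效"). A minimal sketch of how the entry points differ, based only on the aliases above (outputs omitted):

```python3
import jiagu

text = '汉服和服装'

print(jiagu.cut(text))   # BiLSTM model segmentation, same callable as jiagu.cws
print(jiagu.seg(text))   # dictionary + max-probability-path segmentation (nroute)

jiagu.load_userdict(['汉服'])  # only influences the dictionary mode, i.e. jiagu.seg
print(jiagu.seg(text))
```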
@@ -14,7 +14,9 @@ from jiagu import findword
 from jiagu import bilstm_crf
 from jiagu.textrank import Keywords
 from jiagu.textrank import Summarize
+from jiagu.segment.nroute import Segment
+from jiagu.sentiment.bayes import Bayes
+from jiagu.cluster.text import text_cluster as cluster
 
 def add_curr_dir(name):
     return os.path.join(os.path.dirname(__file__), name)
@@ -32,11 +34,19 @@ class Analyze(object):
         self.keywords_model = None
         self.summarize_model = None
 
+        self.seg_nroute = Segment()
+        self.sentiment_model = Bayes()
+
     def init(self):
         self.init_cws()
         self.init_pos()
         self.init_ner()
+        self.seg_nroute.init()
+
+    def load_userdict(self, userdict):
+        self.seg_nroute.load_userdict(userdict)
 
     def init_cws(self):
         if self.seg_model is None:
@@ -99,6 +109,9 @@ class Analyze(object):
             sent_words.append(self.__lab2word(text, seg_labels))
         return sent_words
 
+    def seg(self, sentence):
+        return self.seg_nroute.seg(sentence, mode="default")
+
     def cws(self, sentence, input='text', model='default'):
         """中文分词
@@ -171,9 +184,17 @@ class Analyze(object):
             self.summarize_model = Summarize(tol=0.0001)
         return self.summarize_model.summarize(text, topsen)
 
-    def findword(self, input, output):
-        findword.new_word_find(input, output)
+    def findword(self, input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
+        findword.new_word_find(input_file, output_file, min_freq, min_mtro, min_entro)
+
+    def sentiment(self, text):
+        words = self.seg(text)
+        ret, prob = self.sentiment_model.classify(words)
+        return ret, prob
+
+    def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
+        return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
 
     def lab2spo(self, text, epp_labels):
         subject_list = []  # 存放实体的列表
         object_list = []
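The new `sentiment` method segments the input with the dictionary segmenter (`self.seg`) and hands the word list to the Bayes model, so the module-level `jiagu.sentiment` returns a `(label, probability)` pair; `text_cluster` simply forwards to the cluster package with `self.seg` as the tokenizer (a tuning sketch follows the cluster/text hunk further down). A minimal sentiment call, reusing the README sample text:

```python3
import jiagu

label, prob = jiagu.sentiment('很讨厌还是个懒鬼')  # returns (class label, probability)
print(label, prob)
```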
@@ -1,5 +1,4 @@
 # -*-coding:utf-8-*-
-import jiagu
 from collections import Counter
 import numpy as np
@@ -10,7 +9,7 @@ def elu_distance(a, b):
     return dist
 
-def count_features(corpus, tokenizer=jiagu.cut):
+def count_features(corpus, tokenizer=list):
     """词频特征
 
     :param corpus: list of str
@@ -36,7 +35,7 @@ def count_features(corpus, tokenizer=jiagu.cut):
     return np.array(features), vocab
 
-def tfidf_features(corpus, tokenizer=jiagu.cut):
+def tfidf_features(corpus, tokenizer=list):
     """文本的 tfidf 特征
 
     :param corpus: list of str
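Dropping the module-level `import jiagu` appears to remove a circular import; with the new `tokenizer=list` default a plain string is split into single characters, which is only a fallback, since `Analyze.text_cluster` injects its own `self.seg` so features stay word-level. A small illustration of the difference (the commented call is hypothetical, assuming a caller has `jiagu` available):

```python3
# Character-level fallback when tokenizer is left as the default list:
print(list('知识图谱机器人'))   # ['知', '识', '图', '谱', '机', '器', '人']

# Word-level features when the caller injects a segmenter, as Analyze.text_cluster does:
# features, vocab = tfidf_features(docs, tokenizer=jiagu.seg)
```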
@@ -6,7 +6,8 @@ from .dbscan import DBSCAN
 from .kmeans import KMeans
 
-def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
+def text_cluster(docs, features_method='tfidf', method="dbscan",
+                 k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
     """文本聚类,目前支持 K-Means 和 DBSCAN 两种方法
 
     :param features_method: str
@@ -27,9 +28,9 @@ def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_it
     聚类结果
     """
     if features_method == 'tfidf':
-        features, names = tfidf_features(docs)
+        features, names = tfidf_features(docs, tokenizer)
     elif features_method == 'count':
-        features, names = count_features(docs)
+        features, names = count_features(docs, tokenizer)
     else:
         raise ValueError('features_method error')
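With the new signature, `k`, `eps` and `min_pts` have concrete defaults and the tokenizer is injectable, which is what the README's "需要调参" note refers to. A hedged tuning sketch calling the function directly; the parameter values are illustrative only, and the `'dbscan'`/`'k-means'` strings are taken from the defaults visible in this diff:

```python3
from jiagu.cluster.text import text_cluster

docs = [
    '百度深度学习中文情感分析工具Senta试用及在线测试',
    '情感分析是自然语言处理里面一个热门话题',
    '深度学习实践:从零开始做电影评论文本情感分析',
    '自然语言处理工具包spaCy介绍',
    '现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文',
]

# DBSCAN: cluster density is controlled by eps / min_pts
c1 = text_cluster(docs, method='dbscan', eps=0.5, min_pts=2, tokenizer=list)

# K-Means: the number of clusters k must be chosen explicitly
c2 = text_cluster(docs, method='k-means', k=2, tokenizer=list)
print(c1, c2)
```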
@@ -106,11 +106,7 @@ def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq,
     return entro_dict
 
-def new_word_find(input_file, output_file):
-    min_freq = 10
-    min_mtro = 80
-    min_entro = 3
+def new_word_find(input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
     word_freq = count_words(input_file)
     total_word = sum(word_freq.values())
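Promoting the three thresholds to keyword arguments lets callers trade recall for precision without editing the module. Judging from the names and the surrounding entropy code, `min_freq` bounds raw frequency, `min_mtro` the cohesion (mutual information) score, and `min_entro` the left/right boundary entropy, though that reading is an inference. Example call through the module-level alias, with placeholder threshold values:

```python3
import jiagu

# Stricter thresholds keep fewer, higher-confidence new-word candidates (illustrative values)
jiagu.findword('input.txt', 'output.txt', min_freq=20, min_mtro=100, min_entro=4)
```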
@@ -0,0 +1,2 @@
+思知
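For reference, the format this dictionary file follows can be read off the `load_vocab` parser in the segmentation module below: one entry per line, either a bare word or a word and an integer frequency separated by a tab. A hypothetical user dictionary might therefore look like:

```
思知
知识图谱	10
```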
@@ -0,0 +1,194 @@
+import re
+import os
+import sys
+from math import log
+
+re_eng = re.compile('[a-zA-Z0-9]', re.U)
+re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
+re_skip = re.compile("(\r\n|\s)", re.U)
+
+
+class Segment:
+    def __init__(self):
+        self.vocab = {}
+        self.max_word_len = 0
+        self.max_freq = 0
+        self.total_freq = 0
+        self.initialized = False
+
+    def init(self, vocab_path='dict/jiagu.dict'):
+        self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path))
+        self.initialized = True
+
+    def load_vocab(self, vocab_path):
+        fin = open(vocab_path, 'r', encoding='utf8')
+        for index, line in enumerate(fin):
+            line = line.strip()
+            if line == '':
+                continue
+            word_freq_tag = line.split('\t')
+            if len(word_freq_tag) == 1:
+                word = word_freq_tag[0]
+                self.add_vocab(word)
+            elif len(word_freq_tag) == 2:
+                word = word_freq_tag[0]
+                freq = int(word_freq_tag[1])
+                self.add_vocab(word, freq)
+        fin.close()
+
+    def add_vocab(self, word=None, freq=None, tag=None):
+        if freq == None:
+            freq = self.max_freq
+        if word not in self.vocab:
+            self.vocab[word] = 0
+        self.vocab[word] += freq
+        self.total_freq += freq
+        if freq > self.max_freq:
+            self.max_freq = freq
+        if len(word) > self.max_word_len:
+            self.max_word_len = len(word)
+
+    def load_userdict(self, userdict):
+        if self.initialized == False:
+            self.init()
+        if isinstance(userdict, str):
+            self.load_vocab(userdict)
+        else:
+            for item in userdict:
+                if isinstance(item, list):
+                    if len(item) == 1:
+                        word = item[0]
+                        self.add_vocab(word)
+                    elif len(item) == 2:
+                        word = item[0]
+                        freq = item[1]
+                        self.add_vocab(word, freq)
+                elif isinstance(item, str):
+                    self.add_vocab(word=item)
+
+    def calc_route(self, sentence, DAG, route):
+        vocab = self.vocab
+        N = len(sentence)
+        route[N] = (0, 0)
+        logtotal = log(self.total_freq)
+        for idx in range(N - 1, -1, -1):
+            route[idx] = max((log(vocab.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x) for x in DAG[idx])
+
+    def create_DAG(self, sentence):
+        vocab = self.vocab
+        max_word_len = self.max_word_len
+        DAG = {}
+        N = len(sentence)
+        for idx in range(N):
+            cand_idx = [idx]
+            for i in range(idx+1, idx + min(max_word_len, N - idx), 1):
+                cand = sentence[idx: i+1]
+                if cand in vocab:
+                    cand_idx.append(i)
+            DAG[idx] = cand_idx
+        return DAG
+
+    def cut_search(self, sentence):
+        DAG = self.create_DAG(sentence)
+        old_j = -1
+        for k, L in DAG.items():
+            if len(L) == 1 and k > old_j:
+                yield sentence[k:L[0] + 1]
+                old_j = L[0]
+            else:
+                for j in L:
+                    if j > k:
+                        yield sentence[k:j + 1]
+                        old_j = j
+
+    def cut_vocab(self, sentence):
+        DAG = self.create_DAG(sentence)
+        route = {}
+        self.calc_route(sentence, DAG, route)
+        x = 0
+        N = len(sentence)
+        buf = ''
+        while x < N:
+            y = route[x][1] + 1
+            l_word = sentence[x:y]
+            if buf:
+                yield buf
+                buf = ''
+            yield l_word
+            x = y
+        if buf:
+            yield buf
+            buf = ''
+
+    def cut_words(self, sentence):
+        DAG = self.create_DAG(sentence)
+        route = {}
+        self.calc_route(sentence, DAG, route)
+        x = 0
+        N = len(sentence)
+        buf = ''
+        while x < N:
+            y = route[x][1] + 1
+            l_word = sentence[x:y]
+            if re_eng.match(l_word) and len(l_word) == 1:
+                buf += l_word
+                x = y
+            else:
+                if buf:
+                    yield buf
+                    buf = ''
+                yield l_word
+                x = y
+        if buf:
+            yield buf
+            buf = ''
+
+    def seg_default(self, sentence, mode):
+        blocks = re_han.split(sentence)
+        cut_block = self.cut_words
+        cut_all = False
+        for block in blocks:
+            if not block:
+                continue
+            if re_han.match(block):
+                for word in cut_block(block):
+                    yield word
+            else:
+                tmp = re_skip.split(block)
+                for x in tmp:
+                    if re_skip.match(x):
+                        yield x
+                    elif not cut_all:
+                        for xx in x:
+                            yield xx
+                    else:
+                        yield x
+
+    def seg(self, sentence, mode="default"):
+        if self.initialized == False:
+            self.init()
+        return list(self.seg_default(sentence, mode=mode))
+
+
+if __name__=='__main__':
+    s = Segment()
+    # s.load_userdict('dict/user.dict')
+    s.load_userdict(['知识图谱'])
+
+    text = '知识图谱机器人hello\nworld¥¥'
+    words = s.seg(text)
+    print(words)
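The new `Segment` class is a dictionary-driven segmenter: `create_DAG` records, for every character position, which dictionary words start there, and `calc_route` runs a right-to-left dynamic program, `route[i] = max(log(freq(w)/total) + route[end(w)+1])`, so that `cut_words` walks the maximum-probability path; the regexes only split the input into segmentable runs versus whitespace and other residue. A self-contained walk-through with a tiny hypothetical vocabulary (the frequencies and the trick of setting `initialized` by hand are for illustration only):

```python3
from jiagu.segment.nroute import Segment

seg = Segment()
seg.initialized = True  # skip loading the bundled dict so only the toy vocab below is used
for word, freq in [('知识', 10), ('图谱', 8), ('知识图谱', 12), ('机器人', 9)]:
    seg.add_vocab(word, freq)

print(seg.create_DAG('知识图谱机器人'))
# {0: [0, 1, 3], 1: [1], 2: [2, 3], 3: [3], 4: [4, 6], 5: [5], 6: [6]}
print(seg.seg('知识图谱机器人'))
# ['知识图谱', '机器人'], since '知识图谱' outscores '知识' + '图谱' on the log-probability path
```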
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 import sys
 import gzip
 import marshal
@@ -83,6 +84,9 @@ class Bayes(object):
         self.total = sum(map(lambda x: self.d[x].getsum(), self.d.keys()))
 
     def classify(self, x):
+        if self.d == {}:
+            self.load(os.path.join(os.path.dirname(__file__), 'model/sentiment.model'))
+
         tmp = {}
         for k in self.d:
             tmp[k] = log(self.d[k].getsum()) - log(self.total)
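The guard makes the Bayes model lazy-loading: the first `classify` call pulls `model/sentiment.model` from the package, so `jiagu.sentiment` works without an explicit init step. The visible line computes each class's log prior; the usual naive Bayes completion adds per-word log likelihoods and normalizes, roughly as in the sketch below (a generic formulation under an assumed add-one smoothing, not necessarily jiagu's exact code):

```python3
from math import exp, log

def nb_classify(words, class_word_counts, class_totals):
    # score(c) = log P(c) + sum over words of log P(w|c)
    total = sum(class_totals.values())
    scores = {}
    for c, counts in class_word_counts.items():
        score = log(class_totals[c]) - log(total)
        for w in words:
            score += log((counts.get(w, 0) + 1) / (class_totals[c] + len(counts)))
        scores[c] = score
    best = max(scores, key=scores.get)
    prob = 1.0 / sum(exp(s - scores[best]) for s in scores.values())
    return best, prob  # mirrors the (label, probability) pair surfaced by Analyze.sentiment
```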
@@ -3,7 +3,7 @@
 from setuptools import setup
 
 setup(name='jiagu',
-      version='0.1.7',
+      version='0.1.8',
      description='Jiagu Natural Language Processing',
      author='Yener(Zheng Wenyu)',
      author_email='help@ownthink.com',
@@ -12,5 +12,7 @@ setup(name='jiagu',
      install_requires=['tensorflow>=1.6.0', 'numpy>=1.12.1'],
      packages=['jiagu'],
      package_dir={'jiagu': 'jiagu'},
-      package_data={'jiagu': ['*.*', 'model/*', 'data/*']}
+      package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
+                              'normal/*', 'segment/*', 'segment/dict/*',
+                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
      )