diff --git a/README.md b/README.md index c841921..ad1a51a 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ * 词性标注 * 命名实体识别 * 情感分析 (模型训练中) -* 知识图谱关系抽取 (模型训练中) +* 知识图谱关系抽取 * 关键词提取 * 文本摘要 * 新词发现 @@ -87,7 +87,16 @@ words = jiagu.seg('结婚的和尚未结婚的') print(words) ``` -3. 关键词提取 +3. 知识图谱关系抽取 +```python3 +import jiagu + +text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。' +knowledge = jiagu.knowledge(text) +print(knowledge) +``` + +4. 关键词提取 ```python3 import jiagu @@ -103,7 +112,7 @@ keywords = jiagu.keywords(text, 5) # 关键词 print(keywords) ``` -4. 文本摘要 +5. 文本摘要 ```python3 fin = open('input.txt', 'r') text = fin.read() @@ -113,7 +122,7 @@ summarize = jiagu.summarize(text, 3) # 摘要 print(summarize) ``` -5. 新词发现 +6. 新词发现 ```python3 import jiagu diff --git a/demo.py b/demo.py index 698a102..2e120de 100644 --- a/demo.py +++ b/demo.py @@ -34,4 +34,8 @@ print(summarize) # iagu.findword('input.txt', 'output.txt') # 根据大规模语料,利用信息熵做新词发现。 +text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。' +knowledge = jiagu.knowledge(text) +print(knowledge) + diff --git a/jiagu/__init__.py b/jiagu/__init__.py index 74b40e7..826a4d2 100644 --- a/jiagu/__init__.py +++ b/jiagu/__init__.py @@ -43,3 +43,5 @@ summarize = any.summarize # 新词发现 findword = any.findword +# 知识图谱 +knowledge = any.knowledge \ No newline at end of file diff --git a/jiagu/analyze.py b/jiagu/analyze.py index 5fecfab..6a748ea 100644 --- a/jiagu/analyze.py +++ b/jiagu/analyze.py @@ -17,140 +17,211 @@ from jiagu.textrank import Summarize def add_curr_dir(name): - return os.path.join(os.path.dirname(__file__), name) + return os.path.join(os.path.dirname(__file__), name) class Analyze(object): - def __init__(self): - self.seg_model = None - self.pos_model = None - self.ner_model = None - - self.seg_mmseg = None - - self.keywords_model = None - self.summarize_model = None - - def init(self): - self.init_cws() - self.init_pos() - self.init_ner() - - def init_cws(self): - if self.seg_model is None: - self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model')) - - def load_model(self, model_path): - self.seg_model = bilstm_crf.Predict(model_path) - - def init_pos(self): - if self.pos_model is None: - self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model')) - - def init_ner(self): - if self.ner_model is None: - self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model')) - - def init_mmseg(self): - if self.seg_mmseg is None: - self.seg_mmseg = mmseg.MMSeg() - - @staticmethod - def __lab2word(sentence, labels): - sen_len = len(sentence) - tmp_word = "" - words = [] - for i in range(sen_len): - label = labels[i] - w = sentence[i] - if label == "B": - tmp_word += w - elif label == "M": - tmp_word += w - elif label == "E": - tmp_word += w - words.append(tmp_word) - tmp_word = "" - else: - tmp_word = "" - words.append(w) - if tmp_word: - words.append(tmp_word) - return words - - def cws_text(self, sentence): - if sentence == '': - return [''] - labels = self.seg_model.predict([sentence])[0] - return self.__lab2word(sentence, labels) - - def cws_list(self, sentences): - text_list = sentences - all_labels = self.seg_model.predict(text_list) - sent_words = [] - for ti, text in enumerate(text_list): - seg_labels = all_labels[ti] - sent_words.append(self.__lab2word(text, seg_labels)) - return sent_words - - def cws(self, sentence, input='text', model='default'): - """中文分词 - - :param sentence: str or list - 文本或者文本列表,根据input的模式来定 - :param input: str - 
句子输入的格式,text则为默认的文本,batch则为批量的文本列表 - :param model: str - 分词所使用的模式,default为默认模式,mmseg为mmseg分词方式 - :return: - """ - if model == 'default': - self.init_cws() - - if input == 'batch': - words_list = self.cws_list(sentence) - return words_list - else: - words = self.cws_text(sentence) - return words - elif model == 'mmseg': - self.init_mmseg() - - words = self.seg_mmseg.cws(sentence) - return words - else: - pass - return [] - - def pos(self, sentence, input='words'): # 传入的是词语 - self.init_pos() - - if input == 'batch': - all_labels = self.pos_model.predict(sentence) - return all_labels - else: - labels = self.pos_model.predict([sentence])[0] - return labels - - def ner(self, sentence, input='text'): # 传入的是文本 - self.init_ner() - - if input == 'batch': - all_labels = self.ner_model.predict(sentence) - return all_labels - else: - labels = self.ner_model.predict([sentence])[0] - return labels - - def keywords(self, text, topkey=5): - if self.keywords_model == None: - self.keywords_model = Keywords(tol=0.0001, window=2) - return self.keywords_model.keywords(text, topkey) - - def summarize(self, text, topsen=5): - if self.summarize_model == None: - self.summarize_model = Summarize(tol=0.0001) - return self.summarize_model.summarize(text, topsen) - - def findword(self, input, output): - findword.new_word_find(input, output) + def __init__(self): + self.seg_model = None + self.pos_model = None + self.ner_model = None + + self.kg_model = None + + self.seg_mmseg = None + + self.keywords_model = None + self.summarize_model = None + + def init(self): + self.init_cws() + self.init_pos() + self.init_ner() + + def init_cws(self): + if self.seg_model is None: + self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model')) + + def load_model(self, model_path): + self.seg_model = bilstm_crf.Predict(model_path) + + def init_pos(self): + if self.pos_model is None: + self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model')) + + def init_ner(self): + if self.ner_model is None: + self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model')) + + def init_mmseg(self): + if self.seg_mmseg is None: + self.seg_mmseg = mmseg.MMSeg() + + def init_kg(self): + if self.kg_model is None: + self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model')) + + @staticmethod + def __lab2word(sentence, labels): + sen_len = len(sentence) + tmp_word = "" + words = [] + for i in range(sen_len): + label = labels[i] + w = sentence[i] + if label == "B": + tmp_word += w + elif label == "M": + tmp_word += w + elif label == "E": + tmp_word += w + words.append(tmp_word) + tmp_word = "" + else: + tmp_word = "" + words.append(w) + if tmp_word: + words.append(tmp_word) + return words + + def cws_text(self, sentence): + if sentence == '': + return [''] + labels = self.seg_model.predict([sentence])[0] + return self.__lab2word(sentence, labels) + + def cws_list(self, sentences): + text_list = sentences + all_labels = self.seg_model.predict(text_list) + sent_words = [] + for ti, text in enumerate(text_list): + seg_labels = all_labels[ti] + sent_words.append(self.__lab2word(text, seg_labels)) + return sent_words + + def cws(self, sentence, input='text', model='default'): + """中文分词 + + :param sentence: str or list + 文本或者文本列表,根据input的模式来定 + :param input: str + 句子输入的格式,text则为默认的文本,batch则为批量的文本列表 + :param model: str + 分词所使用的模式,default为默认模式,mmseg为mmseg分词方式 + :return: + """ + if model == 'default': + self.init_cws() + + if input == 'batch': + words_list = self.cws_list(sentence) + return words_list + else: + words = 
self.cws_text(sentence) + return words + elif model == 'mmseg': + self.init_mmseg() + + words = self.seg_mmseg.cws(sentence) + return words + else: + pass + return [] + + def pos(self, sentence, input='words'): # 传入的是词语 + self.init_pos() + + if input == 'batch': + all_labels = self.pos_model.predict(sentence) + return all_labels + else: + labels = self.pos_model.predict([sentence])[0] + return labels + + def ner(self, sentence, input='text'): # 传入的是文本 + self.init_ner() + + if input == 'batch': + all_labels = self.ner_model.predict(sentence) + return all_labels + else: + labels = self.ner_model.predict([sentence])[0] + return labels + + def knowledge(self, sentence, input='text'): + self.init_kg() + + if input == 'batch': + all_labels = self.kg_model.predict(sentence) + result = [] + for sent, labels in zip(sentence, all_labels): + result.append(self.lab2spo(sent, labels)) + return result + else: + labels = self.kg_model.predict([sentence])[0] + return self.lab2spo(sentence, labels) + + def keywords(self, text, topkey=5): + if self.keywords_model == None: + self.keywords_model = Keywords(tol=0.0001, window=2) + return self.keywords_model.keywords(text, topkey) + + def summarize(self, text, topsen=5): + if self.summarize_model == None: + self.summarize_model = Summarize(tol=0.0001) + return self.summarize_model.summarize(text, topsen) + + def findword(self, input, output): + findword.new_word_find(input, output) + + def lab2spo(self, text, epp_labels): + subject_list = [] # 存放实体的列表 + object_list = [] + index = 0 + for word, ep in zip(list(text), epp_labels): + if ep[0] == 'B' and ep[2:] == '实体': + subject_list.append([word, ep[2:], index]) + elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体': + if len(subject_list) == 0: + continue + subject_list[len(subject_list)-1][0] += word + + if ep[0] == 'B' and ep[2:] != '实体': + object_list.append([word, ep[2:], index]) + elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] != '实体': + if len(object_list) == 0: + return [] + object_list[len(object_list)-1][0] += word + + index += 1 + + spo_list = [] + if len(subject_list) == 0 or len(object_list) == 0: + pass + elif len(subject_list) == 1: + entity = subject_list[0] + for obj in object_list: + predicate = obj[1][:-1] + spo_list.append([entity[0], predicate, obj[0]]) + else: + for obj in object_list: + entity = [] + predicate = obj[1][:-1] + direction = obj[1][-1] + for sub in subject_list: + if direction == '+': + if sub[2] > obj[2]: + entity = sub + break + else: + if sub[2] < obj[2]: + entity = sub + + if entity == []: + continue + + spo_list.append([entity[0], predicate, obj[0]]) + + return spo_list + + \ No newline at end of file diff --git a/jiagu/model/kg.model b/jiagu/model/kg.model new file mode 100644 index 0000000..26cf41e Binary files /dev/null and b/jiagu/model/kg.model differ diff --git a/jiagu/utils.py b/jiagu/utils.py index 5834b77..bb19024 100644 --- a/jiagu/utils.py +++ b/jiagu/utils.py @@ -14,169 +14,169 @@ import numpy as np def default_stopwords_file(): - d = os.path.dirname(os.path.realpath(__file__)) - return os.path.join(d, 'data/stopwords.txt') + d = os.path.dirname(os.path.realpath(__file__)) + return os.path.join(d, 'data/stopwords.txt') sentence_delimiters = ['。', '?', '!', '…'] allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns', - 'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng'] + 'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng'] def as_text(v): - """生成unicode字符串""" - if v is None: - return None - elif isinstance(v, bytes): - return v.decode('utf-8', errors='ignore') - elif 
isinstance(v, str): - return v - else: - raise ValueError('Unknown type %r' % type(v)) + """生成unicode字符串""" + if v is None: + return None + elif isinstance(v, bytes): + return v.decode('utf-8', errors='ignore') + elif isinstance(v, str): + return v + else: + raise ValueError('Unknown type %r' % type(v)) def is_text(v): - return isinstance(v, str) + return isinstance(v, str) def cut_sentences(sentence): - tmp = [] - for ch in sentence: # 遍历字符串中的每一个字 - tmp.append(ch) - if ch in sentence_delimiters: - yield ''.join(tmp) - tmp = [] - yield ''.join(tmp) + tmp = [] + for ch in sentence: # 遍历字符串中的每一个字 + tmp.append(ch) + if ch in sentence_delimiters: + yield ''.join(tmp) + tmp = [] + yield ''.join(tmp) def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False): - sentences = [] - sents = [] - for sent in cutted_sentences: - sentences.append(sent) - if use_stopwords: - sents.append([word for word in jiagu.cut(sent) if word and word not in stopwords]) # 把句子分成词语 - else: - sents.append([word for word in jiagu.cut(sent) if word]) - return sentences, sents + sentences = [] + sents = [] + for sent in cutted_sentences: + sentences.append(sent) + if use_stopwords: + sents.append([word for word in jiagu.cut(sent) if word and word not in stopwords]) # 把句子分成词语 + else: + sents.append([word for word in jiagu.cut(sent) if word]) + return sentences, sents def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True): - sents = [] - sentences = [] - for sent in cutted_sentences: - sentences.append(sent) + sents = [] + sentences = [] + for sent in cutted_sentences: + sentences.append(sent) - word_list = jiagu.seg(sent) - word_list = [word for word in word_list if len(word) > 0] - if use_stopwords: - word_list = [word.strip() for word in word_list if word.strip() not in stopwords] - sents.append(word_list) - return sentences, sents + word_list = jiagu.seg(sent) + word_list = [word for word in word_list if len(word) > 0] + if use_stopwords: + word_list = [word.strip() for word in word_list if word.strip() not in stopwords] + sents.append(word_list) + return sentences, sents def weight_map_rank(weight_graph, max_iter, tol): - # 初始分数设置为0.5 - # 初始化每个句子的分子和老分数 - scores = [0.5 for _ in range(len(weight_graph))] - old_scores = [0.0 for _ in range(len(weight_graph))] - denominator = get_degree(weight_graph) - - # 开始迭代 - count = 0 - while different(scores, old_scores, tol): - for i in range(len(weight_graph)): - old_scores[i] = scores[i] - # 计算每个句子的分数 - for i in range(len(weight_graph)): - scores[i] = get_score(weight_graph, denominator, i) - count += 1 - if count > max_iter: - break - return scores + # 初始分数设置为0.5 + # 初始化每个句子的分子和老分数 + scores = [0.5 for _ in range(len(weight_graph))] + old_scores = [0.0 for _ in range(len(weight_graph))] + denominator = get_degree(weight_graph) + + # 开始迭代 + count = 0 + while different(scores, old_scores, tol): + for i in range(len(weight_graph)): + old_scores[i] = scores[i] + # 计算每个句子的分数 + for i in range(len(weight_graph)): + scores[i] = get_score(weight_graph, denominator, i) + count += 1 + if count > max_iter: + break + return scores def get_degree(weight_graph): - length = len(weight_graph) - denominator = [0.0 for _ in range(len(weight_graph))] - for j in range(length): - for k in range(length): - denominator[j] += weight_graph[j][k] - if denominator[j] == 0: - denominator[j] = 1.0 - return denominator + length = len(weight_graph) + denominator = [0.0 for _ in range(len(weight_graph))] + for j in range(length): + for k in range(length): + denominator[j] += 
weight_graph[j][k] + if denominator[j] == 0: + denominator[j] = 1.0 + return denominator def get_score(weight_graph, denominator, i): - """ - - :param weight_graph: - :param denominator: - :param i: int - 第i个句子 - :return: float - """ - length = len(weight_graph) - d = 0.85 - added_score = 0.0 - - for j in range(length): - # [j,i]是指句子j指向句子i - fraction = weight_graph[j][i] * 1.0 - # 除以j的出度 - added_score += fraction / denominator[j] - weighted_score = (1 - d) + d * added_score - return weighted_score + """ + + :param weight_graph: + :param denominator: + :param i: int + 第i个句子 + :return: float + """ + length = len(weight_graph) + d = 0.85 + added_score = 0.0 + + for j in range(length): + # [j,i]是指句子j指向句子i + fraction = weight_graph[j][i] * 1.0 + # 除以j的出度 + added_score += fraction / denominator[j] + weighted_score = (1 - d) + d * added_score + return weighted_score def different(scores, old_scores, tol=0.0001): - flag = False - for i in range(len(scores)): - if math.fabs(scores[i] - old_scores[i]) >= tol: # 原始是0.0001 - flag = True - break - return flag + flag = False + for i in range(len(scores)): + if math.fabs(scores[i] - old_scores[i]) >= tol: # 原始是0.0001 + flag = True + break + return flag def cosine_similarity(vec1, vec2): - """计算两个向量的余弦相似度 + """计算两个向量的余弦相似度 - :param vec1: list or np.array - :param vec2: list or np.array - :return: float - """ - tx = np.array(vec1) - ty = np.array(vec2) - cos1 = np.sum(tx * ty) - cos21 = np.sqrt(sum(tx ** 2)) - cos22 = np.sqrt(sum(ty ** 2)) - cosine_value = cos1 / float(cos21 * cos22) - return cosine_value + :param vec1: list or np.array + :param vec2: list or np.array + :return: float + """ + tx = np.array(vec1) + ty = np.array(vec2) + cos1 = np.sum(tx * ty) + cos21 = np.sqrt(sum(tx ** 2)) + cos22 = np.sqrt(sum(ty ** 2)) + cosine_value = cos1 / float(cos21 * cos22) + return cosine_value def combine(word_list, window=2): - if window < 2: - window = 2 - for x in range(1, window): - if x >= len(word_list): - break - word_list2 = word_list[x:] - res = zip(word_list, word_list2) - for r in res: - yield r + if window < 2: + window = 2 + for x in range(1, window): + if x >= len(word_list): + break + word_list2 = word_list[x:] + res = zip(word_list, word_list2) + for r in res: + yield r def sentences_similarity(s1, s2): - """计算两个句子的相似度 - - :param s1: list - :param s2: list - :return: float - """ - counter = 0 - for sent in s1: - if sent in s2: - counter += 1 - if counter == 0: - return 0 - return counter / (math.log(len(s1) + len(s2))) + """计算两个句子的相似度 + + :param s1: list + :param s2: list + :return: float + """ + counter = 0 + for sent in s1: + if sent in s2: + counter += 1 + if counter == 0: + return 0 + return counter / (math.log(len(s1) + len(s2)))
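
---

The core of the new knowledge-graph feature in this diff is `Analyze.knowledge()`, which loads `model/kg.model`, predicts one tag per character, and then hands the tag sequence to the new `lab2spo()` helper, which folds entity tags and predicate tags (carrying a trailing `+`/`-` direction flag) into `[subject, predicate, object]` triples. The sketch below is a minimal, hedged illustration of that last step only: it calls `lab2spo()` directly with a hand-written label sequence, so no model file is needed. The tag strings (`B-实体`, `B-出生地-`, `O`) are assumptions chosen to match what `lab2spo()` parses (`ep[0]` for B/I/E, `ep[2:]` for the tag body); the actual tagset emitted by `kg.model` is not shown in this diff and may differ.

```python3
from jiagu.analyze import Analyze

analyzer = Analyze()

# Toy sentence and a hand-written label sequence, one label per character.
# Assumed (hypothetical) tag layout: '实体' marks the subject entity; every
# other tag names a predicate and ends with a '+'/'-' direction flag, which
# is what lab2spo() strips off. The real kg.model tagset may differ.
text = '姚明出生于上海'
labels = ['B-实体', 'E-实体', 'O', 'O', 'O', 'B-出生地-', 'E-出生地-']

print(analyzer.lab2spo(text, labels))
# Expected: [['姚明', '出生地', '上海']]
```

In normal use, the README example (`jiagu.knowledge(text)`) wraps exactly this pipeline and returns such a triple list for one sentence; per the new `knowledge()` method, passing `input='batch'` with a list of sentences returns one triple list per sentence.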