add knowledge

master
Yener, 6 years ago
parent commit: d46369767a
6 changed files with 348 additions and 262 deletions
  1. README.md             +13 -4
  2. demo.py               +4 -0
  3. jiagu/__init__.py     +2 -0
  4. jiagu/analyze.py      +205 -134
  5. jiagu/model/kg.model  BIN
  6. jiagu/utils.py        +124 -124

README.md  (+13 -4)

@@ -14,7 +14,7 @@
 * Part-of-speech tagging
 * Named entity recognition
 * Sentiment analysis (model in training)
-* Knowledge graph relation extraction (model in training)
+* Knowledge graph relation extraction
 * Keyword extraction
 * Text summarization
 * New word discovery
@@ -87,7 +87,16 @@ words = jiagu.seg('结婚的和尚未结婚的')
 print(words)
 ```
 
-3. Keyword extraction
+3. Knowledge graph relation extraction
+```python3
+import jiagu
+
+text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
+knowledge = jiagu.knowledge(text)
+print(knowledge)
+```
+
+4. Keyword extraction
 ```python3
 import jiagu
 
@@ -103,7 +112,7 @@ keywords = jiagu.keywords(text, 5)  # keywords
 print(keywords)
 ```
 
-4. Text summarization
+5. Text summarization
 ```python3
 fin = open('input.txt', 'r')
 text = fin.read()
@@ -113,7 +122,7 @@ summarize = jiagu.summarize(text, 3)  # summary
 print(summarize)
 ```
 
-5. New word discovery
+6. New word discovery
 ```python3
 import jiagu
 


demo.py  (+4 -0)

@@ -34,4 +34,8 @@ print(summarize)
 # jiagu.findword('input.txt', 'output.txt')  # new word discovery over a large corpus, using information entropy
 
 
+text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
+knowledge = jiagu.knowledge(text)
+print(knowledge)



jiagu/__init__.py  (+2 -0)

@@ -43,3 +43,5 @@ summarize = any.summarize
 # new word discovery
 findword = any.findword
 
+# knowledge graph
+knowledge = any.knowledge
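
Since `knowledge` is exported here as a module-level alias for `Analyze.knowledge`, the batch mode defined in analyze.py below is also available directly on the package. A minimal sketch, not part of this commit: the example sentences are made up, and it assumes jiagu is installed so the bundled model/kg.model can be loaded.

```python3
import jiagu

texts = [
    '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区。',
    '刘翔,1983年7月13日出生于上海。',
]
# input='batch' hands the whole list to the model in one call (see Analyze.knowledge below);
# each result is the list of [subject, predicate, object] triples built by lab2spo
for spo_list in jiagu.knowledge(texts, input='batch'):
    print(spo_list)
```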

jiagu/analyze.py  (+205 -134)

@@ -17,140 +17,211 @@ from jiagu.textrank import Summarize
 
 
 def add_curr_dir(name):
     return os.path.join(os.path.dirname(__file__), name)
 
 
 class Analyze(object):
     def __init__(self):
         self.seg_model = None
         self.pos_model = None
         self.ner_model = None
+        self.kg_model = None
 
         self.seg_mmseg = None
 
         self.keywords_model = None
         self.summarize_model = None
 
     def init(self):
         self.init_cws()
         self.init_pos()
         self.init_ner()
 
     def init_cws(self):
         if self.seg_model is None:
             self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model'))
 
     def load_model(self, model_path):
         self.seg_model = bilstm_crf.Predict(model_path)
 
     def init_pos(self):
         if self.pos_model is None:
             self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model'))
 
     def init_ner(self):
         if self.ner_model is None:
             self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model'))
 
     def init_mmseg(self):
         if self.seg_mmseg is None:
             self.seg_mmseg = mmseg.MMSeg()
 
+    def init_kg(self):
+        if self.kg_model is None:
+            self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model'))
+
     @staticmethod
     def __lab2word(sentence, labels):
         sen_len = len(sentence)
         tmp_word = ""
         words = []
         for i in range(sen_len):
             label = labels[i]
             w = sentence[i]
             if label == "B":
                 tmp_word += w
             elif label == "M":
                 tmp_word += w
             elif label == "E":
                 tmp_word += w
                 words.append(tmp_word)
                 tmp_word = ""
             else:
                 tmp_word = ""
                 words.append(w)
         if tmp_word:
             words.append(tmp_word)
         return words
 
     def cws_text(self, sentence):
         if sentence == '':
             return ['']
         labels = self.seg_model.predict([sentence])[0]
         return self.__lab2word(sentence, labels)
 
     def cws_list(self, sentences):
         text_list = sentences
         all_labels = self.seg_model.predict(text_list)
         sent_words = []
         for ti, text in enumerate(text_list):
             seg_labels = all_labels[ti]
             sent_words.append(self.__lab2word(text, seg_labels))
         return sent_words
 
     def cws(self, sentence, input='text', model='default'):
         """Chinese word segmentation.
 
         :param sentence: str or list
             a single text or a list of texts, depending on the input mode
         :param input: str
             input format: 'text' (default) for a single text, 'batch' for a list of texts
         :param model: str
             segmentation mode: 'default' for the default model, 'mmseg' for mmseg segmentation
         :return:
         """
         if model == 'default':
             self.init_cws()
 
             if input == 'batch':
                 words_list = self.cws_list(sentence)
                 return words_list
             else:
                 words = self.cws_text(sentence)
                 return words
         elif model == 'mmseg':
             self.init_mmseg()
 
             words = self.seg_mmseg.cws(sentence)
             return words
         else:
             pass
         return []
 
     def pos(self, sentence, input='words'):  # input is a list of words
         self.init_pos()
 
         if input == 'batch':
             all_labels = self.pos_model.predict(sentence)
             return all_labels
         else:
             labels = self.pos_model.predict([sentence])[0]
             return labels
 
     def ner(self, sentence, input='text'):  # input is raw text
         self.init_ner()
 
         if input == 'batch':
             all_labels = self.ner_model.predict(sentence)
             return all_labels
         else:
             labels = self.ner_model.predict([sentence])[0]
             return labels
 
+    def knowledge(self, sentence, input='text'):
+        self.init_kg()
+
+        if input == 'batch':
+            all_labels = self.kg_model.predict(sentence)
+            result = []
+            for sent, labels in zip(sentence, all_labels):
+                result.append(self.lab2spo(sent, labels))
+            return result
+        else:
+            labels = self.kg_model.predict([sentence])[0]
+            return self.lab2spo(sentence, labels)
     def keywords(self, text, topkey=5):
         if self.keywords_model == None:
             self.keywords_model = Keywords(tol=0.0001, window=2)
         return self.keywords_model.keywords(text, topkey)
 
     def summarize(self, text, topsen=5):
         if self.summarize_model == None:
             self.summarize_model = Summarize(tol=0.0001)
         return self.summarize_model.summarize(text, topsen)
 
     def findword(self, input, output):
         findword.new_word_find(input, output)
+
+    def lab2spo(self, text, epp_labels):
+        subject_list = []  # list holding the extracted (subject) entities
+        object_list = []
+        index = 0
+        for word, ep in zip(list(text), epp_labels):
+            if ep[0] == 'B' and ep[2:] == '实体':
+                subject_list.append([word, ep[2:], index])
+            elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
+                if len(subject_list) == 0:
+                    continue
+                subject_list[len(subject_list) - 1][0] += word
+            if ep[0] == 'B' and ep[2:] != '实体':
+                object_list.append([word, ep[2:], index])
+            elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] != '实体':
+                if len(object_list) == 0:
+                    return []
+                object_list[len(object_list) - 1][0] += word
+            index += 1
+        spo_list = []
+        if len(subject_list) == 0 or len(object_list) == 0:
+            pass
+        elif len(subject_list) == 1:
+            entity = subject_list[0]
+            for obj in object_list:
+                predicate = obj[1][:-1]
+                spo_list.append([entity[0], predicate, obj[0]])
+        else:
+            for obj in object_list:
+                entity = []
+                predicate = obj[1][:-1]
+                direction = obj[1][-1]
+                for sub in subject_list:
+                    if direction == '+':
+                        if sub[2] > obj[2]:
+                            entity = sub
+                            break
+                    else:
+                        if sub[2] < obj[2]:
+                            entity = sub
+                if entity == []:
+                    continue
+                spo_list.append([entity[0], predicate, obj[0]])
+        return spo_list
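
A hedged sketch, not part of the commit, of the label scheme `lab2spo` expects: `B-实体`/`I-实体` mark the subject entity, while object labels carry a predicate name plus a trailing direction character that `lab2spo` strips off. The predicate tag `出生地` below is invented for illustration; the real tag inventory comes from the trained kg.model. It assumes jiagu is installed so `jiagu.analyze` is importable.

```python3
from jiagu.analyze import Analyze

text = '姚明出生于上海'
# subject 姚明 at positions 0-1, object 上海 at positions 5-6, with an assumed predicate tag
labels = ['B-实体', 'I-实体', 'O', 'O', 'O', 'B-出生地-', 'I-出生地-']

# lab2spo reads no model files, so it can run on a bare Analyze instance
print(Analyze().lab2spo(text, labels))
# -> [['姚明', '出生地', '上海']]
```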

jiagu/model/kg.model  (BIN)


jiagu/utils.py  (+124 -124, whitespace-only changes)

@@ -14,169 +14,169 @@ import numpy as np


def default_stopwords_file():
    d = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(d, 'data/stopwords.txt')


sentence_delimiters = ['。', '?', '!', '…']
allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
                     'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']


def as_text(v):
    """Return a unicode string."""
    if v is None:
        return None
    elif isinstance(v, bytes):
        return v.decode('utf-8', errors='ignore')
    elif isinstance(v, str):
        return v
    else:
        raise ValueError('Unknown type %r' % type(v))


def is_text(v):
    return isinstance(v, str)


def cut_sentences(sentence):
    tmp = []
    for ch in sentence:  # iterate over each character of the string
        tmp.append(ch)
        if ch in sentence_delimiters:
            yield ''.join(tmp)
            tmp = []
    yield ''.join(tmp)


def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
    sentences = []
    sents = []
    for sent in cutted_sentences:
        sentences.append(sent)
        if use_stopwords:
            sents.append([word for word in jiagu.cut(sent) if word and word not in stopwords])  # split the sentence into words
        else:
            sents.append([word for word in jiagu.cut(sent) if word])
    return sentences, sents


def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
    sents = []
    sentences = []
    for sent in cutted_sentences:
        sentences.append(sent)
        word_list = jiagu.seg(sent)
        word_list = [word for word in word_list if len(word) > 0]
        if use_stopwords:
            word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
        sents.append(word_list)
    return sentences, sents


def weight_map_rank(weight_graph, max_iter, tol):
    # the initial score is set to 0.5
    # initialize the score and the previous score of every sentence
    scores = [0.5 for _ in range(len(weight_graph))]
    old_scores = [0.0 for _ in range(len(weight_graph))]
    denominator = get_degree(weight_graph)
    # start iterating
    count = 0
    while different(scores, old_scores, tol):
        for i in range(len(weight_graph)):
            old_scores[i] = scores[i]
        # compute the score of every sentence
        for i in range(len(weight_graph)):
            scores[i] = get_score(weight_graph, denominator, i)
        count += 1
        if count > max_iter:
            break
    return scores


def get_degree(weight_graph):
    length = len(weight_graph)
    denominator = [0.0 for _ in range(len(weight_graph))]
    for j in range(length):
        for k in range(length):
            denominator[j] += weight_graph[j][k]
        if denominator[j] == 0:
            denominator[j] = 1.0
    return denominator


def get_score(weight_graph, denominator, i):
    """
    :param weight_graph:
    :param denominator:
    :param i: int
        the i-th sentence
    :return: float
    """
    length = len(weight_graph)
    d = 0.85
    added_score = 0.0
    for j in range(length):
        # weight_graph[j][i] is the edge from sentence j to sentence i
        fraction = weight_graph[j][i] * 1.0
        # divide by the out-degree of j
        added_score += fraction / denominator[j]
    weighted_score = (1 - d) + d * added_score
    return weighted_score


def different(scores, old_scores, tol=0.0001):
    flag = False
    for i in range(len(scores)):
        if math.fabs(scores[i] - old_scores[i]) >= tol:  # originally 0.0001
            flag = True
            break
    return flag


def cosine_similarity(vec1, vec2):
    """Cosine similarity of two vectors.

    :param vec1: list or np.array
    :param vec2: list or np.array
    :return: float
    """
    tx = np.array(vec1)
    ty = np.array(vec2)
    cos1 = np.sum(tx * ty)
    cos21 = np.sqrt(sum(tx ** 2))
    cos22 = np.sqrt(sum(ty ** 2))
    cosine_value = cos1 / float(cos21 * cos22)
    return cosine_value


def combine(word_list, window=2):
    if window < 2:
        window = 2
    for x in range(1, window):
        if x >= len(word_list):
            break
        word_list2 = word_list[x:]
        res = zip(word_list, word_list2)
        for r in res:
            yield r


def sentences_similarity(s1, s2):
    """Similarity of two sentences.

    :param s1: list
    :param s2: list
    :return: float
    """
    counter = 0
    for sent in s1:
        if sent in s2:
            counter += 1
    if counter == 0:
        return 0
    return counter / (math.log(len(s1) + len(s2)))
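
A small hedged sketch, not part of the commit, exercising two of the helpers above; the similarity graph and vectors are made up, and it assumes the jiagu package is importable.

```python3
from jiagu.utils import weight_map_rank, cosine_similarity

# toy similarity graph over three sentences
graph = [
    [0.0, 0.8, 0.1],
    [0.8, 0.0, 0.5],
    [0.1, 0.5, 0.0],
]
# each score settles at (1 - d) + d * sum_j(graph[j][i] / degree(j)) with d = 0.85
print(weight_map_rank(graph, max_iter=100, tol=0.0001))  # the middle sentence scores highest

print(cosine_similarity([1, 0, 1], [1, 1, 0]))  # ~0.5 for these two vectors
```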
