@@ -53,7 +53,7 @@ print(words)
 pos = jiagu.pos(words) # 词性标注
 print(pos)
-ner = jiagu.ner(text) # 命名实体识别
+ner = jiagu.ner(words) # 命名实体识别
 print(ner)
 ```
@@ -61,7 +61,7 @@ print(ner)
 ```python3
 import jiagu
-text = '汉服和服装、知识图谱机器人'
+text = '汉服和服装、维基图谱'
 words = jiagu.cut(text) # 深度学习分词
 print(words)
@@ -70,7 +70,7 @@ words = jiagu.seg(text) # 字典分词
 print(words)
 # jiagu.load_userdict('dict/user.dict') # 加载自定义字典,支持字典路径、字典列表形式。
-jiagu.load_userdict(['知识图谱'])
+jiagu.load_userdict(['汉服和服装'])
 words = jiagu.seg(text) # 自定义分词,字典分词模式有效
 print(words)
@@ -3,25 +3,25 @@ import jiagu
 # jiagu.init() # 可手动初始化,也可以动态初始化
-text = '厦门明天会不会下雨'
+text = '在苏州冻成狗'
-words = jiagu.cut(text) # 分词
+words = jiagu.seg(text) # 分词
 print(words)
 pos = jiagu.pos(words) # 词性标注
 print(pos)
-ner = jiagu.ner(text) # 命名实体识别
+ner = jiagu.ner(words) # 命名实体识别
 print(ner)
 # 字典模式分词
-text = '知识图谱机器人'
+text = '思知机器人挺好用的'
 words = jiagu.seg(text)
 print(words)
 # jiagu.load_userdict('dict/user.dict') # 加载自定义字典,支持字典路径、字典列表形式。
-jiagu.load_userdict(['知识图谱'])
+jiagu.load_userdict(['思知机器人'])
 words = jiagu.seg(text)
 print(words)
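The demo change above is easiest to read next to what dictionary-mode segmentation is meant to do. A minimal sketch, assuming the packaged models are installed; the outputs in the comments are illustrative guesses, not taken from a particular Jiagu version:

```python
import jiagu

text = '思知机器人挺好用的'

# Before any user dictionary is loaded, dictionary-mode segmentation may
# split the product name, e.g. something like ['思知', '机器人', '挺', '好用', '的'].
print(jiagu.seg(text))

# After registering the phrase, seg() should keep it as a single token,
# e.g. ['思知机器人', '挺', '好用', '的'].
jiagu.load_userdict(['思知机器人'])
print(jiagu.seg(text))
```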
@@ -1,13 +1,5 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : __init__.py
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
 from jiagu import analyze
 any = analyze.Analyze()
@@ -1,11 +1,2 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : __main__.py
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
@@ -1,17 +1,9 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : analyze.py - 解析模块
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
 import os
 from jiagu import mmseg
 from jiagu import findword
-from jiagu import bilstm_crf
+from jiagu import perceptron
 from jiagu.textrank import Keywords
 from jiagu.textrank import Summarize
 from jiagu.segment.nroute import Segment
@@ -50,18 +42,18 @@ class Analyze(object):
     def init_cws(self):
         if self.seg_model is None:
-            self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model'))
+            self.seg_model = perceptron.Perceptron(add_curr_dir('model/cws.model'))
     def load_model(self, model_path):
-        self.seg_model = bilstm_crf.Predict(model_path)
+        self.seg_model = perceptron.Perceptron(model_path)
     def init_pos(self):
         if self.pos_model is None:
-            self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model'))
+            self.pos_model = perceptron.Perceptron(add_curr_dir('model/pos.model'))
     def init_ner(self):
         if self.ner_model is None:
-            self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model'))
+            self.ner_model = perceptron.Perceptron(add_curr_dir('model/ner.model'))
     def init_mmseg(self):
         if self.seg_mmseg is None:
@@ -69,7 +61,7 @@ class Analyze(object):
     def init_kg(self):
         if self.kg_model is None:
-            self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model'))
+            self.kg_model = perceptron.Perceptron(add_curr_dir('model/kg.model'))
     @staticmethod
     def __lab2word(sentence, labels):
@@ -97,22 +89,13 @@ class Analyze(object):
     def cws_text(self, sentence):
         if sentence == '':
             return ['']
-        labels = self.seg_model.predict([sentence])[0]
+        labels = self.seg_model.predict(list(sentence))
         return self.__lab2word(sentence, labels)
-    def cws_list(self, sentences):
-        text_list = sentences
-        all_labels = self.seg_model.predict(text_list)
-        sent_words = []
-        for ti, text in enumerate(text_list):
-            seg_labels = all_labels[ti]
-            sent_words.append(self.__lab2word(text, seg_labels))
-        return sent_words
     def seg(self, sentence):
         return self.seg_nroute.seg(sentence, mode="default")
-    def cws(self, sentence, input='text', model='default'):
+    def cws(self, sentence, model='default'):
         """中文分词
         :param sentence: str or list
@@ -125,54 +108,31 @@ class Analyze(object):
         """
         if model == 'default':
             self.init_cws()
-            if input == 'batch':
-                words_list = self.cws_list(sentence)
-                return words_list
-            else:
-                words = self.cws_text(sentence)
-                return words
+            words = self.cws_text(sentence)
+            return words
         elif model == 'mmseg':
             self.init_mmseg()
             words = self.seg_mmseg.cws(sentence)
             return words
         else:
             pass
         return []
-    def pos(self, sentence, input='words'):  # 传入的是词语
+    def pos(self, words):  # 传入的是词语
         self.init_pos()
+        labels = self.pos_model.predict(words)
+        return labels
-        if input == 'batch':
-            all_labels = self.pos_model.predict(sentence)
-            return all_labels
-        else:
-            labels = self.pos_model.predict([sentence])[0]
-            return labels
-    def ner(self, sentence, input='text'):  # 传入的是文本
+    def ner(self, words):  # 传入的是词语
         self.init_ner()
+        labels = self.ner_model.predict(words)
+        return labels
-        if input == 'batch':
-            all_labels = self.ner_model.predict(sentence)
-            return all_labels
-        else:
-            labels = self.ner_model.predict([sentence])[0]
-            return labels
-    def knowledge(self, sentence, input='text'):
+    def knowledge(self, text):  # 传入的是文本
         self.init_kg()
-        if input == 'batch':
-            all_labels = self.kg_model.predict(sentence)
-            result = []
-            for sent, labels in zip(sentence, all_labels):
-                result.append(self.lab2spo(sent, labels))
-            return result
-        else:
-            labels = self.kg_model.predict([sentence])[0]
-            return self.lab2spo(sentence, labels)
+        words = self.seg(text)
+        labels = self.kg_model.predict(words)
+        return self.lab2spo(words, labels)
     def keywords(self, text, topkey=5):
         if self.keywords_model == None:
@@ -195,11 +155,11 @@ class Analyze(object):
     def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
         return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
-    def lab2spo(self, text, epp_labels):
+    def lab2spo(self, words, epp_labels):
         subject_list = []  # 存放实体的列表
         object_list = []
         index = 0
-        for word, ep in zip(list(text), epp_labels):
+        for word, ep in zip(words, epp_labels):
             if ep[0] == 'B' and ep[2:] == '实体':
                 subject_list.append([word, ep[2:], index])
             elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
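Taken together, the analyze.py hunks above change what pos(), ner() and knowledge() expect: pos() and ner() now consume the token list produced by cws()/seg(), while knowledge() takes raw text and segments it internally before calling the kg model. A minimal sketch of the resulting call flow, assuming the bundled cws/pos/ner models are available (the knowledge() call additionally needs model/kg.model, so it is left commented out):

```python
from jiagu import analyze

nlp = analyze.Analyze()

text = '在苏州冻成狗'
words = nlp.cws(text)   # perceptron-based segmentation -> list of tokens
print(words)

print(nlp.pos(words))   # POS labels, one per token
print(nlp.ner(words))   # NER labels, also one per token after this change

# knowledge() still takes the raw string; it calls seg() itself and then lab2spo().
# print(nlp.knowledge(text))
```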
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : bilstm_crf.py - 预测
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-import pickle
-import numpy as np
-import tensorflow as tf
-from tensorflow.contrib.crf import viterbi_decode
-
-class Predict(object):
-    def __init__(self, model_file):
-        with open(model_file, 'rb') as f:
-            model, char_to_id, id_to_tag = pickle.load(f)
-        self.char_to_id = char_to_id
-        self.id_to_tag = {int(k): v for k, v in id_to_tag.items()}
-        self.num_class = len(self.id_to_tag)
-        graph_def = tf.GraphDef()
-        graph_def.ParseFromString(model)
-        with tf.Graph().as_default() as graph:
-            tf.import_graph_def(graph_def, name="prefix")
-        self.input_x = graph.get_tensor_by_name("prefix/char_inputs:0")
-        self.lengths = graph.get_tensor_by_name("prefix/lengths:0")
-        self.dropout = graph.get_tensor_by_name("prefix/dropout:0")
-        self.logits = graph.get_tensor_by_name("prefix/project/logits:0")
-        self.trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
-        self.sess = tf.Session(graph=graph)
-        self.sess.as_default()
-
-    def decode(self, logits, trans, sequence_lengths, tag_num):
-        small = -1000.0
-        viterbi_sequences = []
-        start = np.asarray([[small] * tag_num + [0]])
-        for logit, length in zip(logits, sequence_lengths):
-            score = logit[:length]
-            pad = small * np.ones([length, 1])
-            score = np.concatenate([score, pad], axis=1)
-            score = np.concatenate([start, score], axis=0)
-            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
-            viterbi_sequences.append(viterbi_seq[1:])
-        return viterbi_sequences
-
-    def predict(self, sents):
-        inputs = []
-        lengths = [len(text) for text in sents]
-        max_len = max(lengths)
-        for sent in sents:
-            sent_ids = [self.char_to_id.get(w) if w in self.char_to_id else self.char_to_id.get("<OOV>") for w in sent]
-            padding = [0] * (max_len - len(sent_ids))
-            sent_ids += padding
-            inputs.append(sent_ids)
-        inputs = np.array(inputs, dtype=np.int32)
-        feed_dict = {
-            self.input_x: inputs,
-            self.lengths: lengths,
-            self.dropout: 1.0
-        }
-        logits, trans = self.sess.run([self.logits, self.trans], feed_dict=feed_dict)
-        path = self.decode(logits, trans, lengths, self.num_class)
-        labels = [[self.id_to_tag.get(l) for l in p] for p in path]
-        return labels
@@ -1,15 +1,4 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : findword.py - 新词发现
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description : 新词发现算法实现
-    special thanks to
-        http://www.matrix67.com/blog/archives/5044
-        https://github.com/zoulala/New_words_find
-"""
 import re
 from math import log
 from collections import Counter
@@ -1,13 +1,5 @@
 #!/usr/bin/env python
 # encoding: utf-8
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : mmseg.py
- * Author : Leo <1162441289@qq.com>
- * Version : 0.01
- * Description : mmseg分词方法,目前算法比较耗时,仍在优化中
-"""
 import os
 import pickle
 from math import log
@@ -0,0 +1,227 @@
+# -*- coding:utf-8 -*-
+import os
+import gzip
+import pickle
+import random
+from collections import defaultdict
+
+class AveragedPerceptron(object):
+    def __init__(self):
+        # Each feature gets its own weight vector, so weights is a dict-of-dicts
+        self.weights = {}
+        self.classes = set()
+        # The accumulated values, for the averaging. These will be keyed by
+        # feature/clas tuples
+        self._totals = defaultdict(int)
+        # The last time the feature was changed, for the averaging. Also
+        # keyed by feature/clas tuples
+        # (tstamps is short for timestamps)
+        self._tstamps = defaultdict(int)
+        # Number of instances seen
+        self.i = 0
+
+    def predict(self, features):
+        '''Dot-product the features and current weights and return the best label.'''
+        scores = defaultdict(float)
+        for feat, value in features.items():
+            if feat not in self.weights or value == 0:
+                continue
+            weights = self.weights[feat]
+            for label, weight in weights.items():
+                scores[label] += value * weight
+        # Do a secondary alphabetic sort, for stability
+        return max(self.classes, key=lambda label: (scores[label], label))
+
+    def update(self, truth, guess, features):
+        '''Update the feature weights.'''
+        def upd_feat(c, f, w, v):
+            param = (f, c)
+            self._totals[param] += (self.i - self._tstamps[param]) * w
+            self._tstamps[param] = self.i
+            self.weights[f][c] = w + v
+        self.i += 1
+        if truth == guess:
+            return None
+        for f in features:
+            weights = self.weights.setdefault(f, {})
+            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
+            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
+        return None
+
+    def average_weights(self):
+        '''Average weights from all iterations.'''
+        for feat, weights in self.weights.items():
+            new_feat_weights = {}
+            for clas, weight in weights.items():
+                param = (feat, clas)
+                total = self._totals[param]
+                total += (self.i - self._tstamps[param]) * weight
+                averaged = round(total / float(self.i), 3)
+                if averaged:
+                    new_feat_weights[clas] = averaged
+            self.weights[feat] = new_feat_weights
+        return None
+
+class Perceptron:
+    def __init__(self, loc=None):
+        self.START = ['-START-', '-START2-']
+        self.END = ['-END-', '-END2-']
+        self.model = AveragedPerceptron()
+        if loc != None:
+            self.load(loc)
+
+    def predict(self, words):
+        prev, prev2 = self.START
+        labels = []
+        context = self.START + words + self.END
+        for i, word in enumerate(words):
+            features = self._get_features(i, word, context, prev, prev2)
+            tag = self.model.predict(features)
+            labels.append(tag)
+            prev2 = prev
+            prev = tag
+        return labels
+
+    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
+        self._make_tagdict(sentences)
+        for iter_ in range(nr_iter):
+            c = 0
+            n = 0
+            for words, tags in sentences:
+                prev, prev2 = self.START
+                context = self.START + words + self.END
+                for i, word in enumerate(words):
+                    feats = self._get_features(i, word, context, prev, prev2)
+                    guess = self.model.predict(feats)
+                    self.model.update(tags[i], guess, feats)
+                    prev2 = prev
+                    prev = guess
+                    c += guess == tags[i]
+                    n += 1
+            if shuf == True:
+                random.shuffle(sentences)
+            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
+            self.save(save_loc)
+        self.model.average_weights()
+        self.save(save_loc)
+
+    def save(self, loc='model/ap.model', zip=True):
+        if zip == False:
+            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
+        else:
+            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))
+
+    def load(self, loc='model/ap.model', zip=True):
+        if zip == False:
+            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
+        else:
+            self.model.weights, self.model.classes = pickle.load(gzip.open(loc,'rb'))
+
+    def _get_features(self, i, word, context, prev, prev2):
+        '''Map tokens into a feature representation, implemented as a
+        {hashable: float} dict. If the features change, a new model must be
+        trained.
+        '''
+        def add(name, *args):
+            features[' '.join((name,) + tuple(args))] += 1
+        i += len(self.START)
+        features = defaultdict(int)
+        # It's useful to have a constant feature, which acts sort of like a prior
+        add('bias')
+        add('i suffix', word[-3:])
+        add('i pref1', word[0])
+        add('i-1 tag', prev)
+        add('i-2 tag', prev2)
+        add('i tag+i-2 tag', prev, prev2)
+        add('i word', context[i])
+        add('i-1 tag+i word', prev, context[i])
+        add('i-1 word', context[i - 1])
+        add('i-1 suffix', context[i - 1][-3:])
+        add('i-2 word', context[i - 2])
+        add('i+1 word', context[i + 1])
+        add('i+1 suffix', context[i + 1][-3:])
+        add('i+2 word', context[i + 2])
+        return features
+
+    def _make_tagdict(self, sentences):
+        '''Make a tag dictionary for single-tag words.'''
+        for words, tags in sentences:
+            for word, tag in zip(words, tags):
+                self.model.classes.add(tag)
+
+def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
+    tagger = Perceptron()
+    print('Reading corpus...')
+    training_data = []
+    sentence = ([], [])
+    fin = open(filepath, 'r', encoding='utf8')
+    for index, line in enumerate(fin):
+        line = line.strip()
+        if line == '':
+            training_data.append(sentence)
+            sentence = ([], [])
+        else:
+            params = line.split()
+            if len(params) != 2: continue
+            sentence[0].append(params[0])
+            sentence[1].append(params[1])
+    fin.close()
+    print('training corpus size : %d', len(training_data))
+    print('Start training...')
+    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
+
+def eval(filepath='data/test.txt', model='model/ap.model'):
+    tagger = Perceptron(model)
+    print('Start testing...')
+    right = 0.0
+    total = 0.0
+    sentence = ([], [])
+    fin = open(filepath, 'r', encoding='utf8')
+    for index, line in enumerate(fin):
+        line = line.strip()
+        if line == '':
+            words = sentence[0]
+            tags = sentence[1]
+            outputs = tagger.predict(words)
+            assert len(tags) == len(outputs)
+            total += len(tags)
+            for o, t in zip(outputs, tags):
+                if o == t: right += 1
+            sentence = ([], [])
+        else:
+            params = line.split()
+            if len(params) != 2: continue
+            sentence[0].append(params[0])
+            sentence[1].append(params[1])
+    fin.close()
+    print("Precision : %f", right / total)
+
+def predict(model='model/ap.model'):
+    tagger = Perceptron(model)
+    while True:
+        text = input('>')
+        words = list(text)
+        labels = tagger.predict(words)
+        for word, label in zip(words, labels):
+            print(word, label)
+
+if __name__ == '__main__':
+    train()
+    eval()
+    # predict()
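For orientation, a minimal sketch of how the new perceptron module appears to be meant to be used, based only on the code above. The model path and the toy token/tag lines are placeholders: train() and eval() read one whitespace-separated "token tag" pair per line, with a blank line ending each sentence.

```python
from jiagu import perceptron

# Expected training-file layout (placeholder tags, one pair per line):
#
#   厦  B
#   门  E
#
#   下  B
#   雨  E
#
# Train a model on such a file; the result is gzip-pickled by default.
perceptron.train(filepath='data/train.txt', model='model/my_ap.model', nr_iter=5)

# Reload the saved weights and tag a new character sequence.
tagger = perceptron.Perceptron('model/my_ap.model')
print(tagger.predict(list('厦门明天会不会下雨')))
```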
@@ -1,13 +1,4 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : textrank.py - 解析
- * Author : zengbin93 <zeng_bin8888@163.com>
- * Version : 0.01
- * Description : TextRank算法实现
-    special thanks to https://github.com/ArtistScript/FastTextRank
-"""
 import sys
 import numpy as np
 from jiagu import utils
@@ -1,16 +1,7 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : utils.py - 解析
- * Author : zengbin93 <zeng_bin8888@163.com>
- * Version : 0.01
- * Description : 常用工具函数
-"""
 import os
 import jiagu
 import math
-import numpy as np
 def default_stopwords_file():
@@ -138,22 +129,6 @@ def different(scores, old_scores, tol=0.0001):
     return flag
-def cosine_similarity(vec1, vec2):
-    """计算两个向量的余弦相似度
-    :param vec1: list or np.array
-    :param vec2: list or np.array
-    :return: float
-    """
-    tx = np.array(vec1)
-    ty = np.array(vec2)
-    cos1 = np.sum(tx * ty)
-    cos21 = np.sqrt(sum(tx ** 2))
-    cos22 = np.sqrt(sum(ty ** 2))
-    cosine_value = cos1 / float(cos21 * cos22)
-    return cosine_value
 def combine(word_list, window=2):
     if window < 2:
         window = 2
@@ -3,16 +3,15 @@
 from setuptools import setup
 setup(name='jiagu',
-      version='0.1.8',
+      version='0.1.9',
       description='Jiagu Natural Language Processing',
       author='Yener(Zheng Wenyu)',
       author_email='help@ownthink.com',
       url='https://github.com/ownthink/Jiagu',
       license='MIT',
-      install_requires=['tensorflow==1.6.0', 'numpy>=1.12.1'],
       packages=['jiagu'],
       package_dir={'jiagu': 'jiagu'},
       package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
-                              'normal/*', 'segment/*', 'segment/dict/*',
-                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
+                              'normal/*', 'segment/*', 'segment/dict/*',
+                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
       )
@@ -170,6 +170,7 @@ def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
             sentence = ([], [])
         else:
             params = line.split()
+            if len(params) != 2: continue
             sentence[0].append(params[0])
             sentence[1].append(params[1])
     fin.close()