@@ -182,6 +182,75 @@ class CWS(API):
        return f1, pre, rec

<<<<<<< HEAD
=======
class Parser(API):
    def __init__(self, model_path=None, device='cpu'):
        super(Parser, self).__init__()
        if model_path is None:
            model_path = model_urls['parser']
        self.load(model_path, device)

    def predict(self, content):
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load the model first.")
        sentence_list = []
        # 1. check the type of content
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content
        # 2. build the DataSet
        dataset = DataSet()
        dataset.add_field('words', sentence_list)
        # dataset.add_field('tag', sentence_list)
        # 3. run the pipeline
        self.pipeline(dataset)
        for ins in dataset:
            ins['heads'] = ins['heads'].tolist()
        return dataset['heads'], dataset['labels']
    def test(self, filepath):
        data = ConllxDataLoader().load(filepath)
        ds = DataSet()
        for ins1, ins2 in zip(add_seg_tag(data), data):
            ds.append(Instance(words=ins1[0], tag=ins1[1],
                               gold_words=ins2[0], gold_pos=ins2[1],
                               gold_heads=ins2[2], gold_head_tags=ins2[3]))

        pp = self.pipeline
        for p in pp:
            if p.field_name == 'word_list':
                p.field_name = 'gold_words'
            elif p.field_name == 'pos_list':
                p.field_name = 'gold_pos'
        pp(ds)

        head_cor, label_cor, total = 0, 0, 0
        for ins in ds:
            head_gold = ins['gold_heads']
            head_pred = ins['heads']
            length = len(head_gold)
            total += length
            for i in range(length):
                head_cor += 1 if head_pred[i] == head_gold[i] else 0
        uas = head_cor / total
        print('uas:{:.2f}'.format(uas))

        for p in pp:
            if p.field_name == 'gold_words':
                p.field_name = 'word_list'
            elif p.field_name == 'gold_pos':
                p.field_name = 'pos_list'

        return uas
>>>>>>> b182b39... * fixing unit tests
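
For orientation, a minimal usage sketch of the Parser API added in this hunk, assuming the pretrained model behind model_urls['parser'] is reachable. Per the type check in predict(), the input may be a single str or a list of str; the exact shape of the returned heads/labels is inferred from the method's last line, so treat it as an assumption:

parser = Parser(device='cpu')
heads, labels = parser.predict(['那么这款无人机到底有多厉害?'])
print(heads[0], labels[0])  # assumed: per-token head indices and dependency labels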
class Analyzer:
    def __init__(self, seg=True, pos=True, parser=True, device='cpu'):

@@ -196,7 +265,13 @@ class Analyzer:
        if parser:
            self.parser = None

<<<<<<< HEAD
    def predict(self, content):
=======
    def predict(self, content, seg=False, pos=False, parser=False):
        if seg is False and pos is False and parser is False:
            seg = True
>>>>>>> b182b39... * fixing unit tests
        output_dict = {}
        if self.seg:
            seg_output = self.cws.predict(content)
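
A hedged sketch of the incoming branch's Analyzer.predict signature: when no stage flag is passed, the method falls back to segmentation only (flag names as in the hunk; the sentence is illustrative, and how the flags interact with the self.seg/self.pos attributes is not fully visible here):

analyzer = Analyzer(device='cpu')
print(analyzer.predict('那么这款无人机到底有多厉害?'))            # all flags False -> seg defaults to True
print(analyzer.predict('那么这款无人机到底有多厉害?', pos=True))  # request the POS stage explicitly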
@@ -235,9 +310,23 @@ if __name__ == "__main__":
    # print(pos.predict(s))

    # cws_model_path = '../../reproduction/chinese_word_segment/models/cws_crf.pkl'
<<<<<<< HEAD
    cws = CWS(device='cpu')
    s = ['本品是一个抗酸抗胆汁的胃黏膜保护剂',
         '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
=======
    # cws = CWS(device='cpu')
    # s = ['本品是一个抗酸抗胆汁的胃黏膜保护剂',
    #      '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
    #      '那么这款无人机到底有多厉害?']
    # print(cws.test('/Users/yh/Desktop/test_data/cws_test.conll'))
    # print(cws.predict(s))

    parser = Parser(device='cpu')
    # print(parser.test('/Users/yh/Desktop/test_data/parser_test2.conll'))
    s = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
         '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
>>>>>>> b182b39... * fixing unit tests
         '那么这款无人机到底有多厉害?']
    print(cws.test('/Users/yh/Desktop/test_data/small_test.conll'))
    print(cws.predict(s))
@@ -14,8 +14,7 @@ class SpanConverter:
        for match in re.finditer(self.pattern, sentence):
            start, end = match.span()
            span = sentence[start:end]
            replaced_sentence += sentence[prev_end:start] + \
                                 self.span_to_special_tag(span)
            replaced_sentence += sentence[prev_end:start] + self.span_to_special_tag(span)
            prev_end = end
        replaced_sentence += sentence[prev_end:]
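
As a standalone illustration of the substitution loop reformatted above (function name and tag are hypothetical; the logic mirrors the hunk):

import re

def replace_spans(sentence, pattern, tag):
    # mirror of the loop above: each regex match is swapped for a special
    # tag, while the text between matches is copied through verbatim
    replaced, prev_end = '', 0
    for match in re.finditer(pattern, sentence):
        start, end = match.span()
        replaced += sentence[prev_end:start] + tag
        prev_end = end
    return replaced + sentence[prev_end:]

print(replace_spans('call 12345 or 678', r'\d+', '<NUM>'))  # -> 'call <NUM> or <NUM>'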
@@ -56,8 +55,8 @@ class DigitSpanConverter(SpanConverter):
        for idx, char in enumerate(span):
            if char == '.' or char == '﹒' or char == '·':
                decimal_point_count += 1
        if span[-1] == '.' or span[-1] == '﹒' or span[
            -1] == '·':  # last digit being decimal point means this is not a number
        if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
            # last digit being decimal point means this is not a number
            if decimal_point_count == 1:
                return span
            else:
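
A worked sketch of the decimal-point rule in this hunk (function name hypothetical; the predicate mirrors the branch shown):

def trailing_dot_means_punctuation(span):
    # mirrors the branch above: if the span ends in a decimal-point character
    # and contains exactly one, the trailing dot is sentence punctuation, so
    # the span is returned as-is rather than tagged as a decimal number
    dots = sum(1 for ch in span if ch in ('.', '﹒', '·'))
    return span[-1] in ('.', '﹒', '·') and dots == 1

print(trailing_dot_means_punctuation('14.'))   # True: '14' plus a full stop
print(trailing_dot_means_punctuation('3.14'))  # False: a genuine decimal number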
@@ -53,7 +53,7 @@ class DataSet(object):
            length_set = set()
            for key, value in data.items():
                length_set.add(len(value))
            assert len(length_set)==1, "Arrays must all be same length."
            assert len(length_set) == 1, "Arrays must all be same length."
            for key, value in data.items():
                self.add_field(name=key, fields=value)
        elif isinstance(data, list):

@@ -191,10 +191,11 @@ class DataSet(object):
        else:
            return results

if __name__ == '__main__':
    from fastNLP.core.instance import Instance

    d = DataSet({'a': list('abc')})
    d.a
    _ = d.a
    d.apply(lambda x: x['a'])
    print(d[1])
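
A short sketch of the dict constructor and apply() exercised in the __main__ block above (field names are illustrative; all columns must have the same length, or the assertion in __init__ fires):

from fastNLP.core.dataset import DataSet

d = DataSet({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})  # equal-length columns
doubled = d.apply(lambda ins: ins['x'] * 2)          # row-wise, returns a list of results
print(d[1])                                          # single-instance access, as in the demo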
@@ -4,7 +4,8 @@ import torch.nn.functional as F

class CNN_text(nn.Module):
    def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300, num_classes=2, dropout=0.5, L2_constrain=3,
    def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300, num_classes=2, dropout=0.5,
                 L2_constrain=3,
                 pretrained_embeddings=None):
        super(CNN_text, self).__init__()

@@ -16,7 +17,7 @@ class CNN_text(nn.Module):
        # the network structure
        # Conv2d: input - N,C,H,W  output - (50,100,62,1)
        self.conv1 = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_h])
        self.fc1 = nn.Linear(len(kernel_h)*kernel_num, num_classes)
        self.fc1 = nn.Linear(len(kernel_h) * kernel_num, num_classes)

    def max_pooling(self, x):
        x = F.relu(self.conv1(x)).squeeze(3)  # N,C,L - (50,100,62)

@@ -34,7 +35,8 @@ class CNN_text(nn.Module):
        x = self.fc1(x)
        return x

if __name__ == '__main__':
    model = CNN_text(kernel_h=[1, 2, 3, 4],embed_num=3, embed_dim=2)
    model = CNN_text(kernel_h=[1, 2, 3, 4], embed_num=3, embed_dim=2)
    x = torch.LongTensor([[1, 2, 1, 2, 0]])
    print(model(x))
    print(model(x))
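
One note on the model above: self.conv1 is an nn.ModuleList, which is not callable as a whole, so the usual Kim-style text-CNN forward applies each Conv2d separately and concatenates the pooled features. A hedged sketch of that pattern (shapes follow the comments in the hunk; this is the conventional recipe, not necessarily this model's exact forward pass):

import torch
import torch.nn as nn
import torch.nn.functional as F

# batch 50, seq len 64, embed dim 300, as in the shape comments above
convs = nn.ModuleList([nn.Conv2d(1, 100, (k, 300)) for k in (3, 4, 5)])
x = torch.randn(50, 1, 64, 300)                                  # N, C=1, H=seq, W=embed
feats = [F.relu(conv(x)).squeeze(3) for conv in convs]           # each: (50, 100, 64-k+1)
pooled = [F.max_pool1d(f, f.size(2)).squeeze(2) for f in feats]  # each: (50, 100)
out = torch.cat(pooled, dim=1)                                   # (50, 300), the input to fc1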
@@ -1,55 +1,17 @@
import unittest

import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance

raw_texts = ["i am a cat",
             "this is a test of new batch",
             "ha ha",
             "I am a good boy .",
             "This is the most beautiful girl ."
             ]
texts = [text.strip().split() for text in raw_texts]
labels = [0, 1, 0, 0, 1]

# prepare vocabulary
vocab = {}
for text in texts:
    for tokens in text:
        if tokens not in vocab:
            vocab[tokens] = len(vocab)
from fastNLP.core.sampler import SequentialSampler

class TestCase1(unittest.TestCase):
    def test(self):
        data = DataSet()
        for text, label in zip(texts, labels):
            x = TextField(text, is_target=False)
            y = LabelField(label, is_target=True)
            ins = Instance(raw_text=x, label=y)
            data.append(ins)

        # use vocabulary to index data
        # data.index_field("text", vocab)
        for ins in data:
            ins['text'] = [vocab.to_index(w) for w in ins['raw_text']]
        dataset = DataSet([Instance(x=["I", "am", "here"])] * 40)
        batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), use_cuda=False)

        # define naive sampler for batch class
        class SeqSampler:
            def __call__(self, dataset):
                return list(range(len(dataset)))
        for batch_x, batch_y in batch:
            print(batch_x, batch_y)

        # use batch to iterate dataset
        data_iterator = Batch(data, 2, SeqSampler(), False)
        total_data = 0
        for batch_x, batch_y in data_iterator:
            total_data += batch_x["text"].size(0)
            self.assertTrue(batch_x["text"].size(0) == 2 or total_data == len(raw_texts))
            self.assertTrue(isinstance(batch_x, dict))
            self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
            self.assertTrue(isinstance(batch_y, dict))
            self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))

# TODO: weird due to change in dataset.py
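
Distilled from the surviving new-style test above, a minimal runnable sketch (imports and call signatures exactly as in the hunk; with 40 identical instances and batch_size=4 the loop yields 10 batches, and per the removed assertions each batch side is a dict of field name to tensor):

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.sampler import SequentialSampler

dataset = DataSet([Instance(x=["I", "am", "here"])] * 40)
batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), use_cuda=False)
for batch_x, batch_y in batch:
    print(batch_x, batch_y)  # dicts mapping field names to batched tensors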
@@ -1,7 +1,5 @@
import unittest

from fastNLP.io.dataset_loader import convert_seq2seq_dataset, convert_seq_dataset

class TestDataSet(unittest.TestCase):

    labeled_data_list = [
@@ -18,37 +16,5 @@ class TestDataSet(unittest.TestCase):
    label_vocab = {"1": 1, "2": 2, "3": 3, "4": 4}

    def test_case_1(self):
        data_set = convert_seq2seq_dataset(self.labeled_data_list)
        data_set.index_field("word_seq", self.word_vocab)
        data_set.index_field("label_seq", self.label_vocab)
        self.assertEqual(len(data_set), len(self.labeled_data_list))
        self.assertTrue(len(data_set) > 0)
        self.assertTrue(hasattr(data_set[0], "fields"))
        self.assertTrue("word_seq" in data_set[0].fields)
        self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
        self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
        self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
        self.assertEqual(data_set[0].fields["word_seq"]._index,
                         [self.word_vocab[c] for c in self.labeled_data_list[0][0]])
        self.assertTrue("label_seq" in data_set[0].fields)
        self.assertTrue(hasattr(data_set[0].fields["label_seq"], "text"))
        self.assertTrue(hasattr(data_set[0].fields["label_seq"], "_index"))
        self.assertEqual(data_set[0].fields["label_seq"].text, self.labeled_data_list[0][1])
        self.assertEqual(data_set[0].fields["label_seq"]._index,
                         [self.label_vocab[c] for c in self.labeled_data_list[0][1]])

    def test_case_2(self):
        data_set = convert_seq_dataset(self.unlabeled_data_list)
        data_set.index_field("word_seq", self.word_vocab)
        self.assertEqual(len(data_set), len(self.unlabeled_data_list))
        self.assertTrue(len(data_set) > 0)
        self.assertTrue(hasattr(data_set[0], "fields"))
        self.assertTrue("word_seq" in data_set[0].fields)
        self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
        self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
        self.assertEqual(data_set[0].fields["word_seq"].text, self.unlabeled_data_list[0])
        self.assertEqual(data_set[0].fields["word_seq"]._index,
                         [self.word_vocab[c] for c in self.unlabeled_data_list[0]])
        # TODO:
        pass
@@ -2,10 +2,10 @@ import os
import unittest

from fastNLP.core.dataset import DataSet
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.tester import Tester
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "pku_training.utf8"

@@ -49,7 +49,7 @@ class TestTester(unittest.TestCase):
        model = SeqLabeling(model_args)

        tester = SeqLabelTester(**valid_args)
        tester = Tester(**valid_args)
        tester.test(network=model, dev_data=data_set)
        # If this can run, everything is OK.
@@ -2,12 +2,12 @@ import os
import unittest

from fastNLP.core.dataset import DataSet
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
from fastNLP.core.loss import Loss
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.core.trainer import Trainer
from fastNLP.models.sequence_modeling import SeqLabeling

@@ -23,7 +23,7 @@ class TestTrainer(unittest.TestCase):
            "num_classes": 5,
            "evaluator": SeqLabelEvaluator()
        }
        trainer = SeqLabelTrainer(**args)
        trainer = Trainer(**args)

        train_data = [
            [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
@@ -1,9 +1,9 @@
import os

from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.core.predictor import Predictor
from fastNLP.core.tester import Tester
from fastNLP.core.trainer import Trainer
from fastNLP.core.utils import save_pickle, load_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.config_loader import ConfigLoader, ConfigSection

@@ -41,7 +41,7 @@ def infer():
    infer_data.index_field("word_seq", word2index)
    infer_data.set_origin_len("word_seq")

    # inference
    infer = SeqLabelInfer(pickle_path)
    infer = Predictor(pickle_path)
    results = infer.predict(model, infer_data)
    print(results)

@@ -66,7 +66,7 @@ def train_test():
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)
    trainer = Trainer(**train_args.data)

    # Model
    model = SeqLabeling(train_args)

@@ -92,7 +92,7 @@ def train_test():
    test_args["evaluator"] = SeqLabelEvaluator()

    # Tester
    tester = SeqLabelTester(**test_args.data)
    tester = Tester(**test_args.data)

    # Start testing
    data_train.set_target(truth=True)
@@ -2,8 +2,8 @@ import os

from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.core.tester import Tester
from fastNLP.core.trainer import Trainer
from fastNLP.core.utils import save_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.config_loader import ConfigLoader, ConfigSection

@@ -40,7 +40,7 @@ def test_training():
    save_pickle(word_vocab, pickle_path, "word2id.pkl")
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    trainer = SeqLabelTrainer(
    trainer = Trainer(
        epochs=trainer_args["epochs"],
        batch_size=trainer_args["batch_size"],
        validate=False,

@@ -74,12 +74,12 @@ def test_training():
    ConfigLoader().load_config(config_dir, {"test_seq_label_tester": tester_args})

    # Tester
    tester = SeqLabelTester(batch_size=4,
                            use_cuda=False,
                            pickle_path=pickle_path,
                            model_name="seq_label_in_test.pkl",
                            evaluator=SeqLabelEvaluator()
                            )
    tester = Tester(batch_size=4,
                    use_cuda=False,
                    pickle_path=pickle_path,
                    model_name="seq_label_in_test.pkl",
                    evaluator=SeqLabelEvaluator()
                    )

    # Start testing with validation data
    data_dev.set_target(truth=True)
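
The last several hunks apply one mechanical rename across the tests and example scripts: the task-specific classes give way to generic ones with unchanged constructor arguments. A hedged summary sketch, with import paths as shown in the hunks and the superseded usages kept in comments:

from fastNLP.core.trainer import Trainer      # was: SeqLabelTrainer
from fastNLP.core.tester import Tester        # was: SeqLabelTester
from fastNLP.core.predictor import Predictor  # was: SeqLabelInfer

# constructor and call signatures carry over unchanged, per the hunks above, e.g.:
# trainer = Trainer(**train_args.data)   # was SeqLabelTrainer(**train_args.data)
# tester  = Tester(**test_args.data)     # was SeqLabelTester(**test_args.data)
# infer   = Predictor(pickle_path)       # was SeqLabelInfer(pickle_path)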