From ba28702e689a2ea063849c2d3b098049d3fd8cfc Mon Sep 17 00:00:00 2001
From: yunfan
Date: Sat, 12 Jan 2019 11:22:09 +0800
Subject: [PATCH] update Biaffine Parser, Variational RNN; add parser API

---
 fastNLP/api/api.py                         |  41 +--
 fastNLP/api/processor.py                   |  24 +-
 fastNLP/core/utils.py                      |   2 +-
 fastNLP/models/biaffine_parser.py          |  29 +-
 fastNLP/modules/encoder/variational_rnn.py | 166 +++++++---
 reproduction/Biaffine_parser/cfg.cfg       |  17 +-
 reproduction/Biaffine_parser/run.py        | 336 ++++++++-------------
 test/models/test_biaffine_parser.py        |  11 +-
 8 files changed, 307 insertions(+), 319 deletions(-)

diff --git a/fastNLP/api/api.py b/fastNLP/api/api.py
index 47c29214..cb46963d 100644
--- a/fastNLP/api/api.py
+++ b/fastNLP/api/api.py
@@ -202,30 +202,30 @@ class Parser(API):
         if model_path is None:
             model_path = model_urls['parser']
 
+        self.pos_tagger = POS(device=device)
         self.load(model_path, device)
 
     def predict(self, content):
         if not hasattr(self, 'pipeline'):
             raise ValueError("You have to load model first.")
 
-        sentence_list = []
-        # 1. check the type of the input sentence(s)
-        if isinstance(content, str):
-            sentence_list.append(content)
-        elif isinstance(content, list):
-            sentence_list = content
+        # 1. use the POS tagger to get word segmentation and POS tagging results
+        pos_out = self.pos_tagger.predict(content)
+        # pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()]
 
         # 2. build the dataset
         dataset = DataSet()
-        dataset.add_field('words', sentence_list)
-        # dataset.add_field('tag', sentence_list)
+        dataset.add_field('wp', pos_out)
+        dataset.apply(lambda x: ['']+[w.split('/')[0] for w in x['wp']], new_field_name='words')
+        dataset.apply(lambda x: ['']+[w.split('/')[1] for w in x['wp']], new_field_name='pos')
 
         # 3. run the pipeline
         self.pipeline(dataset)
-        for ins in dataset:
-            ins['heads'] = ins['heads'].tolist()
-
-        return dataset['heads'], dataset['labels']
+        dataset.apply(lambda x: [str(arc) for arc in x['arc_pred']], new_field_name='arc_pred')
+        dataset.apply(lambda x: [arc + '/' + label for arc, label in
+                                 zip(x['arc_pred'], x['label_pred_seq'])][1:], new_field_name='output')
+        # output like: [['2/top', '0/root', '4/nn', '2/dep']]
+        return dataset.field_arrays['output'].content
 
     def test(self, filepath):
         data = ConllxDataLoader().load(filepath)
@@ -301,12 +301,12 @@ class Analyzer:
 
 
 if __name__ == "__main__":
-    pos_model_path = '/home/zyfeng/fastnlp/reproduction/pos_tag_model/model_pp.pkl'
-    pos = POS(pos_model_path, device='cpu')
-    s = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
-         '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
-         '那么这款无人机到底有多厉害?']
-    print(pos.test("/home/zyfeng/data/sample.conllx"))
+    # pos_model_path = '/home/zyfeng/fastnlp/reproduction/pos_tag_model/model_pp.pkl'
+    # pos = POS(pos_model_path, device='cpu')
+    # s = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
+    #      '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
+    #      '那么这款无人机到底有多厉害?']
+    # print(pos.test("/home/zyfeng/data/sample.conllx"))
     # print(pos.predict(s))
 
     # cws_model_path = '../../reproduction/chinese_word_segment/models/cws_crf.pkl'
@@ -317,9 +317,10 @@ if __name__ == "__main__":
     # print(cws.test('/Users/yh/Desktop/test_data/cws_test.conll'))
     # print(cws.predict(s))
 
-    # parser = Parser(device='cpu')
+    parser_path = '/home/yfshao/workdir/fastnlp/reproduction/Biaffine_parser/pipe.pkl'
+    parser = Parser(parser_path, device='cpu')
     # print(parser.test('/Users/yh/Desktop/test_data/parser_test2.conll'))
     s = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
         '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
         '那么这款无人机到底有多厉害?']
-    # print(parser.predict(s))
+    print(parser.predict(s))
diff --git a/fastNLP/api/processor.py b/fastNLP/api/processor.py
index afa8775b..0838d10b 100644
--- a/fastNLP/api/processor.py
+++ b/fastNLP/api/processor.py
@@ -302,15 +302,23 @@ class Index2WordProcessor(Processor):
         return dataset
 
 
-class SetIsTargetProcessor(Processor):
+class SetTargetProcessor(Processor):
     # TODO; remove it.
-    def __init__(self, field_dict, default=False):
-        super(SetIsTargetProcessor, self).__init__(None, None)
-        self.field_dict = field_dict
-        self.default = default
+    def __init__(self, *fields, flag=True):
+        super(SetTargetProcessor, self).__init__(None, None)
+        self.fields = fields
+        self.flag = flag
 
     def process(self, dataset):
-        set_dict = {name: self.default for name in dataset.get_all_fields().keys()}
-        set_dict.update(self.field_dict)
-        dataset.set_target(*set_dict.keys())
+        dataset.set_target(*self.fields, flag=self.flag)
+        return dataset
+
+class SetInputProcessor(Processor):
+    def __init__(self, *fields, flag=True):
+        super(SetInputProcessor, self).__init__(None, None)
+        self.fields = fields
+        self.flag = flag
+
+    def process(self, dataset):
+        dataset.set_input(*self.fields, flag=self.flag)
         return dataset
diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py
index d751fba1..cc44a6c4 100644
--- a/fastNLP/core/utils.py
+++ b/fastNLP/core/utils.py
@@ -400,7 +400,7 @@ def seq_lens_to_masks(seq_lens, float=False):
         assert len(np.shape(seq_lens)) == 1, f"seq_lens can only have one dimension, got {len(np.shape(seq_lens))}."
         assert seq_lens.dtype in (int, np.int32, np.int64), f"seq_lens can only be integer, not {seq_lens.dtype}."
         raise NotImplemented
-    elif isinstance(seq_lens, torch.LongTensor):
+    elif isinstance(seq_lens, torch.Tensor):
         assert len(seq_lens.size()) == 1, f"seq_lens can only have one dimension, got {len(seq_lens.size())==1}."
         batch_size = seq_lens.size(0)
         max_len = seq_lens.max()
diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py
index efb07f34..fb687301 100644
--- a/fastNLP/models/biaffine_parser.py
+++ b/fastNLP/models/biaffine_parser.py
@@ -134,17 +134,13 @@ class GraphParser(BaseModel):
 
     def _mst_decoder(self, arc_matrix, mask=None):
         batch_size, seq_len, _ = arc_matrix.shape
-        matrix = torch.zeros_like(arc_matrix).copy_(arc_matrix)
+        matrix = arc_matrix.clone()
         ans = matrix.new_zeros(batch_size, seq_len).long()
         lens = (mask.long()).sum(1) if mask is not None else torch.zeros(batch_size) + seq_len
         batch_idx = torch.arange(batch_size, dtype=torch.long, device=lens.device)
-        mask[batch_idx, lens-1] = 0
         for i, graph in enumerate(matrix):
             len_i = lens[i]
-            if len_i == seq_len:
-                ans[i] = torch.as_tensor(mst(graph.cpu().numpy()), device=ans.device)
-            else:
-                ans[i, :len_i] = torch.as_tensor(mst(graph[:len_i, :len_i].cpu().numpy()), device=ans.device)
+            ans[i, :len_i] = torch.as_tensor(mst(graph.detach()[:len_i, :len_i].cpu().numpy()), device=ans.device)
         if mask is not None:
             ans *= mask.long()
         return ans
@@ -219,6 +215,7 @@
         self.pos_fc = nn.Linear(pos_emb_dim, pos_hid_dim)
         self.word_norm = nn.LayerNorm(word_hid_dim)
         self.pos_norm = nn.LayerNorm(pos_hid_dim)
+        self.use_var_lstm = use_var_lstm
         if use_var_lstm:
             self.lstm = VarLSTM(input_size=word_hid_dim + pos_hid_dim,
                                 hidden_size=rnn_hidden_size,
@@ -249,10 +246,9 @@
         self.label_dep_mlp = copy.deepcopy(self.label_head_mlp)
         self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True)
         self.label_predictor = LabelBilinear(label_mlp_size, label_mlp_size, num_label, bias=True)
-        self.normal_dropout = nn.Dropout(p=dropout)
         self.use_greedy_infer = use_greedy_infer
         self.reset_parameters()
-        self.explore_p = 0.2
+        self.dropout = dropout
 
     def reset_parameters(self):
         for m in self.modules():
@@ -278,18 +274,15 @@
             head_pred: [batch_size, seq_len] if gold_heads is not provided, predicting the heads
         """
         # prepare embeddings
-        device = self.parameters().__next__().device
-        word_seq = word_seq.long().to(device)
-        pos_seq = pos_seq.long().to(device)
-        seq_lens = seq_lens.long().to(device).view(-1)
         batch_size, seq_len = word_seq.shape
         # print('forward {} {}'.format(batch_size, seq_len))
 
         # get sequence mask
         mask = seq_mask(seq_lens, seq_len).long()
 
-        word = self.normal_dropout(self.word_embedding(word_seq))  # [N,L] -> [N,L,C_0]
-        pos = self.normal_dropout(self.pos_embedding(pos_seq))  # [N,L] -> [N,L,C_1]
+        word = self.word_embedding(word_seq)  # [N,L] -> [N,L,C_0]
+        pos = self.pos_embedding(pos_seq)  # [N,L] -> [N,L,C_1]
+
         word, pos = self.word_fc(word), self.pos_fc(pos)
         word, pos = self.word_norm(word), self.pos_norm(pos)
         x = torch.cat([word, pos], dim=2)  # -> [N,L,C]
@@ -325,7 +318,7 @@
             head_pred = heads
         else:
             assert self.training  # must be training mode
-            if torch.rand(1).item() < self.explore_p:
+            if gold_heads is None:
                 heads = self._greedy_decoder(arc_pred, mask)
                 head_pred = heads
             else:
@@ -355,7 +348,7 @@
 
         batch_size, seq_len, _ = arc_pred.shape
         flip_mask = (mask == 0)
-        _arc_pred = arc_pred.new_empty((batch_size, seq_len, seq_len)).copy_(arc_pred)
+        _arc_pred = arc_pred.clone()
         _arc_pred.masked_fill_(flip_mask.unsqueeze(1), -np.inf)
         arc_logits = F.log_softmax(_arc_pred, dim=2)
         label_logits = F.log_softmax(label_pred, dim=2)
@@ -421,7 +414,9 @@
         if seq_lens is None:
             seq_mask = arc_pred.new_ones(arc_pred.size(), dtype=torch.long)
         else:
-            seq_mask = seq_lens_to_masks(seq_lens, float=False).long()
+            seq_mask = seq_lens_to_masks(seq_lens.long(), float=False).long()
+            # mask out tag
+            seq_mask[:,0] = 0
         head_pred_correct = (arc_pred == arc_true).long() * seq_mask
         label_pred_correct = (label_pred == label_true).long() * head_pred_correct
         self.num_arc += head_pred_correct.sum().item()
diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py
index f4a37cf4..fa8e0fcb 100644
--- a/fastNLP/modules/encoder/variational_rnn.py
+++ b/fastNLP/modules/encoder/variational_rnn.py
@@ -2,8 +2,7 @@ import math
 
 import torch
 import torch.nn as nn
-from torch.nn.utils.rnn import PackedSequence
-
+from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
 from fastNLP.modules.utils import initial_parameter
 
 try:
@@ -25,30 +24,63 @@ class VarRnnCellWrapper(nn.Module):
         self.input_p = input_p
         self.hidden_p = hidden_p
 
-    def forward(self, input, hidden, mask_x=None, mask_h=None):
+    def forward(self, input_x, hidden, mask_x, mask_h, is_reversed=False):
         """
-        :param input: [seq_len, batch_size, input_size]
+        :param PackedSequence input_x: [seq_len, batch_size, input_size]
         :param hidden: for LSTM, tuple of (h_0, c_0), [batch_size, hidden_size]
             for other RNN, h_0, [batch_size, hidden_size]
         :param mask_x: [batch_size, input_size] dropout mask for input
         :param mask_h: [batch_size, hidden_size] dropout mask for hidden
-        :return output: [seq_len, bacth_size, hidden_size]
+        :return PackedSequence output: [seq_len, bacth_size, hidden_size]
             hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]
             for other RNN, h_n, [batch_size, hidden_size]
         """
+        def get_hi(hi, h0, size):
+            h0_size = size - hi.size(0)
+            if h0_size > 0:
+                return torch.cat([hi, h0[:h0_size]], dim=0)
+            return hi[:size]
         is_lstm = isinstance(hidden, tuple)
-        input = input * mask_x.unsqueeze(0) if mask_x is not None else input
-        output_list = []
-        for x in input:
+        input, batch_sizes = input_x
+        output = []
+        cell = self.cell
+        if is_reversed:
+            batch_iter = flip(batch_sizes, [0])
+            idx = input.size(0)
+        else:
+            batch_iter = batch_sizes
+            idx = 0
+
+        if is_lstm:
+            hn = (hidden[0].clone(), hidden[1].clone())
+        else:
+            hn = hidden.clone()
+        hi = hidden
+        for size in batch_iter:
+            if is_reversed:
+                input_i = input[idx-size: idx] * mask_x[:size]
+                idx -= size
+            else:
+                input_i = input[idx: idx+size] * mask_x[:size]
+                idx += size
+            mask_hi = mask_h[:size]
             if is_lstm:
-                hx, cx = hidden
-                hidden = (hx * mask_h, cx) if mask_h is not None else (hx, cx)
+                hx, cx = hi
+                hi = (get_hi(hx, hidden[0], size) * mask_hi, get_hi(cx, hidden[1], size))
+                hi = cell(input_i, hi)
+                hn[0][:size] = hi[0]
+                hn[1][:size] = hi[1]
+                output.append(hi[0])
             else:
-                hidden *= mask_h if mask_h is not None else hidden
-            hidden = self.cell(x, hidden)
-            output_list.append(hidden[0] if is_lstm else hidden)
-        output = torch.stack(output_list, dim=0)
-        return output, hidden
+                hi = get_hi(hi, hidden, size) * mask_hi
+                hi = cell(input_i, hi)
+                hn[:size] = hi
+                output.append(hi)
+
+        if is_reversed:
+            output = list(reversed(output))
+        output = torch.cat(output, dim=0)
+        return PackedSequence(output, batch_sizes), hn
 
 
 class VarRNNBase(nn.Module):
@@ -77,60 +109,67 @@
                 cell = Cell(input_size, self.hidden_size, bias)
                 self._all_cells.append(VarRnnCellWrapper(cell, self.hidden_size, input_dropout, hidden_dropout))
         initial_parameter(self)
+        self.is_lstm = (self.mode == "LSTM")
+
+    def _forward_one(self, n_layer, n_direction, input, hx, mask_x, mask_h):
+        is_lstm = self.is_lstm
+        idx = self.num_directions * n_layer + n_direction
+        cell = self._all_cells[idx]
+        hi = (hx[0][idx], hx[1][idx]) if is_lstm else hx[idx]
+        output_x, hidden_x = cell(input, hi, mask_x, mask_h, is_reversed=(n_direction == 1))
+        return output_x, hidden_x
 
     def forward(self, input, hx=None):
+        is_lstm = self.is_lstm
         is_packed = isinstance(input, PackedSequence)
-        is_lstm = (self.mode == "LSTM")
-        if is_packed:
-            input, batch_sizes = input
-            max_batch_size = int(batch_sizes[0])
-        else:
-            batch_sizes = None
+        if not is_packed:
+            seq_len = input.size(1) if self.batch_first else input.size(0)
             max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            seq_lens = torch.LongTensor([seq_len for _ in range(max_batch_size)])
+            input, batch_sizes = pack_padded_sequence(input, seq_lens, batch_first=self.batch_first)
+        else:
+            max_batch_size = int(input.batch_sizes[0])
+            input, batch_sizes = input
 
         if hx is None:
             hx = input.new_zeros(self.num_layers * self.num_directions,
-                                 max_batch_size, self.hidden_size,
-                                 requires_grad=False)
+                                 max_batch_size, self.hidden_size, requires_grad=True)
             if is_lstm:
-                hx = (hx, hx)
-
-        if self.batch_first:
-            input = input.transpose(0, 1)
-        batch_size = input.shape[1]
+                hx = (hx, hx.new_zeros(hx.size(), requires_grad=True))
 
-        mask_x = input.new_ones((batch_size, self.input_size))
-        mask_out = input.new_ones((batch_size, self.hidden_size * self.num_directions))
-        mask_h_ones = input.new_ones((batch_size, self.hidden_size))
+        mask_x = input.new_ones((max_batch_size, self.input_size))
+        mask_out = input.new_ones((max_batch_size, self.hidden_size * self.num_directions))
+        mask_h_ones = input.new_ones((max_batch_size, self.hidden_size))
         nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True)
         nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True)
 
-        hidden_list = []
+        hidden = input.new_zeros((self.num_layers*self.num_directions, max_batch_size, self.hidden_size))
+        if is_lstm:
+            cellstate = input.new_zeros((self.num_layers*self.num_directions, max_batch_size, self.hidden_size))
         for layer in range(self.num_layers):
            output_list = []
+            input_seq = PackedSequence(input, batch_sizes)
            mask_h = nn.functional.dropout(mask_h_ones, p=self.hidden_dropout, training=self.training, inplace=False)
            for direction in range(self.num_directions):
-                input_x = input if direction == 0 else flip(input, [0])
+                output_x, hidden_x = self._forward_one(layer, direction, input_seq, hx,
+                                                       mask_x if layer == 0 else mask_out, mask_h)
+                output_list.append(output_x.data)
                 idx = self.num_directions * layer + direction
-                cell = self._all_cells[idx]
-                hi = (hx[0][idx], hx[1][idx]) if is_lstm else hx[idx]
-                mask_xi = mask_x if layer == 0 else mask_out
-                output_x, hidden_x = cell(input_x, hi, mask_xi, mask_h)
-                output_list.append(output_x if direction == 0 else flip(output_x, [0]))
-                hidden_list.append(hidden_x)
+                if is_lstm:
+                    hidden[idx] = hidden_x[0]
+                    cellstate[idx] = hidden_x[1]
+                else:
+                    hidden[idx] = hidden_x
            input = torch.cat(output_list, dim=-1)
 
-        output = input.transpose(0, 1) if self.batch_first else input
         if is_lstm:
-            h_list, c_list = zip(*hidden_list)
-            hn = torch.stack(h_list, dim=0)
-            cn = torch.stack(c_list, dim=0)
-            hidden = (hn, cn)
-        else:
-            hidden = torch.stack(hidden_list, dim=0)
+            hidden = (hidden, cellstate)
 
         if is_packed:
-            output = PackedSequence(output, batch_sizes)
+            output = PackedSequence(input, batch_sizes)
+        else:
+            input = PackedSequence(input, batch_sizes)
+            output, _ = pad_packed_sequence(input, batch_first=self.batch_first)
 
         return output, hidden
@@ -152,3 +191,36 @@
     """
     def __init__(self, *args, **kwargs):
         super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs)
+
+# if __name__ == '__main__':
+#     x = torch.Tensor([[1,2,3], [4,5,0], [6,0,0]])[:,:,None] * 0.1
+#     mask = (x != 0).float().view(3, -1)
+#     seq_lens = torch.LongTensor([3,2,1])
+#     y = torch.Tensor([[0,1,1], [1,1,0], [0,0,0]])
+#     # rev = _reverse_packed_sequence(pack)
+#     # # print(rev)
+#     lstm = VarLSTM(input_size=1, num_layers=2, hidden_size=2,
+#                    batch_first=True, bidirectional=True,
+#                    input_dropout=0.0, hidden_dropout=0.0,)
+#     # lstm = nn.LSTM(input_size=1, num_layers=2, hidden_size=2,
+#     #                batch_first=True, bidirectional=True,)
+#     loss = nn.BCELoss()
+#     m = nn.Sigmoid()
+#     optim = torch.optim.SGD(lstm.parameters(), lr=1e-3)
+#     for i in range(2000):
+#         optim.zero_grad()
+#         pack = pack_padded_sequence(x, seq_lens, batch_first=True)
+#         out, hidden = lstm(pack)
+#         out, lens = pad_packed_sequence(out, batch_first=True)
+#         # print(lens)
+#         # print(out)
+#         # print(hidden[0])
+#         # print(hidden[0].size())
+#         # print(hidden[1])
+#         out = out.sum(-1)
+#         out = m(out) * mask
+#         l = loss(out, y)
+#         l.backward()
+#         optim.step()
+#         if i % 50 == 0:
+#             print(out)
diff --git a/reproduction/Biaffine_parser/cfg.cfg b/reproduction/Biaffine_parser/cfg.cfg
index 8ee6f5fe..9b00c209 100644
--- a/reproduction/Biaffine_parser/cfg.cfg
+++ b/reproduction/Biaffine_parser/cfg.cfg
@@ -1,13 +1,8 @@
 [train]
-epochs = -1
-batch_size = 16
-pickle_path = "./save/"
-validate = true
-save_best_dev = true
-eval_sort_key = "UAS"
+n_epochs = 40
+batch_size = 32
 use_cuda = true
-model_saved_path = "./save/"
-print_every_step = 20
+validate_every = 500
 use_golden_train=true
 
 [test]
@@ -32,9 +27,9 @@ arc_mlp_size = 500
 label_mlp_size = 100
 num_label = -1
 dropout = 0.33
-use_var_lstm=false
+use_var_lstm=true
 use_greedy_infer=false
 
 [optim]
-lr = 2e-3
-weight_decay = 5e-5
+lr = 3e-4
+;weight_decay = 3e-5
diff --git a/reproduction/Biaffine_parser/run.py b/reproduction/Biaffine_parser/run.py
index 0519201a..656da201 100644
--- a/reproduction/Biaffine_parser/run.py
+++ b/reproduction/Biaffine_parser/run.py
@@ -3,24 +3,26 @@ import sys
 
 sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
 
+import fastNLP
 import torch
-import re
 
 from fastNLP.core.trainer import Trainer
-from fastNLP.core.metrics import Evaluator
 from fastNLP.core.instance import Instance
+from fastNLP.api.pipeline import Pipeline
+from fastNLP.models.biaffine_parser import BiaffineParser, ParserMetric, ParserLoss
 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.core.dataset import DataSet
-from fastNLP.core.field import TextField, SeqLabelField
 from fastNLP.core.tester import Tester
 from fastNLP.io.config_io import ConfigLoader, ConfigSection
-from fastNLP.io.model_io import ModelLoader, ModelSaver
+from fastNLP.io.model_io import ModelLoader
 from fastNLP.io.embed_loader import EmbedLoader
-from fastNLP.models.biaffine_parser import BiaffineParser
+from fastNLP.io.model_io import ModelSaver
+from reproduction.Biaffine_parser.util import ConllxDataLoader, MyDataloader
+from fastNLP.api.processor import *
 
 BOS = ''
 EOS = ''
-UNK = ''
+UNK = ''
 NUM = ''
 ENG = ''
@@ -28,85 +30,25 @@ ENG = ''
 
 if len(os.path.dirname(__file__)) != 0:
     os.chdir(os.path.dirname(__file__))
 
-class ConlluDataLoader(object):
-    def load(self, path):
-        datalist = []
-        with open(path, 'r', encoding='utf-8') as f:
-            sample = []
-            for line in f:
-                if line.startswith('\n'):
-                    datalist.append(sample)
-                    sample = []
-                elif line.startswith('#'):
-                    continue
-                else:
-                    sample.append(line.split('\t'))
-            if len(sample) > 0:
-                datalist.append(sample)
-
-        ds = DataSet(name='conll')
-        for sample in datalist:
-            # print(sample)
-            res = self.get_one(sample)
-            ds.append(Instance(word_seq=TextField(res[0], is_target=False),
-                               pos_seq=TextField(res[1], is_target=False),
-                               head_indices=SeqLabelField(res[2], is_target=True),
-                               head_labels=TextField(res[3], is_target=True)))
-
-        return ds
-
-    def get_one(self, sample):
-        text = []
-        pos_tags = []
-        heads = []
-        head_tags = []
-        for w in sample:
-            t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
-            if t3 == '_':
-                continue
-            text.append(t1)
-            pos_tags.append(t2)
-            heads.append(int(t3))
-            head_tags.append(t4)
-        return (text, pos_tags, heads, head_tags)
-
-class CTBDataLoader(object):
-    def load(self, data_path):
-        with open(data_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-        data = self.parse(lines)
-        return self.convert(data)
-
-    def parse(self, lines):
-        """
-        [
-            [word], [pos], [head_index], [head_tag]
-        ]
-        """
-        sample = []
-        data = []
-        for i, line in enumerate(lines):
-            line = line.strip()
-            if len(line) == 0 or i+1 == len(lines):
-                data.append(list(map(list, zip(*sample))))
-                sample = []
-            else:
-                sample.append(line.split())
-        return data
-
-    def convert(self, data):
-        dataset = DataSet()
-        for sample in data:
-            word_seq = [BOS] + sample[0] + [EOS]
-            pos_seq = [BOS] + sample[1] + [EOS]
-            heads = [0] + list(map(int, sample[2])) + [0]
-            head_tags = [BOS] + sample[3] + [EOS]
-            dataset.append(Instance(word_seq=TextField(word_seq, is_target=False),
-                                    pos_seq=TextField(pos_seq, is_target=False),
-                                    gold_heads=SeqLabelField(heads, is_target=False),
-                                    head_indices=SeqLabelField(heads, is_target=True),
-                                    head_labels=TextField(head_tags, is_target=True)))
-        return dataset
+def convert(data):
+    dataset = DataSet()
+    for sample in data:
+        word_seq = [BOS] + sample[0]
+        pos_seq = [BOS] + sample[1]
+        heads = [0] + list(map(int, sample[2]))
+        head_tags = [BOS] + sample[3]
+        dataset.append(Instance(words=word_seq,
+                                pos=pos_seq,
+                                gold_heads=heads,
+                                arc_true=heads,
+                                tags=head_tags))
+    return dataset
+
+
+def load(path):
+    data = ConllxDataLoader().load(path)
+    return convert(data)
+
 
 # datadir = "/mnt/c/Me/Dev/release-2.2-st-train-dev-data/ud-treebanks-v2.2/UD_English-EWT"
 # datadir = "/home/yfshao/UD_English-EWT"
 # emb_file_name = '/home/yfshao/glove.6B.100d.txt'
 # loader = ConlluDataLoader()
 
-datadir = '/home/yfshao/workdir/parser-data/'
-train_data_name = "train_ctb5.txt"
-dev_data_name = "dev_ctb5.txt"
-test_data_name = "test_ctb5.txt"
-emb_file_name = "/home/yfshao/workdir/parser-data/word_OOVthr_30_100v.txt"
-# emb_file_name = "/home/yfshao/workdir/word_vector/cc.zh.300.vec"
-loader = CTBDataLoader()
+# datadir = '/home/yfshao/workdir/parser-data/'
+# train_data_name = "train_ctb5.txt"
+# dev_data_name = "dev_ctb5.txt"
+# test_data_name = "test_ctb5.txt"
+
+datadir = "/home/yfshao/workdir/ctb7.0/"
+train_data_name = "train.conllx"
+dev_data_name = "dev.conllx"
+test_data_name = "test.conllx"
+# emb_file_name = "/home/yfshao/workdir/parser-data/word_OOVthr_30_100v.txt"
+emb_file_name = "/home/yfshao/workdir/word_vector/cc.zh.300.vec"
 
 cfgfile = './cfg.cfg'
 processed_datadir = './save'
 
 # Config Loader
 train_args = ConfigSection()
-test_args = ConfigSection()
 model_args = ConfigSection()
 optim_args = ConfigSection()
-ConfigLoader.load_config(cfgfile, {"train": train_args, "test": test_args, "model": model_args, "optim": optim_args})
+ConfigLoader.load_config(cfgfile, {"train": train_args, "model": model_args, "optim": optim_args})
 print('trainre Args:', train_args.data)
-print('test Args:', test_args.data)
-print('optim Args:', optim_args.data)
+print('model Args:', model_args.data)
+print('optim_args', optim_args.data)
 
 
 # Pickle Loader
@@ -159,84 +104,36 @@ def load_data(dirpath):
     return datas
 
 def P2(data, field, length):
-    ds = [ins for ins in data if ins[field].get_length() >= length]
+    ds = [ins for ins in data if len(ins[field]) >= length]
     data.clear()
     data.extend(ds)
     return ds
 
-def P1(data, field):
-    def reeng(w):
-        return w if w == BOS or w == EOS or re.search(r'^([a-zA-Z]+[\.\-]*)+$', w) is None else ENG
-    def renum(w):
-        return w if re.search(r'^[0-9]+\.?[0-9]*$', w) is None else NUM
-    for ins in data:
-        ori = ins[field].contents()
-        s = list(map(renum, map(reeng, ori)))
-        if s != ori:
-            # print(ori)
-            # print(s)
-            # print()
-            ins[field] = ins[field].new(s)
-    return data
-
-class ParserEvaluator(Evaluator):
-    def __init__(self, ignore_label):
-        super(ParserEvaluator, self).__init__()
-        self.ignore = ignore_label
-
-    def __call__(self, predict_list, truth_list):
-        head_all, label_all, total_all = 0, 0, 0
-        for pred, truth in zip(predict_list, truth_list):
-            head, label, total = self.evaluate(**pred, **truth)
-            head_all += head
-            label_all += label
-            total_all += total
-
-        return {'UAS': head_all*1.0 / total_all, 'LAS': label_all*1.0 / total_all}
-
-    def evaluate(self, head_pred, label_pred, head_indices, head_labels, seq_mask, **_):
-        """
-        Evaluate the performance of prediction.
-
-        :return : performance results.
-            head_pred_corrct: number of correct predicted heads.
-            label_pred_correct: number of correct predicted labels.
-            total_tokens: number of predicted tokens
-        """
-        seq_mask *= (head_labels != self.ignore).long()
-        head_pred_correct = (head_pred == head_indices).long() * seq_mask
-        _, label_preds = torch.max(label_pred, dim=2)
-        label_pred_correct = (label_preds == head_labels).long() * head_pred_correct
-        return head_pred_correct.sum().item(), label_pred_correct.sum().item(), seq_mask.sum().item()
-
-try:
-    data_dict = load_data(processed_datadir)
-    word_v = data_dict['word_v']
-    pos_v = data_dict['pos_v']
-    tag_v = data_dict['tag_v']
-    train_data = data_dict['train_data']
-    dev_data = data_dict['dev_data']
-    test_data = data_dict['test_data']
-    print('use saved pickles')
-
-except Exception as _:
-    print('load raw data and preprocess')
-    # use pretrain embedding
-    word_v = Vocabulary(need_default=True, min_freq=2)
-    word_v.unknown_label = UNK
-    pos_v = Vocabulary(need_default=True)
-    tag_v = Vocabulary(need_default=False)
-    train_data = loader.load(os.path.join(datadir, train_data_name))
-    dev_data = loader.load(os.path.join(datadir, dev_data_name))
-    test_data = loader.load(os.path.join(datadir, test_data_name))
-    train_data.update_vocab(word_seq=word_v, pos_seq=pos_v, head_labels=tag_v)
-    datasets = (train_data, dev_data, test_data)
-    save_data(processed_datadir, word_v=word_v, pos_v=pos_v, tag_v=tag_v, train_data=train_data, dev_data=dev_data, test_data=test_data)
-
-embed, _ = EmbedLoader.load_embedding(model_args['word_emb_dim'], emb_file_name, 'glove', word_v, os.path.join(processed_datadir, 'word_emb.pkl'))
-
-print(len(word_v))
-print(embed.size())
+def update_v(vocab, data, field):
+    data.apply(lambda x: vocab.add_word_lst(x[field]), new_field_name=None)
+
+
+print('load raw data and preprocess')
+# use pretrain embedding
+word_v = Vocabulary()
+word_v.unknown_label = UNK
+pos_v = Vocabulary()
+tag_v = Vocabulary(unknown=None, padding=None)
+train_data = load(os.path.join(datadir, train_data_name))
+dev_data = load(os.path.join(datadir, dev_data_name))
+test_data = load(os.path.join(datadir, test_data_name))
+print(train_data[0])
+num_p = Num2TagProcessor('words', 'words')
+for ds in (train_data, dev_data, test_data):
+    num_p(ds)
+
+update_v(word_v, train_data, 'words')
+update_v(pos_v, train_data, 'pos')
+update_v(tag_v, train_data, 'tags')
+
+print('vocab build success {}, {}, {}'.format(len(word_v), len(pos_v), len(tag_v)))
+# embed, _ = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
+# print(embed.size())
 
 # Model
 model_args['word_vocab_size'] = len(word_v)
@@ -245,50 +142,49 @@
 model_args['num_label'] = len(tag_v)
 
 model = BiaffineParser(**model_args.data)
 model.reset_parameters()
-datasets = (train_data, dev_data, test_data)
-for ds in datasets:
-    ds.index_field("word_seq", word_v).index_field("pos_seq", pos_v).index_field("head_labels", tag_v)
-    ds.set_origin_len('word_seq')
+
+word_idxp = IndexerProcessor(word_v, 'words', 'word_seq')
+pos_idxp = IndexerProcessor(pos_v, 'pos', 'pos_seq')
+tag_idxp = IndexerProcessor(tag_v, 'tags', 'label_true')
+seq_p = SeqLenProcessor('word_seq', 'seq_lens')
+
+set_input_p = SetInputProcessor('word_seq', 'pos_seq', 'seq_lens', flag=True)
+set_target_p = SetTargetProcessor('arc_true', 'label_true', 'seq_lens', flag=True)
+
+label_toword_p = Index2WordProcessor(vocab=tag_v, field_name='label_pred', new_added_field_name='label_pred_seq')
+
+for ds in (train_data, dev_data, test_data):
+    word_idxp(ds)
+    pos_idxp(ds)
+    tag_idxp(ds)
+    seq_p(ds)
+    set_input_p(ds)
+    set_target_p(ds)
+
 if train_args['use_golden_train']:
-    train_data.set_target(gold_heads=False)
-else:
-    train_data.set_target(gold_heads=None)
+    train_data.set_input('gold_heads', flag=True)
 train_args.data.pop('use_golden_train')
-ignore_label = pos_v['P']
+ignore_label = pos_v['punct']
 
 print(test_data[0])
-print(len(train_data))
-print(len(dev_data))
-print(len(test_data))
+print('train len {}'.format(len(train_data)))
+print('dev len {}'.format(len(dev_data)))
+print('test len {}'.format(len(test_data)))
 
 
 def train(path):
+    # test saving pipeline
+    save_pipe(path)
+
     # Trainer
-    trainer = Trainer(**train_args.data)
-
-    def _define_optim(obj):
-        lr = optim_args.data['lr']
-        embed_params = set(obj._model.word_embedding.parameters())
-        decay_params = set(obj._model.arc_predictor.parameters()) | set(obj._model.label_predictor.parameters())
-        params = [p for p in obj._model.parameters() if p not in decay_params and p not in embed_params]
-        obj._optimizer = torch.optim.Adam([
-            {'params': list(embed_params), 'lr':lr*0.1},
-            {'params': list(decay_params), **optim_args.data},
-            {'params': params}
-        ], lr=lr, betas=(0.9, 0.9))
-        obj._scheduler = torch.optim.lr_scheduler.LambdaLR(obj._optimizer, lambda ep: max(.75 ** (ep / 5e4), 0.05))
-
-    def _update(obj):
-        # torch.nn.utils.clip_grad_norm_(obj._model.parameters(), 5.0)
-        obj._scheduler.step()
-        obj._optimizer.step()
-
-    trainer.define_optimizer = lambda: _define_optim(trainer)
-    trainer.update = lambda: _update(trainer)
-    trainer.set_validator(Tester(**test_args.data, evaluator=ParserEvaluator(ignore_label)))
-
-    model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False)
+    trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
+                      loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
+                      **train_args.data,
+                      optimizer=fastNLP.Adam(**optim_args.data),
+                      save_path=path)
+
+    # model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False)
     model.word_embedding.padding_idx = word_v.padding_idx
     model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
     model.pos_embedding.padding_idx = pos_v.padding_idx
@@ -302,18 +198,23 @@ def train(path):
     #     pass
 
     # Start training
-    trainer.train(model, train_data, dev_data)
+    trainer.train()
     print("Training finished!")
 
-    # Saver
-    saver = ModelSaver("./save/saved_model.pkl")
-    saver.save_pytorch(model)
-    print("Model saved!")
+    # save pipeline
+    save_pipe(path)
+    print('pipe saved')
+
+def save_pipe(path):
+    pipe = Pipeline(processors=[num_p, word_idxp, pos_idxp, seq_p, set_input_p])
+    pipe.add_processor(ModelProcessor(model=model, batch_size=32))
+    pipe.add_processor(label_toword_p)
+    torch.save(pipe, os.path.join(path, 'pipe.pkl'))
 
 
 def test(path):
     # Tester
-    tester = Tester(**test_args.data, evaluator=ParserEvaluator(ignore_label))
+    tester = Tester(**test_args.data)
 
     # Model
     model = BiaffineParser(**model_args.data)
@@ -333,13 +234,18 @@ def test(path):
     print("Testing Test data")
     tester.test(model, test_data)
 
+def build_pipe(parser_pipe_path):
+    parser_pipe = torch.load(parser_pipe_path)
+
+
 if __name__ == "__main__":
     import argparse
 
     parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
-    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
+    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer', 'save'])
     parser.add_argument('--path', type=str, default='')
+    # parser.add_argument('--dst', type=str, default='')
     args = parser.parse_args()
     if args.mode == 'train':
         train(args.path)
@@ -347,6 +253,12 @@ if __name__ == "__main__":
         test(args.path)
     elif args.mode == 'infer':
         pass
+    # elif args.mode == 'save':
+    #     print(f'save model from {args.path} to {args.dst}')
+    #     save_model(args.path, args.dst)
+    #     load_path = os.path.dirname(args.dst)
+    #     print(f'save pipeline in {load_path}')
+    #     build(load_path)
     else:
         print('no mode specified for model!')
         parser.print_help()
diff --git a/test/models/test_biaffine_parser.py b/test/models/test_biaffine_parser.py
index 8fafd00b..54935f76 100644
--- a/test/models/test_biaffine_parser.py
+++ b/test/models/test_biaffine_parser.py
@@ -10,6 +10,8 @@ data_file = """
 4 will _ AUX MD _ 6 aux _ _
 5 be _ VERB VB _ 6 cop _ _
 6 payable _ ADJ JJ _ 0 root _ _
+7 mask _ ADJ JJ _ 6 punct _ _
+8 mask _ ADJ JJ _ 6 punct _ _
 9 cents _ NOUN NNS _ 4 nmod _ _
 10 from _ ADP IN _ 12 case _ _
 11 seven _ NUM CD _ 12 nummod _ _
@@ -58,13 +60,13 @@ def init_data():
             data.append(line)
 
     for name in ['word_seq', 'pos_seq', 'label_true']:
-        ds.apply(lambda x: ['']+list(x[name])+[''], new_field_name=name)
+        ds.apply(lambda x: ['']+list(x[name]), new_field_name=name)
         ds.apply(lambda x: v[name].add_word_lst(x[name]))
 
     for name in ['word_seq', 'pos_seq', 'label_true']:
         ds.apply(lambda x: [v[name].to_index(w) for w in x[name]], new_field_name=name)
 
-    ds.apply(lambda x: [0]+list(map(int, x['arc_true']))+[1], new_field_name='arc_true')
+    ds.apply(lambda x: [0]+list(map(int, x['arc_true'])), new_field_name='arc_true')
     ds.apply(lambda x: len(x['word_seq']), new_field_name='seq_lens')
     ds.set_input('word_seq', 'pos_seq', 'seq_lens', flag=True)
     ds.set_target('arc_true', 'label_true', 'seq_lens', flag=True)
@@ -75,8 +77,11 @@ class TestBiaffineParser(unittest.TestCase):
         ds, v1, v2, v3 = init_data()
         model = BiaffineParser(word_vocab_size=len(v1), word_emb_dim=30,
                                pos_vocab_size=len(v2), pos_emb_dim=30,
-                               num_label=len(v3))
+                               num_label=len(v3), use_var_lstm=True)
         trainer = fastNLP.Trainer(model=model, train_data=ds, dev_data=ds,
                                   loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
                                   n_epochs=10, use_cuda=False, use_tqdm=False)
         trainer.train(load_best_model=False)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
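
Usage sketch (reviewer note, not part of the patch): a minimal example of how the reworked Parser API in fastNLP/api/api.py is meant to be called after a pipeline has been trained and saved by reproduction/Biaffine_parser/run.py; the pipe.pkl path below is hypothetical and must be replaced with a real saved pipeline.

    from fastNLP.api.api import Parser

    # Hypothetical path to a pipeline produced by run.py's save_pipe(); adjust to your environment.
    parser = Parser('/path/to/pipe.pkl', device='cpu')

    # predict() now runs the bundled POS tagger first, so raw (untokenized) sentences are accepted.
    # Each output token is formatted as 'head_index/label', e.g. ['2/top', '0/root', '4/nn', '2/dep'].
    print(parser.predict(['那么这款无人机到底有多厉害?']))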