diff --git a/fastNLP/api/pos_tagger.py b/fastNLP/api/pos_tagger.py
new file mode 100644
index 00000000..fbd689c1
--- /dev/null
+++ b/fastNLP/api/pos_tagger.py
@@ -0,0 +1,44 @@
+import pickle
+
+import numpy as np
+
+from fastNLP.core.dataset import DataSet
+from fastNLP.loader.model_loader import ModelLoader
+from fastNLP.core.predictor import Predictor
+
+
+class POS_tagger:
+    def __init__(self):
+        pass
+
+    def predict(self, query):
+        """
+        :param query: List[str]
+        :return answer: List[str]
+
+        """
+        # TODO: build a DataSet from the query
+        pos_dataset = DataSet()
+        pos_dataset["text_field"] = np.array(query)
+
+        # load the pipeline
+        pipeline = self.load_pipeline("./xxxx")
+
+        # run the pipeline on the DataSet
+        pos_dataset = pipeline(pos_dataset)
+
+        # load the model
+        model = ModelLoader.load_pytorch_model("./xxx")
+
+        # call the predictor
+        predictor = Predictor()
+        output = predictor.predict(model, pos_dataset)
+
+        # TODO: convert to the final output
+        return None
+
+    @staticmethod
+    def load_pipeline(path):
+        with open(path, "rb") as fp:
+            pipeline = pickle.load(fp)
+        return pipeline
diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py
index 0381d267..397a3ddb 100644
--- a/fastNLP/core/batch.py
+++ b/fastNLP/core/batch.py
@@ -56,8 +56,8 @@ class Batch(object):
         indices = self.idx_list[self.curidx:endidx]
 
         for field_name, field in self.dataset.get_fields():
-            batch = field.get(indices)
-            if not field.tensorable: #TODO 修改
+            batch = torch.from_numpy(field.get(indices))
+            if not field.need_tensor:  # TODO: revise
                 pass
             elif field.is_target:
                 batch_y[field_name] = batch
diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py
index c2a10210..e626ff26 100644
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -2,10 +2,12 @@ import random
 import sys
 from collections import defaultdict
 from copy import deepcopy
+import numpy as np
 
 from fastNLP.core.field import TextField, LabelField
 from fastNLP.core.instance import Instance
 from fastNLP.core.vocabulary import Vocabulary
+from fastNLP.core.fieldarray import FieldArray
 
 _READERS = {}
 
@@ -14,43 +16,36 @@ class DataSet(object):
     """
 
-    def __init__(self, fields=None):
-        """
-
-        """
-        pass
-
-    def index_all(self, vocab):
-        for ins in self:
-            ins.index_all(vocab)
-        return self
+    def __init__(self, instance=None):
+        # make sure the field container exists before any conversion
+        self.field_arrays = {}
+        if instance is not None:
+            self._convert_ins(instance)
 
-    def index_field(self, field_name, vocab):
-        if isinstance(field_name, str):
-            field_list = [field_name]
-            vocab_list = [vocab]
+    def _convert_ins(self, ins_list):
+        if isinstance(ins_list, list):
+            for ins in ins_list:
+                self.append(ins)
         else:
-            classes = (list, tuple)
-            assert isinstance(field_name, classes) and isinstance(vocab, classes) and len(field_name) == len(vocab)
-            field_list = field_name
-            vocab_list = vocab
-
-        for name, vocabs in zip(field_list, vocab_list):
-            for ins in self:
-                ins.index_field(name, vocabs)
-        return self
+            self.append(ins_list)
 
-    def to_tensor(self, idx: int, padding_length: dict):
-        """Convert an instance in a dataset to tensor.
+    def append(self, ins):
+        # the first appended instance defines the fields of this DataSet
+        if len(self.field_arrays) == 0:
+            for name, field in ins.fields.items():
+                self.field_arrays[name] = FieldArray(name, [field])
+        else:
+            assert len(self.field_arrays) == len(ins.fields)
+            for name, field in ins.fields.items():
+                assert name in self.field_arrays
+                self.field_arrays[name].append(field)
 
-        :param idx: int, the index of the instance in the dataset.
-        :param padding_length: int
-        :return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
-                tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
+    def get_fields(self):
+        return self.field_arrays.items()
 
-        """
-        ins = self[idx]
-        return ins.to_tensor(padding_length, self.origin_len)
+    def __len__(self):
+        field = next(iter(self.field_arrays.values()))
+        return len(field)
 
     def get_length(self):
         """Fetch lengths of all fields in all instances in a dataset.
@@ -59,15 +54,10 @@ class DataSet(object):
                 The list contains lengths of this field in all instances.
 
         """
-        lengths = defaultdict(list)
-        for ins in self:
-            for field_name, field_length in ins.get_length().items():
-                lengths[field_name].append(field_length)
-        return lengths
+        pass
 
     def shuffle(self):
-        random.shuffle(self)
-        return self
+        pass
 
     def split(self, ratio, shuffle=True):
         """Train/dev splitting
 
@@ -78,58 +68,37 @@ class DataSet(object):
             dev_set: a DataSet object, representing the validation set
 
         """
-        assert 0 < ratio < 1
-        if shuffle:
-            self.shuffle()
-        split_idx = int(len(self) * ratio)
-        dev_set = deepcopy(self)
-        train_set = deepcopy(self)
-        del train_set[:split_idx]
-        del dev_set[split_idx:]
-        return train_set, dev_set
+        pass
 
     def rename_field(self, old_name, new_name):
         """rename a field
         """
-        for ins in self:
-            ins.rename_field(old_name, new_name)
+        if old_name in self.field_arrays:
+            self.field_arrays[new_name] = self.field_arrays.pop(old_name)
+        else:
+            raise KeyError("no such field: {}".format(old_name))
         return self
 
-    def set_target(self, **fields):
+    def set_is_target(self, **fields):
         """Change the flag of `is_target` for all instance. For fields not set here, leave their `is_target` unchanged.
 
-        :param key-value pairs for field-name and `is_target` value(True, False or None).
-        """
-        for ins in self:
-            ins.set_target(**fields)
-        return self
-
-    def update_vocab(self, **name_vocab):
-        """using certain field data to update vocabulary.
-
-        e.g. ::
-
-            # update word vocab and label vocab seperately
-            dataset.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
+        :param fields: key-value pairs, mapping a field name to its `is_target` flag (True or False).
         """
-        for field_name, vocab in name_vocab.items():
-            for ins in self:
-                vocab.update(ins[field_name].contents())
+        for name, val in fields.items():
+            if name in self.field_arrays:
+                assert isinstance(val, bool)
+                self.field_arrays[name].is_target = val
+            else:
+                raise KeyError("no such field: {}".format(name))
         return self
 
-    def set_origin_len(self, origin_field, origin_len_name=None):
-        """make dataset tensor output contain origin_len field.
-
-        e.g. ::
-
-            # output "word_seq_origin_len", lengths based on "word_seq" field
-            dataset.set_origin_len("word_seq")
-        """
-        if origin_field is None:
-            self.origin_len = None
-        else:
-            self.origin_len = (origin_field + "_origin_len", origin_field) \
-                if origin_len_name is None else (origin_len_name, origin_field)
+    def set_need_tensor(self, **kwargs):
+        for name, val in kwargs.items():
+            if name in self.field_arrays:
+                assert isinstance(val, bool)
+                self.field_arrays[name].need_tensor = val
+            else:
+                raise KeyError("no such field: {}".format(name))
         return self
 
     def __getattribute__(self, name):
diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py
index 8720bf1b..5b9c1b63 100644
--- a/fastNLP/core/field.py
+++ b/fastNLP/core/field.py
@@ -7,10 +7,9 @@ class Field(object):
     """
 
-    def __init__(self, name, is_target: bool):
-        self.name = name
+    def __init__(self, content, is_target: bool):
         self.is_target = is_target
-        self.content = None
+        self.content = content
 
     def index(self, vocab):
         """create index field
@@ -29,23 +28,15 @@
         raise NotImplementedError
 
     def __repr__(self):
-        return self.contents().__repr__()
-
-    def new(self, *args, **kwargs):
-        return self.__class__(*args, **kwargs, is_target=self.is_target)
+        return self.content.__repr__()
 
 
 class TextField(Field):
-    def __init__(self, name, text, is_target):
+    def __init__(self, text, is_target):
        """
        :param text: list of strings
        :param is_target: bool
        """
-        super(TextField, self).__init__(name, is_target)
-        self.content = text
-
-    def index(self, vocab):
-        idx_field = IndexField(self.name+'_idx', self.content, vocab, self.is_target)
-        return idx_field
+        super(TextField, self).__init__(text, is_target)
 
 
 class IndexField(Field):
@@ -82,75 +73,19 @@ class LabelField(Field):
     """
     def __init__(self, label, is_target=True):
-        super(LabelField, self).__init__(is_target)
-        self.label = label
-        self._index = None
+        super(LabelField, self).__init__(label, is_target)
 
-    def get_length(self):
-        """Fetch the length of the label field.
-
-        :return length: int, the length of the label, always 1.
-        """
-        return 1
-
-    def index(self, vocab):
-        if self._index is None:
-            if isinstance(self.label, str):
-                self._index = vocab[self.label]
-        return self._index
-
-    def to_tensor(self, padding_length):
-        if self._index is None:
-            if isinstance(self.label, int):
-                return torch.tensor(self.label)
-            elif isinstance(self.label, str):
-                raise RuntimeError("Field {} not indexed. Call index method.".format(self.label))
-            else:
-                raise RuntimeError(
-                    "Not support type for LabelField. Expect str or int, got {}.".format(type(self.label)))
-        else:
-            return torch.LongTensor([self._index])
-
-    def contents(self):
-        return [self.label]
 
 
 class SeqLabelField(Field):
     def __init__(self, label_seq, is_target=True):
-        super(SeqLabelField, self).__init__(is_target)
-        self.label_seq = label_seq
-        self._index = None
-
-    def get_length(self):
-        return len(self.label_seq)
-
-    def index(self, vocab):
-        if self._index is None:
-            self._index = [vocab[c] for c in self.label_seq]
-        return self._index
-
-    def to_tensor(self, padding_length):
-        pads = [0] * (padding_length - self.get_length())
-        if self._index is None:
-            if self.get_length() == 0:
-                return torch.LongTensor(pads)
-            elif isinstance(self.label_seq[0], int):
-                return torch.LongTensor(self.label_seq + pads)
-            elif isinstance(self.label_seq[0], str):
-                raise RuntimeError("Field {} not indexed. Call index method.".format(self.label))
-            else:
-                raise RuntimeError(
-                    "Not support type for SeqLabelField. Expect str or int, got {}.".format(type(self.label)))
-        else:
-            return torch.LongTensor(self._index + pads)
-
-    def contents(self):
-        return self.label_seq.copy()
+        super(SeqLabelField, self).__init__(label_seq, is_target)
 
 
 class CharTextField(Field):
     def __init__(self, text, max_word_len, is_target=False):
         super(CharTextField, self).__init__(is_target)
-        self.text = text
+        # TODO
+        raise NotImplementedError
         self.max_word_len = max_word_len
         self._index = []
diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py
new file mode 100644
index 00000000..9710f991
--- /dev/null
+++ b/fastNLP/core/fieldarray.py
@@ -0,0 +1,39 @@
+import torch
+import numpy as np
+
+class FieldArray(object):
+    def __init__(self, name, content, padding_val=0, is_target=True, need_tensor=True):
+        self.name = name
+        self.data = [self._convert_np(val) for val in content]
+        self.padding_val = padding_val
+        self.is_target = is_target
+        self.need_tensor = need_tensor
+
+    def _convert_np(self, val):
+        if not isinstance(val, np.ndarray):
+            return np.array(val)
+        else:
+            return val
+
+    def append(self, val):
+        self.data.append(self._convert_np(val))
+
+    def get(self, idxes):
+        if isinstance(idxes, int):
+            return self.data[idxes]
+        elif isinstance(idxes, list):
+            id_list = np.array(idxes)
+            batch_size = len(id_list)
+            len_list = [(i, self.data[i].shape[0]) for i in id_list]
+            _, max_len = max(len_list, key=lambda x: x[1])
+            array = np.full((batch_size, max_len), self.padding_val, dtype=np.int32)
+
+            for i, (idx, length) in enumerate(len_list):
+                if length == max_len:
+                    array[i] = self.data[idx]
+                else:
+                    array[i][:length] = self.data[idx]
+            return array
+
+    def __len__(self):
+        return len(self.data)
diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py
index 50787fd1..a2686da8 100644
--- a/fastNLP/core/instance.py
+++ b/fastNLP/core/instance.py
@@ -7,8 +7,6 @@ class Instance(object):
 
     def __init__(self, **fields):
         self.fields = fields
-        self.has_index = False
-        self.indexes = {}
 
     def add_field(self, field_name, field):
         self.fields[field_name] = field
@@ -17,8 +15,6 @@ class Instance(object):
     def rename_field(self, old_name, new_name):
         if old_name in self.fields:
             self.fields[new_name] = self.fields.pop(old_name)
-            if old_name in self.indexes:
-                self.indexes[new_name] = self.indexes.pop(old_name)
         else:
             raise KeyError("error, no such field: {}".format(old_name))
         return self
@@ -38,53 +34,5 @@ class Instance(object):
     def __setitem__(self, name, field):
         return self.add_field(name, field)
 
-    def get_length(self):
-        """Fetch the length of all fields in the instance.
-
-        :return length: dict of (str: int), which means (field name: field length).
-
-        """
-        length = {name: field.get_length() for name, field in self.fields.items()}
-        return length
-
-    def index_field(self, field_name, vocab):
-        """use `vocab` to index certain field
-        """
-        self.indexes[field_name] = self.fields[field_name].index(vocab)
-        return self
-
-    def index_all(self, vocab):
-        """use `vocab` to index all fields
-        """
-        if self.has_index:
-            print("error")
-            return self.indexes
-        indexes = {name: field.index(vocab) for name, field in self.fields.items()}
-        self.indexes = indexes
-        return indexes
-
-    def to_tensor(self, padding_length: dict, origin_len=None):
-        """Convert instance to tensor.
-
-        :param padding_length: dict of (str: int), which means (field name: padding_length of this field)
-        :return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
-                tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
-                If is_target is False for all fields, tensor_y would be an empty dict.
-        """
-        tensor_x = {}
-        tensor_y = {}
-        for name, field in self.fields.items():
-            if field.is_target is True:
-                tensor_y[name] = field.to_tensor(padding_length[name])
-            elif field.is_target is False:
-                tensor_x[name] = field.to_tensor(padding_length[name])
-            else:
-                # is_target is None
-                continue
-        if origin_len is not None:
-            name, field_name = origin_len
-            tensor_x[name] = torch.LongTensor([self.fields[field_name].get_length()])
-        return tensor_x, tensor_y
-
     def __repr__(self):
         return self.fields.__repr__()
\ No newline at end of file
diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py
index c5d22df4..63e5b7ca 100644
--- a/fastNLP/core/predictor.py
+++ b/fastNLP/core/predictor.py
@@ -2,9 +2,7 @@ import numpy as np
 import torch
 
 from fastNLP.core.batch import Batch
-from fastNLP.core.preprocess import load_pickle
 from fastNLP.core.sampler import SequentialSampler
-from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq2tag_dataset, convert_seq_dataset
 
 
 class Predictor(object):
@@ -16,19 +14,9 @@ class Predictor(object):
     Currently, Predictor does not support GPU.
     """
 
-    def __init__(self, pickle_path, post_processor):
-        """
-
-        :param pickle_path: str, the path to the pickle files.
-        :param post_processor: a function or callable object, that takes list of batch outputs as input
-
-        """
+    def __init__(self):
         self.batch_size = 1
         self.batch_output = []
-        self.pickle_path = pickle_path
-        self._post_processor = post_processor
-        self.label_vocab = load_pickle(self.pickle_path, "label2id.pkl")
-        self.word_vocab = load_pickle(self.pickle_path, "word2id.pkl")
 
     def predict(self, network, data):
         """Perform inference using the trained model.
@@ -37,9 +25,6 @@ class Predictor(object):
         :param data: a DataSet object.
         :return: list of list of strings, [num_examples, tag_seq_length]
         """
-        # transform strings into DataSet object
-        # data = self.prepare_input(data)
-
         # turn on the testing mode; clean up the history
         self.mode(network, test=True)
         batch_output = []
@@ -51,7 +36,7 @@ class Predictor(object):
             prediction = self.data_forward(network, batch_x)
             batch_output.append(prediction)
 
-        return self._post_processor(batch_output, self.label_vocab)
+        return batch_output
 
     def mode(self, network, test=True):
         if test:
@@ -64,37 +49,19 @@ class Predictor(object):
         y = network(**x)
         return y
 
-    def prepare_input(self, data):
-        """Transform two-level list of strings into an DataSet object.
-        In the training pipeline, this is done by Preprocessor. But in inference time, we do not call Preprocessor.
-
-        :param data: list of list of strings.
-        ::
-            [
-                [word_11, word_12, ...],
-                [word_21, word_22, ...],
-                ...
-            ]
-
-        :return data_set: a DataSet instance.
-        """
-        assert isinstance(data, list)
-        data = convert_seq_dataset(data)
-        data.index_field("word_seq", self.word_vocab)
-
 
 class SeqLabelInfer(Predictor):
     def __init__(self, pickle_path):
         print(
             "[FastNLP Warning] SeqLabelInfer will be deprecated. Please use Predictor directly.")
-        super(SeqLabelInfer, self).__init__(pickle_path, seq_label_post_processor)
+        super(SeqLabelInfer, self).__init__()
 
 
 class ClassificationInfer(Predictor):
     def __init__(self, pickle_path):
         print(
             "[FastNLP Warning] ClassificationInfer will be deprecated. Please use Predictor directly.")
-        super(ClassificationInfer, self).__init__(pickle_path, text_classify_post_processor)
+        super(ClassificationInfer, self).__init__()
 
 
 def seq_label_post_processor(batch_outputs, label_vocab):
diff --git a/fastNLP/loader/model_loader.py b/fastNLP/loader/model_loader.py
index c07576b8..5c8a1371 100644
--- a/fastNLP/loader/model_loader.py
+++ b/fastNLP/loader/model_loader.py
@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader):
     Loader for models.
     """
 
-    def __init__(self, data_path):
-        super(ModelLoader, self).__init__(data_path)
+    def __init__(self):
+        super(ModelLoader, self).__init__()
 
     @staticmethod
     def load_pytorch(empty_model, model_path):
@@ -19,3 +19,10 @@ class ModelLoader(BaseLoader):
         :param model_path: str, the path to the saved model.
         """
         empty_model.load_state_dict(torch.load(model_path))
+
+    @staticmethod
+    def load_pytorch_model(model_path):
+        """Load the entire model.
+
+        """
+        return torch.load(model_path)
\ No newline at end of file
diff --git a/fastNLP/models/sequence_modeling.py b/fastNLP/models/sequence_modeling.py
index 464f99be..11e49ee1 100644
--- a/fastNLP/models/sequence_modeling.py
+++ b/fastNLP/models/sequence_modeling.py
@@ -127,7 +127,8 @@ class AdvSeqLabel(SeqLabeling):
         :param word_seq: LongTensor, [batch_size, mex_len]
         :param word_seq_origin_len: list of int.
         :param truth: LongTensor, [batch_size, max_len]
-        :return y:
+        :return y: If truth is None, return the decoded paths, a list of lists. Used in testing and prediction.
+                   If truth is not None, return the loss, a scalar. Used in training.
         """
         self.mask = self.make_mask(word_seq, word_seq_origin_len)
diff --git a/fastNLP/saver/model_saver.py b/fastNLP/saver/model_saver.py
index 74518a44..fd391f69 100644
--- a/fastNLP/saver/model_saver.py
+++ b/fastNLP/saver/model_saver.py
@@ -15,10 +15,14 @@ class ModelSaver(object):
         """
         self.save_path = save_path
 
-    def save_pytorch(self, model):
+    def save_pytorch(self, model, param_only=True):
         """Save a pytorch model into .pkl file.
 
         :param model: a PyTorch model
+        :param param_only: bool, whether to save only the model parameters or the entire model.
""" - torch.save(model.state_dict(), self.save_path) + if param_only is True: + torch.save(model.state_dict(), self.save_path) + else: + torch.save(model, self.save_path) diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py index 45cfbbc0..fb077fe3 100644 --- a/reproduction/pos_tag_model/train_pos_tag.py +++ b/reproduction/pos_tag_model/train_pos_tag.py @@ -59,42 +59,37 @@ def infer(): print("Inference finished!") -def train(): - # Config Loader - train_args = ConfigSection() - test_args = ConfigSection() - ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args}) +def train(): + # load config + trainer_args = ConfigSection() + model_args = ConfigSection() + ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args}) # Data Loader loader = PeopleDailyCorpusLoader() train_data, _ = loader.load() - # Preprocessor - preprocessor = SeqLabelPreprocess() - data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) - train_args["vocab_size"] = preprocessor.vocab_size - train_args["num_classes"] = preprocessor.num_classes + # TODO: define processors + + # define pipeline + pp = Pipeline() + # TODO: pp.add_processor() - # Trainer - trainer = SeqLabelTrainer(**train_args.data) + # run the pipeline, get data_set + train_data = pp(train_data) - # Model + # define a model model = AdvSeqLabel(train_args) - try: - ModelLoader.load_pytorch(model, "./save/saved_model.pkl") - print('model parameter loaded!') - except Exception as e: - print("No saved model. Continue.") - pass - # Start training + # call trainer to train + trainer = SeqLabelTrainer(train_args) trainer.train(model, data_train, data_dev) - print("Training finished!") - # Saver - saver = ModelSaver("./save/saved_model.pkl") - saver.save_pytorch(model) - print("Model saved!") + # save model + ModelSaver("./saved_model.pkl").save_pytorch(model, param_only=False) + + # TODO:save pipeline + def test(): diff --git a/requirements.txt b/requirements.txt index 954dd741..a775c8ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ numpy>=1.14.2 -torch==0.4.0 +torch>=0.4.0 torchvision>=0.1.8 tensorboardX