diff --git a/fastNLP/api/model_zoo.py b/fastNLP/api/model_zoo.py
index 9069ae55..a54a53d9 100644
--- a/fastNLP/api/model_zoo.py
+++ b/fastNLP/api/model_zoo.py
@@ -1,5 +1,3 @@
-import torch
-
 import hashlib
 import os
 import re
@@ -7,6 +5,8 @@
 import shutil
 import sys
 import tempfile
+import torch
+
 try:
     from requests.utils import urlparse
     from requests import get as urlopen
@@ -132,7 +132,3 @@ if tqdm is None:
             sys.stderr.write('\n')
-
-if __name__ == '__main__':
-    pipeline = load_url('http://10.141.208.102:5000/file/download/infer_context-4e86fd93.pkl', model_dir='.')
-    print(type(pipeline))
diff --git a/fastNLP/api/processor.py b/fastNLP/api/processor.py
index 711f2b67..d6a68412 100644
--- a/fastNLP/api/processor.py
+++ b/fastNLP/api/processor.py
@@ -1,14 +1,15 @@
-import torch
-from collections import defaultdict
 import re
+from collections import defaultdict
+
+import torch
 
-from fastNLP.core.dataset import DataSet
-from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.core.batch import Batch
+from fastNLP.core.dataset import DataSet
 from fastNLP.core.sampler import SequentialSampler
+from fastNLP.core.vocabulary import Vocabulary
 
 
-class Processor:
+class Processor(object):
     def __init__(self, field_name, new_added_field_name):
         self.field_name = field_name
         if new_added_field_name is None:
@@ -17,7 +18,7 @@ class Processor:
         self.new_added_field_name = new_added_field_name
 
     def process(self, *args, **kwargs):
-        pass
+        raise NotImplementedError
 
     def __call__(self, *args, **kwargs):
         return self.process(*args, **kwargs)
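Note: Processor.process now raises NotImplementedError instead of silently passing, so every subclass must override it. A minimal sketch of a conforming subclass (the class and field names are illustrative, not part of this patch):

    from fastNLP.api.processor import Processor

    class LowerCaseProcessor(Processor):
        # Hypothetical example: lower-case a string field, writing the result
        # into new_added_field_name (handled by the Processor base class).
        def process(self, dataset):
            for ins in dataset:
                ins[self.new_added_field_name] = ins[self.field_name].lower()
            return dataset

    # usage: ds = LowerCaseProcessor("raw_sentence", "raw_sentence_lower")(ds)
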
+ + """ def __init__(self, field_name): super(VocabProcessor, self).__init__(field_name, None) self.vocab = Vocabulary() @@ -178,17 +184,20 @@ class VocabProcessor(Processor): class SeqLenProcessor(Processor): - def __init__(self, field_name, new_added_field_name='seq_lens'): + def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True): super(SeqLenProcessor, self).__init__(field_name, new_added_field_name) + self.is_input = is_input def process(self, dataset): assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) for ins in dataset: length = len(ins[self.field_name]) ins[self.new_added_field_name] = length - dataset._set_need_tensor(**{self.new_added_field_name: True}) + if self.is_input: + dataset.set_input(self.new_added_field_name) return dataset + class ModelProcessor(Processor): def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32): """ @@ -238,6 +247,7 @@ class ModelProcessor(Processor): device = torch.device(device) self.model.to(device) + class Index2WordProcessor(Processor): def __init__(self, vocab, field_name, new_added_field_name): super(Index2WordProcessor, self).__init__(field_name, new_added_field_name) @@ -251,6 +261,7 @@ class Index2WordProcessor(Processor): class SetTensorProcessor(Processor): + # TODO: remove it. It is strange. def __init__(self, field_dict, default=False): super(SetTensorProcessor, self).__init__(None, None) self.field_dict = field_dict @@ -264,6 +275,7 @@ class SetTensorProcessor(Processor): class SetIsTargetProcessor(Processor): + # TODO; remove it. def __init__(self, field_dict, default=False): super(SetIsTargetProcessor, self).__init__(None, None) self.field_dict = field_dict diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index b16fe165..b62d5624 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -2,7 +2,7 @@ from .batch import Batch from .dataset import DataSet from .fieldarray import FieldArray from .instance import Instance -from .losses import Loss +from .losses import LossFromTorch from .optimizer import Optimizer from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler from .tester import Tester diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 57171e25..f4963d0a 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -9,32 +9,20 @@ from fastNLP.core.utils import get_func_signature _READERS = {} -def construct_dataset(sentences): - """Construct a data set from a list of sentences. - - :param sentences: list of list of str - :return dataset: a DataSet object - """ - dataset = DataSet() - for sentence in sentences: - instance = Instance() - instance['raw_sentence'] = sentence - dataset.append(instance) - return dataset - - class DataSet(object): """DataSet is the collection of examples. DataSet provides instance-level interface. You can append and access an instance of the DataSet. However, it stores data in a different way: Field-first, Instance-second. """ + def __init__(self, data=None): """ - :param data: a dict or a list. If it is a dict, the key is the name of a field and the value is the field. - All values must be of the same length. - If it is a list, it must be a list of Instance objects. + :param data: a dict or a list. + If `data` is a dict, the key is the name of a FieldArray and the value is the FieldArray. All values + must be of the same length. + If `data` is a list, it must be a list of Instance objects. 
""" self.field_arrays = {} if data is not None: @@ -60,6 +48,7 @@ class DataSet(object): def iter_func(): for idx in range(len(self)): yield self[idx] + return iter_func() def _inner_iter(self): @@ -69,7 +58,8 @@ class DataSet(object): self.idx = idx def __getitem__(self, item): - assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[self.idx]) + assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[ + self.idx]) assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] @@ -79,6 +69,7 @@ class DataSet(object): def inner_iter_func(): for idx in range(len(self)): yield Iter_ptr(self, idx) + return inner_iter_func() def __getitem__(self, idx): @@ -217,9 +208,17 @@ class DataSet(object): raise KeyError("{} is not a valid field name.".format(name)) def get_input_name(self): + """Get all field names with `is_input` as True. + + :return list field_names: a list of str + """ return [name for name, field in self.field_arrays.items() if field.is_input] def get_target_name(self): + """Get all field names with `is_target` as True. + + :return list field_names: a list of str + """ return [name for name, field in self.field_arrays.items() if field.is_target] @classmethod @@ -243,7 +242,7 @@ class DataSet(object): :return results: if new_field_name is not passed, returned values of the function over all instances. """ results = [func(ins) for ins in self._inner_iter()] - if len(list(filter(lambda x: x is not None, results)))==0: # all None + if len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(get_func_signature(func=func))) extra_param = {} @@ -269,6 +268,12 @@ class DataSet(object): return results def drop(self, func): + """Drop instances if a condition holds. + + :param func: a function that takes an Instance object as input, and returns bool. + The instance will be dropped if the function returns True. + + """ results = [ins for ins in self._inner_iter() if not func(ins)] for name, old_field in self.field_arrays.items(): self.field_arrays[name].content = [ins[name] for ins in results] @@ -338,10 +343,33 @@ class DataSet(object): return cls(_dict) def save(self, path): + """Save the DataSet object as pickle. + + :param str path: the path to the pickle + """ with open(path, 'wb') as f: pickle.dump(self, f) @staticmethod def load(path): + """Load a DataSet object from pickle. + + :param str path: the path to the pickle + :return DataSet data_set: + """ with open(path, 'rb') as f: return pickle.load(f) + + +def construct_dataset(sentences): + """Construct a data set from a list of sentences. 
diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py
index fbd64e81..ed935c9d 100644
--- a/fastNLP/core/losses.py
+++ b/fastNLP/core/losses.py
@@ -7,14 +7,13 @@ import torch.nn.functional as F
 from fastNLP.core.utils import CheckError
 from fastNLP.core.utils import CheckRes
 from fastNLP.core.utils import _build_args
-from fastNLP.core.utils import _check_function_or_method
 from fastNLP.core.utils import _check_arg_dict_list
+from fastNLP.core.utils import _check_function_or_method
 from fastNLP.core.utils import get_func_signature
 
 
 class LossBase(object):
     def __init__(self):
-        # key: name in target function; value: name in output function
         self.param_map = {}
         self._checked = False
 
@@ -159,8 +158,18 @@ class LossBase(object):
 
         return loss
 
+
 class LossFunc(LossBase):
+    """A wrapper of a user-provided loss function.
+
+    """
     def __init__(self, func, key_map=None, **kwargs):
+        """
+
+        :param func: a callable object, such as a function.
+        :param dict key_map: a dict mapping argument names of `func` to field names in the model output and target.
+        :param kwargs:
+        """
         super(LossFunc, self).__init__()
         _check_function_or_method(func)
         if key_map is not None:
@@ -254,19 +263,19 @@ def _prepare_losser(losser):
 
 
 def squash(predict, truth, **kwargs):
-    '''To reshape tensors in order to fit Loss functions in pytorch
+    """To reshape tensors in order to fit loss functions in PyTorch
 
    :param predict : Tensor, model output
    :param truth : Tensor, truth from dataset
    :param **kwargs : extra arguments

    :return predict , truth: predict & truth after processing
-    '''
+    """
    return predict.view(-1, predict.size()[-1]), truth.view(-1, )


def unpad(predict, truth, **kwargs):
-    '''To process padded sequence output to get true loss
+    """To process padded sequence output to get true loss
    Using pack_padded_sequence() method
    This method contains squash()

@@ -277,7 +286,7 @@ def unpad(predict, truth, **kwargs):
        the i-th element is true lengths of i-th sequence

    :return predict , truth: predict & truth after processing
-    '''
+    """
    if kwargs.get("lens") is None:
        return predict, truth
    lens = torch.LongTensor(kwargs["lens"])
@@ -288,7 +297,7 @@ def unpad(predict, truth, **kwargs):


def unpad_mask(predict, truth, **kwargs):
-    '''To process padded sequence output to get true loss
+    """To process padded sequence output to get true loss
    Using mask() method
    This method contains squash()

@@ -299,7 +308,7 @@ def unpad_mask(predict, truth, **kwargs):
        the i-th element is true lengths of i-th sequence

    :return predict , truth: predict & truth after processing
-    '''
+    """
    if kwargs.get("lens") is None:
        return predict, truth
    mas = make_mask(kwargs["lens"], truth.size()[1])
@@ -307,7 +316,7 @@ def unpad_mask(predict, truth, **kwargs):


def mask(predict, truth, **kwargs):
-    '''To select specific elements from Tensor
+    """To select specific elements from Tensor
    This method contains squash()

    :param predict : Tensor, [batch_size , max_len , tag_size]
@@ -317,7 +326,7 @@ def mask(predict, truth, **kwargs):
        the mask Tensor , the position that is 1 will be selected

    :return predict , truth: predict & truth after processing
-    '''
+    """
    if kwargs.get("mask") is None:
        return predict, truth
    mask = kwargs["mask"]
@@ -332,14 +341,14 @@ def mask(predict, truth, **kwargs):


def make_mask(lens, tar_len):
-    '''to generate a mask that select [:lens[i]] for i-th element
+    """To generate a mask that selects [:lens[i]] for the i-th element
    embezzle from fastNLP.models.sequence_modeling.seq_mask

    :param lens : list or LongTensor, [batch_size]
    :param tar_len : int

    :return mask : ByteTensor
-    '''
+    """
    lens = torch.LongTensor(lens)
    mask = [torch.ge(lens, i + 1) for i in range(tar_len)]
    mask = torch.stack(mask, 1)
@@ -376,9 +385,11 @@ loss_function_name = {
 }
 
 
-class Loss(object):
-    """a Loss object is a callable object represents loss functions
+class LossFromTorch(object):
+    """A LossFromTorch object is a callable object that represents a loss function
 
+    This class only helps you with loss functions from PyTorch.
+    It has nothing to do with Trainer.
     """
 
     def __init__(self, loss_name, pre_pro=[squash], **kwargs):
@@ -408,11 +419,11 @@
         self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro]
 
     def add_pre_pro(self, func):
-        '''add a pre_pro function
+        """add a pre_pro function
 
         :param func: a function or str, methods to reform parameters before calculating loss
             the strings will be auto translated to pre-defined functions
-        '''
+        """
         if not callable(func):
             func = method_dict.get(func)
             if func is None:
@@ -421,12 +432,12 @@
     @staticmethod
     def _get_loss(loss_name, **kwargs):
-        '''Get loss function from torch
+        """Get loss function from torch
 
         :param loss_name: str, the name of loss function
         :param **kwargs: kwargs for torch loss function
         :return: A callable loss function object
-        '''
+        """
         loss_name = loss_name.strip().lower()
         loss_name = "".join(loss_name.split("_"))
 
@@ -435,19 +446,19 @@
         return loss_function_name[loss_name](**kwargs)
 
     def get(self):
-        '''This method exists just for make some existing codes run error-freely
-        '''
+        """This method exists just to make some existing code run error-free
+        """
         return self
 
     def __call__(self, predict, truth, **kwargs):
-        '''call a loss function
+        """Call a loss function
         predict and truth will be processed by pre_pro methods in order of addition
 
         :param predict : Tensor, model output
         :param truth : Tensor, truth from dataset
         :param **kwargs : extra arguments, pass to pre_pro functions
             for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens
-        '''
+        """
         for f in self.pre_pro:
             if f is None:
                 continue
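The renamed LossFromTorch keeps the old Loss call pattern; condensed from the test updates in test/core/test_loss.py below (tensor values are illustrative; unpad() expects lens in decreasing order since it uses pack_padded_sequence):

    import torch
    from fastNLP.core.losses import LossFromTorch

    loss_func = LossFromTorch("nll", pre_pro=["unpad"])  # "nll" resolves to torch.nn.NLLLoss
    y = torch.log(torch.softmax(torch.randn(3, 4, 5), dim=-1))  # [batch, max_len, n_classes]
    gy = torch.randint(5, (3, 4))                               # [batch, max_len]
    los = loss_func(y, gy, lens=[4, 2, 1])
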
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 32c2306f..929d6ee1 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -308,6 +308,13 @@ def _prepare_metrics(metrics):
     return _metrics
 
 
+"""
+    Attention: The code below is not used in the current fastNLP.
+    However, it may still be useful.
+
+"""
+
+
 def _conver_numpy(x):
     """convert input data to numpy array
 
diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py
index 469c5632..dfcf83f9 100644
--- a/fastNLP/core/optimizer.py
+++ b/fastNLP/core/optimizer.py
@@ -11,6 +11,12 @@ class Optimizer(object):
 
 class SGD(Optimizer):
     def __init__(self, model_params=None, lr=0.01, momentum=0):
+        """
+
+        :param model_params: a generator. E.g. model.parameters() for PyTorch models.
+        :param float lr: learning rate. Default: 0.01
+        :param float momentum: momentum. Default: 0
+        """
         super(SGD, self).__init__(model_params, lr=lr, momentum=momentum)
 
     def construct_from_pytorch(self, model_params):
@@ -23,6 +29,12 @@ class SGD(Optimizer):
 
 class Adam(Optimizer):
     def __init__(self, model_params=None, lr=0.01, weight_decay=0):
+        """
+
+        :param model_params: a generator. E.g. model.parameters() for PyTorch models.
+        :param float lr: learning rate. Default: 0.01
+        :param float weight_decay: weight decay (L2 penalty). Default: 0
+        """
         super(Adam, self).__init__(model_params, lr=lr, weight_decay=weight_decay)
 
     def construct_from_pytorch(self, model_params):
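A sketch of the optimizer wrappers documented above (assuming, though not shown in this hunk, that construct_from_pytorch returns the wrapped torch.optim object):

    import torch.nn as nn
    from fastNLP.core.optimizer import SGD

    model = nn.Linear(10, 2)  # illustrative model
    optim = SGD(lr=0.01, momentum=0.9)
    torch_optim = optim.construct_from_pytorch(model.parameters())
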
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index a3f81c00..c2bca3a2 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -140,7 +140,6 @@ class Trainer(object):
 
     def train(self):
         """Start Training.
 
-        :return:
         """
         try:
             if torch.cuda.is_available() and self.use_cuda:
@@ -216,14 +215,6 @@ class Trainer(object):
             pbar.close()
 
     def _print_train(self):
-        """
-
-        :param data_iterator:
-        :param model:
-        :param epoch:
-        :param start:
-        :return:
-        """
         epoch = 1
         start = time.time()
         while epoch <= self.n_epochs:
diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py
index b67bc4ab..b0b0d864 100644
--- a/fastNLP/io/base_loader.py
+++ b/fastNLP/io/base_loader.py
@@ -29,19 +29,3 @@ class BaseLoader(object):
         with open(cache_path, 'wb') as f:
             pickle.dump(obj, f)
         return obj
-
-
-class ToyLoader0(BaseLoader):
-    """
-        For CharLM
-    """
-
-    def __init__(self, data_path):
-        super(ToyLoader0, self).__init__(data_path)
-
-    def load(self):
-        with open(self.data_path, 'r') as f:
-            corpus = f.read().lower()
-        import re
-        corpus = re.sub(r"", "unk", corpus)
-        return corpus.split()
diff --git a/fastNLP/io/config_saver.py b/fastNLP/io/config_io.py
similarity index 52%
rename from fastNLP/io/config_saver.py
rename to fastNLP/io/config_io.py
index 49d6804d..52c5e789 100644
--- a/fastNLP/io/config_saver.py
+++ b/fastNLP/io/config_io.py
@@ -1,6 +1,152 @@
+import configparser
+import json
 import os
 
-from fastNLP.io.config_loader import ConfigSection, ConfigLoader
+from fastNLP.io.base_loader import BaseLoader
+
+
+class ConfigLoader(BaseLoader):
+    """loader for configuration files"""
+
+    def __init__(self, data_path=None):
+        super(ConfigLoader, self).__init__()
+        if data_path is not None:
+            self.config = self.parse(super(ConfigLoader, self).load(data_path))
+
+    @staticmethod
+    def parse(string):
+        raise NotImplementedError
+
+    @staticmethod
+    def load_config(file_path, sections):
+        """
+        :param file_path: the path of config file
+        :param sections: the dict of {section_name(string): Section instance}
+        Example:
+            test_args = ConfigSection()
+            ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+        :return: return nothing, but the values of the attributes are saved in `sections`
+        """
+        assert isinstance(sections, dict)
+        cfg = configparser.ConfigParser()
+        if not os.path.exists(file_path):
+            raise FileNotFoundError("config file {} not found. ".format(file_path))
+        cfg.read(file_path)
+        for s in sections:
+            attr_list = [i for i in sections[s].__dict__.keys() if
+                         not callable(getattr(sections[s], i)) and not i.startswith("__")]
+            if s not in cfg:
+                print('section %s not found in config file' % (s))
+                continue
+            gen_sec = cfg[s]
+            for attr in gen_sec.keys():
+                try:
+                    val = json.loads(gen_sec[attr])
+                    # print(s, attr, val, type(val))
+                    if attr in attr_list:
+                        assert type(val) == type(getattr(sections[s], attr)), \
+                            'type does not match, expected %s but got %s' % \
+                            (type(getattr(sections[s], attr)), type(val))
+                    """
+                        if attr in attr_list then check its type and
+                        update its value.
+                        else add a new attr in sections[s]
+                    """
+                    setattr(sections[s], attr, val)
+                except Exception as e:
+                    print("cannot load attribute %s in section %s"
+                          % (attr, s))
+                    pass
+
+
+class ConfigSection(object):
+
+    def __init__(self):
+        pass
+
+    def __getitem__(self, key):
+        """
+        :param key: str, the name of the attribute
+        :return attr: the value of this attribute
+            if key in self.__dict__.keys():
+                return the value
+            else:
+                raise AttributeError
+        """
+        if key in self.__dict__.keys():
+            return getattr(self, key)
+        raise AttributeError("do NOT have attribute %s" % key)
+
+    def __setitem__(self, key, value):
+        """
+        :param key: str, the name of the attribute
+        :param value: the value of this attribute
+            if key not in self.__dict__.keys():
+                self[key] will be added
+            else:
+                self[key] will be updated
+        """
+        if key in self.__dict__.keys():
+            if not isinstance(value, type(getattr(self, key))):
+                raise AttributeError("attr %s expected %s but got %s" %
+                                     (key, str(type(getattr(self, key))), str(type(value))))
+        setattr(self, key, value)
+
+    def __contains__(self, item):
+        """
+        :param item: The key of item.
+        :return: True if the key is in self.__dict__.keys() else False.
+        """
+        return item in self.__dict__.keys()
+
+    def __eq__(self, other):
+        """Overwrite the == operator
+
+        :param other: Another ConfigSection() object to be compared with.
+        :return: True if the values of every key in the two ConfigSection() objects are equal, else False.
+        """
+        for k in self.__dict__.keys():
+            if k not in other.__dict__.keys():
+                return False
+            if getattr(self, k) != getattr(other, k):
+                return False
+
+        for k in other.__dict__.keys():
+            if k not in self.__dict__.keys():
+                return False
+            if getattr(self, k) != getattr(other, k):
+                return False
+
+        return True
+
+    def __ne__(self, other):
+        """Overwrite the != operator
+
+        :param other:
+        :return:
+        """
+        return not self.__eq__(other)
+
+    @property
+    def data(self):
+        return self.__dict__
+
+
+if __name__ == "__main__":
+    config = ConfigLoader('there is no data')
+
+    section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()}
+    """
+        General and My can be found in config file, so the attr and
+        value will be updated
+        A cannot be found in config file, so nothing will be done
+    """
+
+    config.load_config("../../test/data_for_tests/config", section)
+    for s in section:
+        print(s)
+        for attr in section[s].__dict__.keys():
+            print(s, attr, getattr(section[s], attr), type(getattr(section[s], attr)))
 
 
 class ConfigSaver(object):
@@ -125,7 +271,7 @@ class ConfigSaver(object):
                     # logger = create_logger(__name__, "./config_loader.log")
                     # logger.warning("section [%s] in config file [%s] has been changed" % (
                     #     section_name, self.file_path
-                    #))
+                    # ))
                     change_file = True
                     break
         if not change_file:
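A condensed usage sketch of the merged module (file, section, and option names are illustrative; load_config fills each ConfigSection from the matching section of an INI-style file):

    from fastNLP.io.config_io import ConfigLoader, ConfigSection

    train_args = ConfigSection()
    ConfigLoader.load_config("config.cfg", {"train": train_args})
    if "epochs" in train_args:        # ConfigSection.__contains__
        print(train_args["epochs"])   # ConfigSection.__getitem__
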
diff --git a/fastNLP/io/config_loader.py b/fastNLP/io/config_loader.py
deleted file mode 100644
index 66051e4d..00000000
--- a/fastNLP/io/config_loader.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import configparser
-import json
-import os
-
-from fastNLP.io.base_loader import BaseLoader
-
-
-class ConfigLoader(BaseLoader):
-    """loader for configuration files"""
-
-    def __init__(self, data_path=None):
-        super(ConfigLoader, self).__init__()
-        if data_path is not None:
-            self.config = self.parse(super(ConfigLoader, self).load(data_path))
-
-    @staticmethod
-    def parse(string):
-        raise NotImplementedError
-
-    @staticmethod
-    def load_config(file_path, sections):
-        """
-        :param file_path: the path of config file
-        :param sections: the dict of {section_name(string): Section instance}
-        Example:
-            test_args = ConfigSection()
-            ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
-        :return: return nothing, but the value of attributes are saved in sessions
-        """
-        assert isinstance(sections, dict)
-        cfg = configparser.ConfigParser()
-        if not os.path.exists(file_path):
-            raise FileNotFoundError("config file {} not found. ".format(file_path))
-        cfg.read(file_path)
-        for s in sections:
-            attr_list = [i for i in sections[s].__dict__.keys() if
-                         not callable(getattr(sections[s], i)) and not i.startswith("__")]
-            if s not in cfg:
-                print('section %s not found in config file' % (s))
-                continue
-            gen_sec = cfg[s]
-            for attr in gen_sec.keys():
-                try:
-                    val = json.loads(gen_sec[attr])
-                    # print(s, attr, val, type(val))
-                    if attr in attr_list:
-                        assert type(val) == type(getattr(sections[s], attr)), \
-                            'type not match, except %s but got %s' % \
-                            (type(getattr(sections[s], attr)), type(val))
-                    """
-                        if attr in attr_list then check its type and
-                        update its value.
-                        else add a new attr in sections[s]
-                    """
-                    setattr(sections[s], attr, val)
-                except Exception as e:
-                    print("cannot load attribute %s in section %s"
-                          % (attr, s))
-                    pass
-
-
-class ConfigSection(object):
-
-    def __init__(self):
-        pass
-
-    def __getitem__(self, key):
-        """
-        :param key: str, the name of the attribute
-        :return attr: the value of this attribute
-            if key not in self.__dict__.keys():
-                return self[key]
-            else:
-                raise AttributeError
-        """
-        if key in self.__dict__.keys():
-            return getattr(self, key)
-        raise AttributeError("do NOT have attribute %s" % key)
-
-    def __setitem__(self, key, value):
-        """
-        :param key: str, the name of the attribute
-        :param value: the value of this attribute
-            if key not in self.__dict__.keys():
-                self[key] will be added
-            else:
-                self[key] will be updated
-        """
-        if key in self.__dict__.keys():
-            if not isinstance(value, type(getattr(self, key))):
-                raise AttributeError("attr %s except %s but got %s" %
-                                     (key, str(type(getattr(self, key))), str(type(value))))
-        setattr(self, key, value)
-
-    def __contains__(self, item):
-        """
-        :param item: The key of item.
-        :return: True if the key in self.__dict__.keys() else False.
-        """
-        return item in self.__dict__.keys()
-
-    def __eq__(self, other):
-        """Overwrite the == operator
-
-        :param other: Another ConfigSection() object which to be compared.
-        :return: True if value of each key in each ConfigSection() object are equal to the other, else False.
-        """
- """ - for k in self.__dict__.keys(): - if k not in other.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - for k in other.__dict__.keys(): - if k not in self.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - return True - - def __ne__(self, other): - """Overwrite the != operator - - :param other: - :return: - """ - return not self.__eq__(other) - - @property - def data(self): - return self.__dict__ - - -if __name__ == "__main__": - config = ConfigLoader('there is no data') - - section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()} - """ - General and My can be found in config file, so the attr and - value will be updated - A cannot be found in config file, so nothing will be done - """ - - config.load_config("../../test/data_for_tests/config", section) - for s in section: - print(s) - for attr in section[s].__dict__.keys(): - print(s, attr, getattr(section[s], attr), type(getattr(section[s], attr))) diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 79cb30ad..fc2edb23 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -1,4 +1,3 @@ -#TODO: need fix for current DataSet import os from fastNLP.core.dataset import DataSet @@ -20,8 +19,7 @@ def convert_seq_dataset(data): """ dataset = DataSet() for word_seq in data: - x = TextField(word_seq, is_target=False) - dataset.append(Instance(word_seq=x)) + dataset.append(Instance(word_seq=word_seq)) return dataset @@ -40,11 +38,7 @@ def convert_seq2tag_dataset(data): """ dataset = DataSet() for sample in data: - word_seq, label = sample[0], sample[1] - ins = Instance() - ins.add_field("word_seq", TextField(word_seq, is_target=False)) \ - .add_field("label", LabelField(label, is_target=True)) - dataset.append(ins) + dataset.append(Instance(word_seq=sample[0], label=sample[1])) return dataset @@ -63,11 +57,7 @@ def convert_seq2seq_dataset(data): """ dataset = DataSet() for sample in data: - word_seq, label_seq = sample[0], sample[1] - ins = Instance() - ins.add_field("word_seq", TextField(word_seq, is_target=False)) \ - .add_field("label_seq", TextField(label_seq, is_target=True)) - dataset.append(ins) + dataset.append(Instance(word_seq=sample[0], label_seq=sample[1])) return dataset @@ -273,85 +263,6 @@ class ClassDataSetLoader(DataSetLoader): return convert_seq2tag_dataset(data) -@DataSet.set_reader('read_conll') -class ConllLoader(DataSetLoader): - """loader for conll format files""" - - def __init__(self): - """ - :param str data_path: the path to the conll data set - """ - super(ConllLoader, self).__init__() - - def load(self, data_path): - """ - :return: list lines: all lines in a conll file - """ - with open(data_path, "r", encoding="utf-8") as f: - lines = f.readlines() - data = self.parse(lines) - return self.convert(data) - - @staticmethod - def parse(lines): - """ - :param list lines:a list containing all lines in a conll file. - :return: a 3D list - """ - sentences = list() - tokens = list() - for line in lines: - if line[0] == "#": - # skip the comments - continue - if line == "\n": - sentences.append(tokens) - tokens = [] - continue - tokens.append(line.split()) - return sentences - - def convert(self, data): - pass - - -@DataSet.set_reader('read_lm') -class LMDataSetLoader(DataSetLoader): - """Language Model Dataset Loader - - This loader produces data for language model training in a supervised way. - That means it has X and Y. 
- - """ - - def __init__(self): - super(LMDataSetLoader, self).__init__() - - def load(self, data_path): - if not os.path.exists(data_path): - raise FileNotFoundError("file {} not found.".format(data_path)) - with open(data_path, "r", encoding="utf=8") as f: - text = " ".join(f.readlines()) - tokens = text.strip().split() - data = self.sentence_cut(tokens) - return self.convert(data) - - def sentence_cut(self, tokens, sentence_length=15): - start_idx = 0 - data_set = [] - for idx in range(len(tokens) // sentence_length): - x = tokens[start_idx * idx: start_idx * idx + sentence_length] - y = tokens[start_idx * idx + 1: start_idx * idx + sentence_length + 1] - if start_idx * idx + sentence_length + 1 >= len(tokens): - # ad hoc - y.extend([""]) - data_set.append([x, y]) - return data_set - - def convert(self, data): - pass - - @DataSet.set_reader('read_people_daily') class PeopleDailyCorpusLoader(DataSetLoader): """ @@ -403,10 +314,19 @@ class PeopleDailyCorpusLoader(DataSetLoader): pos_tag_examples.append([sent_words, sent_pos_tag]) ner_examples.append([sent_words, sent_ner]) # List[List[List[str], List[str]]] - return pos_tag_examples, ner_examples + # ner_examples not used + return self.convert(pos_tag_examples) def convert(self, data): - pass + data_set = DataSet() + for item in data: + sent_words, sent_pos_tag = item[0], item[1] + data_set.append(Instance(words=sent_words, tags=sent_pos_tag)) + data_set.apply(lambda ins: len(ins), new_field_name="seq_len") + data_set.set_target("tags") + data_set.set_input("sent_words") + data_set.set_input("seq_len") + return data_set class SNLIDataSetLoader(DataSetLoader): @@ -462,17 +382,13 @@ class SNLIDataSetLoader(DataSetLoader): for example in data: p, h, l = example # list, list, str - x1 = TextField(p, is_target=False) - x2 = TextField(h, is_target=False) - x1_len = TextField([1] * len(p), is_target=False) - x2_len = TextField([1] * len(h), is_target=False) - y = LabelField(l, is_target=True) instance = Instance() - instance.add_field("premise", x1) - instance.add_field("hypothesis", x2) - instance.add_field("premise_len", x1_len) - instance.add_field("hypothesis_len", x2_len) - instance.add_field("truth", y) + instance.add_field("premise", p) + instance.add_field("hypothesis", h) + instance.add_field("truth", l) data_set.append(instance) - + data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len") + data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len") + data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len") + data_set.set_target("truth") return data_set diff --git a/fastNLP/io/model_saver.py b/fastNLP/io/model_io.py similarity index 51% rename from fastNLP/io/model_saver.py rename to fastNLP/io/model_io.py index fd391f69..e1264b47 100644 --- a/fastNLP/io/model_saver.py +++ b/fastNLP/io/model_io.py @@ -1,5 +1,32 @@ import torch +from fastNLP.io.base_loader import BaseLoader + + +class ModelLoader(BaseLoader): + """ + Loader for models. + """ + + def __init__(self): + super(ModelLoader, self).__init__() + + @staticmethod + def load_pytorch(empty_model, model_path): + """ + Load model parameters from .pkl files into the empty PyTorch model. + :param empty_model: a PyTorch model with initialized parameters. + :param model_path: str, the path to the saved model. + """ + empty_model.load_state_dict(torch.load(model_path)) + + @staticmethod + def load_pytorch_model(model_path): + """Load the entire model. 
+ + """ + return torch.load(model_path) + class ModelSaver(object): """Save a model @@ -8,6 +35,7 @@ class ModelSaver(object): saver.save_pytorch(model) """ + def __init__(self, save_path): """ diff --git a/fastNLP/io/model_loader.py b/fastNLP/io/model_loader.py deleted file mode 100644 index afa05b93..00000000 --- a/fastNLP/io/model_loader.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch - -from fastNLP.io.base_loader import BaseLoader - - -class ModelLoader(BaseLoader): - """ - Loader for models. - """ - - def __init__(self): - super(ModelLoader, self).__init__() - - @staticmethod - def load_pytorch(empty_model, model_path): - """ - Load model parameters from .pkl files into the empty PyTorch model. - :param empty_model: a PyTorch model with initialized parameters. - :param model_path: str, the path to the saved model. - """ - empty_model.load_state_dict(torch.load(model_path)) - - @staticmethod - def load_pytorch_model(model_path): - """Load the entire model. - - """ - return torch.load(model_path) diff --git a/reproduction/Biaffine_parser/infer.py b/reproduction/Biaffine_parser/infer.py index 7d05c62b..8ebfa91c 100644 --- a/reproduction/Biaffine_parser/infer.py +++ b/reproduction/Biaffine_parser/infer.py @@ -5,7 +5,7 @@ sys.path.extend(['/home/yfshao/workdir/dev_fastnlp']) from fastNLP.api.processor import * from fastNLP.models.biaffine_parser import BiaffineParser -from fastNLP.io.config_loader import ConfigSection, ConfigLoader +from fastNLP.io.config_io import ConfigSection, ConfigLoader import _pickle as pickle import torch diff --git a/reproduction/Biaffine_parser/run.py b/reproduction/Biaffine_parser/run.py index 15dd3d4f..0519201a 100644 --- a/reproduction/Biaffine_parser/run.py +++ b/reproduction/Biaffine_parser/run.py @@ -13,11 +13,10 @@ from fastNLP.core.vocabulary import Vocabulary from fastNLP.core.dataset import DataSet from fastNLP.core.field import TextField, SeqLabelField from fastNLP.core.tester import Tester -from fastNLP.io.config_loader import ConfigLoader, ConfigSection -from fastNLP.io.model_loader import ModelLoader +from fastNLP.io.config_io import ConfigLoader, ConfigSection +from fastNLP.io.model_io import ModelLoader, ModelSaver from fastNLP.io.embed_loader import EmbedLoader from fastNLP.models.biaffine_parser import BiaffineParser -from fastNLP.io.model_saver import ModelSaver BOS = '' EOS = '' diff --git a/reproduction/LSTM+self_attention_sentiment_analysis/main.py b/reproduction/LSTM+self_attention_sentiment_analysis/main.py index 2a64c8d3..61ab79f4 100644 --- a/reproduction/LSTM+self_attention_sentiment_analysis/main.py +++ b/reproduction/LSTM+self_attention_sentiment_analysis/main.py @@ -2,8 +2,8 @@ import torch.nn.functional as F from fastNLP.core.trainer import ClassificationTrainer from fastNLP.core.utils import ClassPreprocess as Preprocess -from fastNLP.io.config_loader import ConfigLoader -from fastNLP.io.config_loader import ConfigSection +from fastNLP.io.config_io import ConfigLoader +from fastNLP.io.config_io import ConfigSection from fastNLP.io.dataset_loader import ClassDataSetLoader as Dataset_loader from fastNLP.models.base_model import BaseModel from fastNLP.modules.aggregator.self_attention import SelfAttention diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index 7dd5091a..e7804bae 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -3,12 +3,11 @@ import sys sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) -from 
diff --git a/setup.py b/setup.py
index 0da887a3..a8b4834e 100644
--- a/setup.py
+++ b/setup.py
@@ -12,12 +12,12 @@ with open('requirements.txt', encoding='utf-8') as f:
     reqs = f.read()
 
 setup(
-    name='fastNLP',
+    name='FastNLP',
     version='0.1.1',
     description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
     long_description=readme,
     license=license,
-    author='fudanNLP',
+    author='FudanNLP',
     python_requires='>=3.5',
     packages=find_packages(),
     install_requires=reqs.strip().split('\n'),
diff --git a/test/api/test_processor.py b/test/api/test_processor.py
new file mode 100644
index 00000000..fa6133b9
--- /dev/null
+++ b/test/api/test_processor.py
@@ -0,0 +1,12 @@
+import unittest
+
+from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
+from fastNLP.core.dataset import DataSet
+
+
+class TestProcessor(unittest.TestCase):
+    def test_FullSpaceToHalfSpaceProcessor(self):
+        ds = DataSet({"word": ["00, u1, u), (u2, u2"]})
+        proc = FullSpaceToHalfSpaceProcessor("word")
+        ds = proc(ds)
+        self.assertEqual(ds.field_arrays["word"].content, ["00, u1, u), (u2, u2"])
diff --git a/test/core/test_loss.py b/test/core/test_loss.py
index a7c303e2..52860b36 100644
--- a/test/core/test_loss.py
+++ b/test/core/test_loss.py
@@ -45,7 +45,7 @@ class TestLoss(unittest.TestCase):
 
         # Verify the correctness of squash()
         log = math.log
-        loss_func = loss.Loss("nll")
+        loss_func = loss.LossFromTorch("nll")
 
         y = tc.Tensor(
             [
@@ -129,7 +129,7 @@ class TestLoss(unittest.TestCase):
         lens = [4, 2, 1]
         y = tc.log(y)
 
-        loss_func = loss.Loss("nll", pre_pro=["unpad"])
+        loss_func = loss.LossFromTorch("nll", pre_pro=["unpad"])
         los = loss_func(y, gy, lens=lens)
 
         r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1)
@@ -169,7 +169,7 @@ class TestLoss(unittest.TestCase):
 
         lens = [2, 4, 2]
 
-        loss_func = loss.Loss("nll", pre_pro=["mask"])
+        loss_func = loss.LossFromTorch("nll", pre_pro=["mask"])
         los = loss_func(y, gy, mask=mask)
         los2 = loss_func(y, gy, mask=loss.make_mask(lens, gy.size()[-1]))
 
@@ -205,7 +205,7 @@ class TestLoss(unittest.TestCase):
 
         y = tc.log(y)
 
-        loss_func = loss.Loss("nll", pre_pro=["unpad_mask"])
+        loss_func = loss.LossFromTorch("nll", pre_pro=["unpad_mask"])
         los = loss_func(y, gy, lens=lens)
 
         r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1)
@@ -235,7 +235,7 @@ class TestLoss(unittest.TestCase):
         lens = [4, 2, 1]
         y = tc.log(y)
 
-        loss_func = loss.Loss("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0]))
+        loss_func = loss.LossFromTorch("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0]))
         loss_func.add_pre_pro("unpad_mask")
         los = loss_func(y, gy, lens=lens)
diff --git a/test/io/test_config_saver.py b/test/io/test_config_saver.py
index 4a223f91..f29097c5 100644
--- a/test/io/test_config_saver.py
+++ b/test/io/test_config_saver.py
@@ -1,8 +1,7 @@
 import os
 import unittest
 
-from fastNLP.io.config_loader import ConfigSection, ConfigLoader
-from fastNLP.io.config_saver import ConfigSaver
+from fastNLP.io.config_io import ConfigSection, ConfigLoader, ConfigSaver
 
 
 class TestConfigSaver(unittest.TestCase):