From 4fd49cc333fd8e571e220169e376346c720f3293 Mon Sep 17 00:00:00 2001
From: xuyige
Date: Thu, 11 Apr 2019 15:00:10 +0800
Subject: [PATCH 01/13] add sigmoid activation function in MLP

---
 fastNLP/modules/decoder/MLP.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastNLP/modules/decoder/MLP.py b/fastNLP/modules/decoder/MLP.py
index d75f6b48..3a793f24 100644
--- a/fastNLP/modules/decoder/MLP.py
+++ b/fastNLP/modules/decoder/MLP.py
@@ -36,6 +36,7 @@ class MLP(nn.Module):
         actives = {
             'relu': nn.ReLU(),
             'tanh': nn.Tanh(),
+            'sigmoid': nn.Sigmoid(),
         }
         if not isinstance(activation, list):
             activation = [activation] * (len(size_layer) - 2)

From 4d1721ffe365d53351c21241dfd7fdb6114c6bed Mon Sep 17 00:00:00 2001
From: yh
Date: Sun, 14 Apr 2019 18:00:21 +0800
Subject: [PATCH 02/13] Fix several bugs; adjust callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastNLP/core/batch.py      |  6 ++--
 fastNLP/core/callback.py   | 51 ++++++++++-----------------
 fastNLP/core/dataset.py    | 32 ++++++++---------
 fastNLP/core/fieldarray.py | 17 +++++++++
 fastNLP/core/metrics.py    | 72 ++++----------------------------------
 fastNLP/core/trainer.py    | 31 ++++++++++------
 setup.py                   |  4 +--
 test/automl/test_enas.py   |  2 +-
 test/core/test_dataset.py  |  4 +--
 test/test_tutorials.py     |  4 +--
 10 files changed, 87 insertions(+), 136 deletions(-)

diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py
index 88d9185d..d07df047 100644
--- a/fastNLP/core/batch.py
+++ b/fastNLP/core/batch.py
@@ -14,15 +14,17 @@ class Batch(object):

     :param DataSet dataset: a DataSet object
     :param int batch_size: the size of the batch
-    :param Sampler sampler: a Sampler object
+    :param Sampler sampler: a Sampler object. If None, use fastNLP.sampler.RandomSampler
     :param bool as_numpy: If True, return Numpy array. Otherwise, return torch tensors.
     :param bool prefetch: If True, use multiprocessing to fetch next batch when training.
     :param str or torch.device device: the batch's device, if as_numpy is True, device is ignored.
""" - def __init__(self, dataset, batch_size, sampler=RandomSampler(), as_numpy=False, prefetch=False): + def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False): self.dataset = dataset self.batch_size = batch_size + if sampler is None: + sampler = RandomSampler() self.sampler = sampler self.as_numpy = as_numpy self.idx_list = None diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index e3b4f36e..57f94bc4 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -17,37 +17,37 @@ class Callback(object): super(Callback, self).__init__() self.trainer = None # 在Trainer内部被重新赋值 - # callback只读属性 - self._n_epochs = None - self._n_steps = None - self._batch_size = None - self._model = None - self._pbar = None - self._optimizer = None - @property def n_epochs(self): - return self._n_epochs + return self.trainer.n_epochs + + @property + def epoch(self): + return self.trainer.epoch @property def n_steps(self): - return self._n_steps + return self.trainer.n_steps + + @property + def step(self): + return self.trainer.step @property def batch_size(self): - return self._batch_size + return self.trainer.batch_size @property def model(self): - return self._model + return self.trainer.model @property def pbar(self): - return self._pbar + return self.trainer.pbar @property def optimizer(self): - return self._optimizer + return self.trainer.optimizer def on_train_begin(self): # before the main training loop @@ -82,13 +82,14 @@ class Callback(object): def on_valid_begin(self): pass - def on_valid_end(self, eval_result, metric_key, optimizer): + def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): """ 每次执行验证机的evaluation后会调用。传入eval_result :param eval_result: Dict[str: Dict[str: float]], evaluation的结果 :param metric_key: str - :param optimizer: + :param optimizer: optimizer passed to trainer + :param is_better_eval: bool, 当前dev结果是否比之前的好 :return: """ pass @@ -145,11 +146,10 @@ class CallbackManager(Callback): """ - def __init__(self, env, attr, callbacks=None): + def __init__(self, env, callbacks=None): """ :param dict env: The key is the name of the Trainer attribute(str). The value is the attribute itself. 
-        :param dict attr: read-only attributes for all callbacks
         :param Callback callbacks:
         """
         super(CallbackManager, self).__init__()
@@ -170,19 +170,6 @@ class CallbackManager(Callback):
         for callback in self.callbacks:
             setattr(callback, env_name, env_val)  # Callback.trainer

-        self.set_property(**attr)
-
-    def set_property(self, **kwargs):
-        """Set the read-only attributes of all callbacks.
-
-        :param kwargs:
-        :return:
-        """
-        for callback in self.callbacks:
-            for k, v in kwargs.items():
-                setattr(callback, "_" + k, v)
-
-
     @transfer
     def on_train_begin(self):
         pass
@@ -220,7 +207,7 @@ class CallbackManager(Callback):
         pass

     @transfer
-    def on_valid_end(self, eval_result, metric_key, optimizer):
+    def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval):
         pass

     @transfer
diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py
index 24376a72..068afb38 100644
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -90,7 +90,7 @@ class DataSet(object):
                 data_set = DataSet()
                 for field in self.field_arrays.values():
                     data_set.add_field(name=field.name, fields=field.content[idx], padder=field.padder,
-                                       is_input=field.is_input, is_target=field.is_target)
+                                       is_input=field.is_input, is_target=field.is_target, ignore_type=field.ignore_type)
                 return data_set
             elif isinstance(idx, str):
                 if idx not in self:
@@ -313,16 +313,24 @@ class DataSet(object):
         else:
             return results

-    def drop(self, func):
+    def drop(self, func, inplace=True):
         """Drop instances if a condition holds.

         :param func: a function that takes an Instance object as input, and returns bool.
             The instance will be dropped if the function returns True.
+        :param inplace: bool, whether to drop in place. Otherwise a new dataset will be returned.

         """
-        results = [ins for ins in self._inner_iter() if not func(ins)]
-        for name, old_field in self.field_arrays.items():
-            self.field_arrays[name].content = [ins[name] for ins in results]
+        if inplace:
+            results = [ins for ins in self._inner_iter() if not func(ins)]
+            for name, old_field in self.field_arrays.items():
+                self.field_arrays[name].content = [ins[name] for ins in results]
+        else:
+            results = [ins for ins in self if not func(ins)]
+            data = DataSet(results)
+            for field_name, field in self.field_arrays.items():
+                data.field_arrays[field_name].to(field)
+            return data

     def split(self, dev_ratio):
         """Split the dataset into training and development(validation) set.
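A minimal usage sketch of the reworked drop() signature (the non-inplace branch returns the filtered copy; field contents are illustrative):

    from fastNLP.core.dataset import DataSet

    ds = DataSet({"x": [[1, 2], [1, 2, 3]] * 10, "y": [0, 1] * 10})
    ds.drop(lambda ins: len(ins["x"]) < 3, inplace=True)      # mutates ds
    kept = ds.drop(lambda ins: ins["y"] == 0, inplace=False)  # ds untouched; returns a new DataSet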
@@ -346,19 +353,8 @@ class DataSet(object):
         for idx in train_indices:
             train_set.append(self[idx])
         for field_name in self.field_arrays:
-            train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
-            train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
-            train_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
-            train_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
-            train_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
-            train_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim
-
-            dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
-            dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
-            dev_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
-            dev_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
-            dev_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
-            dev_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim
+            train_set.field_arrays[field_name].to(self.field_arrays[field_name])
+            dev_set.field_arrays[field_name].to(self.field_arrays[field_name])

         return train_set, dev_set

diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py
index 72bb30b5..10fbbebe 100644
--- a/fastNLP/core/fieldarray.py
+++ b/fastNLP/core/fieldarray.py
@@ -383,6 +383,23 @@ class FieldArray(object):
         """
         return len(self.content)

+    def to(self, other):
+        """
+        Copy the attributes of `other` (which must be a FieldArray) to this FieldArray: is_input, is_target,
+        padder, dtype, pytype, content_dim and ignore_type.
+
+        :param other: FieldArray
+        :return:
+        """
+        assert isinstance(other, FieldArray), "Only support FieldArray type, not {}.".format(type(other))
+
+        self.is_input = other.is_input
+        self.is_target = other.is_target
+        self.padder = other.padder
+        self.dtype = other.dtype
+        self.pytype = other.pytype
+        self.content_dim = other.content_dim
+        self.ignore_type = other.ignore_type

 def is_iterable(content):
     try:
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 64555e12..3d3647c4 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -91,7 +91,6 @@ class MetricBase(object):
     Besides, before passing params into self.evaluate, this function will filter out params
     from output_dict and target_dict which are not used in self.evaluate. (but if **kwargs presented in
     self.evaluate, no filtering will be conducted.)
-    However, in some cases where type check is not necessary, ``_fast_param_map`` will be used.

     """
     def __init__(self):
@@ -146,21 +145,6 @@ class MetricBase(object):
     def get_metric(self, reset=True):
         raise NotImplemented

-    def _fast_param_map(self, pred_dict, target_dict):
-        """Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map.
-           such as pred_dict has one element, target_dict has one element
-
-        :param pred_dict:
-        :param target_dict:
-        :return: dict, if dict is not {}, pass it to self.evaluate. Otherwise do mapping.
- """ - fast_param = {} - if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1: - fast_param['pred'] = list(pred_dict.values())[0] - fast_param['target'] = list(target_dict.values())[0] - return fast_param - return fast_param - def __call__(self, pred_dict, target_dict): """ @@ -172,7 +156,6 @@ class MetricBase(object): Besides, before passing params into self.evaluate, this function will filter out params from output_dict and target_dict which are not used in self.evaluate. (but if **kwargs presented in self.evaluate, no filtering will be conducted.) - This function also support _fast_param_map. :param pred_dict: usually the output of forward or prediction function :param target_dict: usually features set as target.. :return: @@ -180,11 +163,6 @@ class MetricBase(object): if not callable(self.evaluate): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") - fast_param = self._fast_param_map(pred_dict=pred_dict, target_dict=target_dict) - if fast_param: - self.evaluate(**fast_param) - return - if not self._checked: # 1. check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) @@ -262,41 +240,6 @@ class AccuracyMetric(MetricBase): self.total = 0 self.acc_count = 0 - def _fast_param_map(self, pred_dict, target_dict): - """Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. - such as pred_dict has one element, target_dict has one element - - :param pred_dict: - :param target_dict: - :return: dict, if dict is not None, pass it to self.evaluate. Otherwise do mapping. - """ - fast_param = {} - targets = list(target_dict.values()) - if len(targets) == 1 and isinstance(targets[0], torch.Tensor): - if len(pred_dict) == 1: - pred = list(pred_dict.values())[0] - fast_param['pred'] = pred - elif len(pred_dict) == 2: - pred1 = list(pred_dict.values())[0] - pred2 = list(pred_dict.values())[1] - if not (isinstance(pred1, torch.Tensor) and isinstance(pred2, torch.Tensor)): - return fast_param - if len(pred1.size()) < len(pred2.size()) and len(pred1.size()) == 1: - seq_lens = pred1 - pred = pred2 - elif len(pred1.size()) > len(pred2.size()) and len(pred2.size()) == 1: - seq_lens = pred2 - pred = pred1 - else: - return fast_param - fast_param['pred'] = pred - fast_param['seq_lens'] = seq_lens - else: - return fast_param - fast_param['target'] = targets[0] - # TODO need to make sure they all have same batch_size - return fast_param - def evaluate(self, pred, target, seq_lens=None): """ @@ -321,7 +264,7 @@ class AccuracyMetric(MetricBase): f"got {type(seq_lens)}.") if seq_lens is not None: - masks = seq_lens_to_masks(seq_lens=seq_lens, float=True) + masks = seq_lens_to_masks(seq_lens=seq_lens).long() else: masks = None @@ -334,14 +277,12 @@ class AccuracyMetric(MetricBase): f"size:{pred.size()}, target should have size: {pred.size()} or " f"{pred.size()[:-1]}, got {target.size()}.") - pred = pred.float() - target = target.float() if masks is not None: - self.acc_count += torch.sum(torch.eq(pred, target).float() * masks.float()).item() - self.total += torch.sum(masks.float()).item() + self.acc_count += torch.sum(torch.eq(pred, target) * masks).item() + self.total += torch.sum(masks).item() else: - self.acc_count += torch.sum(torch.eq(pred, target).float()).item() + self.acc_count += torch.sum(torch.eq(pred, target)).item() self.total += np.prod(list(pred.size())) def get_metric(self, reset=True): @@ -350,7 +291,7 @@ class 
AccuracyMetric(MetricBase): :param bool reset: whether to recount next time. :return evaluate_result: {"acc": float} """ - evaluate_result = {'acc': round(self.acc_count / self.total, 6)} + evaluate_result = {'acc': round(float(self.acc_count) / (self.total + 1e-12), 6)} if reset: self.acc_count = 0 self.total = 0 @@ -441,8 +382,7 @@ def bio_tag_to_spans(tags, ignore_labels=None): prev_bio_tag = bio_tag return [(span[0], (span[1][0], span[1][1]+1)) for span in spans - if span[0] not in ignore_labels - ] + if span[0] not in ignore_labels] class SpanFPreRecMetric(MetricBase): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index ca2ff93b..e678ea3d 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -34,7 +34,7 @@ class Trainer(object): def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50, validate_every=-1, dev_data=None, save_path=None, optimizer=None, check_code_level=0, metric_key=None, sampler=None, prefetch=False, use_tqdm=True, - use_cuda=False, callbacks=None): + use_cuda=False, callbacks=None, update_every=1): """ :param DataSet train_data: the training data :param torch.nn.modules.module model: a PyTorch model @@ -62,6 +62,8 @@ class Trainer(object): :param bool use_tqdm: whether to use tqdm to show train progress. :param callbacks: List[Callback]. 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以 通过callback机制实现。 + :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128会导致内存 + 不足,通过设置batch_size=32, update_every=4达到目的 """ super(Trainer, self).__init__() @@ -76,6 +78,10 @@ class Trainer(object): if metrics and (dev_data is None): raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ") + # check update every + assert update_every>=1, "update_every must be no less than 1." + self.update_every = int(update_every) + # check save_path if not (save_path is None or isinstance(save_path, str)): raise ValueError("save_path can only be None or `str`.") @@ -144,11 +150,9 @@ class Trainer(object): self.start_time = None # start timestamp self.callback_manager = CallbackManager(env={"trainer": self}, - attr={"n_epochs": self.n_epochs, "n_steps": self.step, - "batch_size": self.batch_size, "model": self.model, - "optimizer": self.optimizer}, callbacks=callbacks) + def train(self, load_best_model=True): """ @@ -241,7 +245,6 @@ class Trainer(object): avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, prefetch=self.prefetch) - self.callback_manager.set_property(pbar=pbar) for epoch in range(1, self.n_epochs+1): pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) # early stopping @@ -257,6 +260,7 @@ class Trainer(object): self.callback_manager.on_loss_begin(batch_y, prediction) loss = self._compute_loss(prediction, batch_y) avg_loss += loss.item() + loss = loss/self.update_every # Is loss NaN or inf? 
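What update_every amounts to, as a standalone PyTorch sketch: an effective batch of 128 can be obtained with batch_size=32 and update_every=4 when memory is tight. All names below are illustrative, not fastNLP API:

    import torch
    from torch import nn, optim

    model = nn.Linear(8, 2)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.CrossEntropyLoss()
    batches = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(8)]

    k = 4  # update_every
    for step, (x, y) in enumerate(batches):
        if step % k == 0:
            model.zero_grad()                # mirrors Trainer._grad_backward
        loss = criterion(model(x), y) / k    # mirrors loss = loss / self.update_every
        loss.backward()
        if (step + 1) % k == 0:
            optimizer.step()                 # mirrors Trainer._update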
requires_grad = False self.callback_manager.on_backward_begin(loss, self.model) @@ -267,8 +271,9 @@ class Trainer(object): self.callback_manager.on_step_end(self.optimizer) if (self.step+1) % self.print_every == 0: + avg_loss = avg_loss / self.print_every if self.use_tqdm: - print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) + print_output = "loss:{0:<6.5f}".format(avg_loss) pbar.update(self.print_every) else: end = time.time() @@ -286,8 +291,8 @@ class Trainer(object): eval_res = self._do_validation(epoch=epoch, step=self.step) eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \ - self.tester._format_eval_results(eval_res) - pbar.write(eval_str) + self.tester._format_eval_results(eval_res) + pbar.write(eval_str + '\n') # ================= mini-batch end ==================== # @@ -301,6 +306,7 @@ class Trainer(object): self.callback_manager.on_valid_begin() res = self.tester.test() + is_better_eval = False if self._better_eval_result(res): if self.save_path is not None: self._save_model(self.model, @@ -310,8 +316,9 @@ class Trainer(object): self.best_dev_perf = res self.best_dev_epoch = epoch self.best_dev_step = step + is_better_eval = True # get validation results; adjust optimizer - self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer) + self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer, is_better_eval) return res def _mode(self, model, is_test=False): @@ -330,7 +337,8 @@ class Trainer(object): """Perform weight update on a model. """ - self.optimizer.step() + if (self.step+1)%self.update_every==0: + self.optimizer.step() def _data_forward(self, network, x): x = _build_args(network.forward, **x) @@ -346,7 +354,8 @@ class Trainer(object): For PyTorch, just do "loss.backward()" """ - self.model.zero_grad() + if self.step%self.update_every==0: + self.model.zero_grad() loss.backward() def _compute_loss(self, predict, truth): diff --git a/setup.py b/setup.py index a8b4834e..b7834d8d 100644 --- a/setup.py +++ b/setup.py @@ -13,12 +13,12 @@ with open('requirements.txt', encoding='utf-8') as f: setup( name='FastNLP', - version='0.1.1', + version='0.4.0', description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', long_description=readme, license=license, author='FudanNLP', - python_requires='>=3.5', + python_requires='>=3.6', packages=find_packages(), install_requires=reqs.strip().split('\n'), ) diff --git a/test/automl/test_enas.py b/test/automl/test_enas.py index d2d3af05..4fea1063 100644 --- a/test/automl/test_enas.py +++ b/test/automl/test_enas.py @@ -35,7 +35,7 @@ class TestENAS(unittest.TestCase): print(dataset[0]) # DataSet.drop(func)筛除数据 - dataset.drop(lambda x: x['seq_len'] <= 3) + dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True) print(len(dataset)) # 设置DataSet中,哪些field要转为tensor diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 607f9a13..7f4a7184 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -125,7 +125,7 @@ class TestDataSetMethods(unittest.TestCase): def test_drop(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20}) - ds.drop(lambda ins: len(ins["y"]) < 3) + ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True) self.assertEqual(len(ds), 20) def test_contains(self): @@ -169,7 +169,7 @@ class TestDataSetMethods(unittest.TestCase): dataset = DataSet.read_csv('test/data_for_tests/tutorial_sample_dataset.csv', headers=('raw_sentence', 'label'), sep='\t') - 
dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0)
+        dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0, inplace=True)
         dataset.apply(split_sent, new_field_name='words', is_input=True)
         # print(dataset)

diff --git a/test/test_tutorials.py b/test/test_tutorials.py
index 68c874fa..0056dff7 100644
--- a/test/test_tutorials.py
+++ b/test/test_tutorials.py
@@ -35,7 +35,7 @@ class TestTutorial(unittest.TestCase):
         print(dataset[0])

         # DataSet.drop(func) filters out data
-        dataset.drop(lambda x: x['seq_len'] <= 3)
+        dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
         print(len(dataset))

         # set which fields of the DataSet should be converted to tensors
@@ -296,7 +296,7 @@ class TestTutorial(unittest.TestCase):

         # filter data
         origin_data_set_len = len(data_set)
-        data_set.drop(lambda x: len(x['premise']) <= 6)
+        data_set.drop(lambda x: len(x['premise']) <= 6, inplace=True)
         origin_data_set_len, len(data_set)

         # In[17]:

From 29f81e79ad73db0d053df5f1417aa0dd6587b97a Mon Sep 17 00:00:00 2001
From: yh_cc
Date: Sun, 14 Apr 2019 19:59:04 +0800
Subject: [PATCH 03/13] Prepare the 0.4.0 release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastNLP/core/metrics.py          | 17 ++++++++---------
 test/core/test_dataset.py        |  4 ++--
 test/core/test_metrics.py        |  7 +++----
 test/io/test_config_saver.py     |  2 +-
 test/io/test_dataset_loader.py   |  8 --------
 test/modules/decoder/test_CRF.py |  2 +-
 test/test_tutorials.py           |  6 +++---
 7 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 3d3647c4..5687cc85 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -243,12 +243,11 @@ class AccuracyMetric(MetricBase):
     def evaluate(self, pred, target, seq_lens=None):
         """

-        :param pred: List of (torch.Tensor, or numpy.ndarray). Element's shape can be:
-                torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len, n_classes])
-        :param target: List of (torch.Tensor, or numpy.ndarray). Element's can be:
-                torch.Size([B,]), torch.Size([B,]), torch.Size([B, max_len]), torch.Size([B, max_len])
-        :param seq_lens: List of (torch.Tensor, or numpy.ndarray). Element's can be:
-                None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided.
+        :param pred: Element's shape can be: torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]),
+                torch.Size([B, max_len, n_classes])
+        :param target: Element's shape can be: torch.Size([B,]), torch.Size([B,]), torch.Size([B, max_len]),
+                torch.Size([B, max_len])
+        :param seq_lens: Element's shape can be: None, None, torch.Size([B]), torch.Size([B]). Ignored if masks are provided.
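With _fast_param_map removed, the dict keys passed to a metric must match its param_map (the defaults are 'pred' and 'target', remappable via the constructor). A minimal sketch, mirroring the usage in the tests:

    import torch
    from fastNLP.core.metrics import AccuracyMetric

    metric = AccuracyMetric()
    pred_dict = {"pred": torch.randn(4, 3)}            # [B, n_classes]
    target_dict = {"target": torch.zeros(4).long()}    # [B]
    metric(pred_dict=pred_dict, target_dict=target_dict)
    print(metric.get_metric())                         # e.g. {'acc': 0.25}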
""" # TODO 这里报错需要更改,因为pred是啥用户并不知道。需要告知用户真实的value @@ -264,7 +263,7 @@ class AccuracyMetric(MetricBase): f"got {type(seq_lens)}.") if seq_lens is not None: - masks = seq_lens_to_masks(seq_lens=seq_lens).long() + masks = seq_lens_to_masks(seq_lens=seq_lens) else: masks = None @@ -277,9 +276,9 @@ class AccuracyMetric(MetricBase): f"size:{pred.size()}, target should have size: {pred.size()} or " f"{pred.size()[:-1]}, got {target.size()}.") - + target = target.to(pred) if masks is not None: - self.acc_count += torch.sum(torch.eq(pred, target) * masks).item() + self.acc_count += torch.sum(torch.eq(pred, target).masked_fill(masks, 0)).item() self.total += torch.sum(masks).item() else: self.acc_count += torch.sum(torch.eq(pred, target)).item() diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index fb54ee8a..5ed1a711 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -219,8 +219,8 @@ class TestDataSetMethods(unittest.TestCase): def test_add_null(self): # TODO test failed because 'fastNLP\core\fieldarray.py:143: RuntimeError' ds = DataSet() - ds.add_field('test', []) - ds.set_target('test') + with self.assertRaises(RuntimeError) as RE: + ds.add_field('test', []) class TestDataSetIter(unittest.TestCase): diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 25138478..4fb2a04e 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -15,7 +15,7 @@ class TestAccuracyMetric(unittest.TestCase): target_dict = {'target': torch.zeros(4)} metric = AccuracyMetric() - metric(pred_dict=pred_dict, target_dict=target_dict, ) + metric(pred_dict=pred_dict, target_dict=target_dict) print(metric.get_metric()) def test_AccuracyMetric2(self): @@ -30,7 +30,7 @@ class TestAccuracyMetric(unittest.TestCase): except Exception as e: print(e) return - self.assertTrue(True, False), "No exception catches." + print("No exception catches.") def test_AccuracyMetric3(self): # (3) the second batch is corrupted size @@ -95,10 +95,9 @@ class TestAccuracyMetric(unittest.TestCase): self.assertAlmostEqual(res["acc"], float(ans), places=4) def test_AccuaryMetric8(self): - # (8) check map, does not match. 
use stop_fast_param to stop fast param map try: metric = AccuracyMetric(pred='predictions', target='targets') - pred_dict = {"prediction": torch.zeros(4, 3, 2), "stop_fast_param": 1} + pred_dict = {"prediction": torch.zeros(4, 3, 2)} target_dict = {'targets': torch.zeros(4, 3)} metric(pred_dict=pred_dict, target_dict=target_dict, ) self.assertDictEqual(metric.get_metric(), {'acc': 1}) diff --git a/test/io/test_config_saver.py b/test/io/test_config_saver.py index f29097c5..a71419e5 100644 --- a/test/io/test_config_saver.py +++ b/test/io/test_config_saver.py @@ -6,7 +6,7 @@ from fastNLP.io.config_io import ConfigSection, ConfigLoader, ConfigSaver class TestConfigSaver(unittest.TestCase): def test_case_1(self): - config_file_dir = "test/io/" + config_file_dir = "test/io" config_file_name = "config" config_file_path = os.path.join(config_file_dir, config_file_name) diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py index 16e7d7ea..4dddc5d0 100644 --- a/test/io/test_dataset_loader.py +++ b/test/io/test_dataset_loader.py @@ -17,11 +17,3 @@ class TestDatasetLoader(unittest.TestCase): def test_PeopleDailyCorpusLoader(self): data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt") - def test_ConllCWSReader(self): - dataset = ConllCWSReader().load("test/data_for_tests/conll_example.txt") - - def test_ZhConllPOSReader(self): - dataset = ZhConllPOSReader().load("test/data_for_tests/zh_sample.conllx") - - def test_ConllxDataLoader(self): - dataset = ConllxDataLoader().load("test/data_for_tests/zh_sample.conllx") diff --git a/test/modules/decoder/test_CRF.py b/test/modules/decoder/test_CRF.py index a176348f..5dc60640 100644 --- a/test/modules/decoder/test_CRF.py +++ b/test/modules/decoder/test_CRF.py @@ -118,7 +118,7 @@ class TestCRF(unittest.TestCase): feats = nn.Parameter(torch.randn(num_samples, max_len, num_tags)) crf = ConditionalRandomField(num_tags, include_start_end_trans) optimizer = optim.SGD([param for param in crf.parameters() if param.requires_grad] + [feats], lr=0.1) - for _ in range(10000): + for _ in range(10): loss = crf(feats, tags, masks).mean() optimizer.zero_grad() loss.backward() diff --git a/test/test_tutorials.py b/test/test_tutorials.py index c9ffa646..bc0b5d2b 100644 --- a/test/test_tutorials.py +++ b/test/test_tutorials.py @@ -152,7 +152,7 @@ class TestTutorial(unittest.TestCase): train_data=train_data, dev_data=dev_data, loss=CrossEntropyLoss(), - metrics=AccuracyMetric() + metrics=AccuracyMetric(target='label_seq') ) trainer.train() print('Train finished!') @@ -407,7 +407,7 @@ class TestTutorial(unittest.TestCase): train_data=train_data, model=model, loss=CrossEntropyLoss(pred='pred', target='label'), - metrics=AccuracyMetric(), + metrics=AccuracyMetric(target='label'), n_epochs=3, batch_size=16, print_every=-1, @@ -424,7 +424,7 @@ class TestTutorial(unittest.TestCase): tester = Tester( data=test_data, model=model, - metrics=AccuracyMetric(), + metrics=AccuracyMetric(target='label'), batch_size=args["batch_size"], ) tester.test() From 6f010d488db843816a02a81c71c1e290c3077a1a Mon Sep 17 00:00:00 2001 From: Yunfan Shao Date: Sun, 14 Apr 2019 21:11:16 +0800 Subject: [PATCH 04/13] update readme --- reproduction/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reproduction/README.md b/reproduction/README.md index 1c93c6bc..8d14d36d 100644 --- a/reproduction/README.md +++ b/reproduction/README.md @@ -8,7 +8,7 @@ ## Star-Transformer [reference](https://arxiv.org/abs/1902.09113) -### Performance +### 
Performance (still in progress) |任务| 数据集 | SOTA | 模型表现 | |------|------| ------| ------| |Pos Tagging|CTB 9.0|-|ACC 92.31| From 16fdf20d2630df580e4f5b7af244c66b048f70f3 Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 14 Apr 2019 22:20:39 +0800 Subject: [PATCH 05/13] support parallel loss --- fastNLP/core/losses.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 9b8b8d8f..b52244e5 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -251,7 +251,8 @@ class LossInForward(LossBase): if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): if not isinstance(loss, torch.Tensor): raise TypeError(f"Loss excepted to be a torch.Tensor, got {type(loss)}") - raise RuntimeError(f"The size of loss excepts to be torch.Size([]), got {loss.size()}") + loss = torch.sum(loss) / (loss.view(-1)).size(0) + # raise RuntimeError(f"The size of loss excepts to be torch.Size([]), got {loss.size()}") return loss From fdfaf2d6b620ecd167a48fe7b8b22e1ad1921f6b Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 14 Apr 2019 22:38:15 +0800 Subject: [PATCH 06/13] =?UTF-8?q?=E9=98=B2=E6=AD=A2=E5=A4=9A=E5=8D=A1?= =?UTF-8?q?=E7=9A=84=E6=83=85=E5=86=B5=E5=AF=BC=E8=87=B4=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E6=AD=A3=E7=A1=AE=E8=AE=A1=E7=AE=97loss=E2=80=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 2a8d85da..b45dd148 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -265,7 +265,7 @@ class Trainer(object): # edit prediction self.callback_manager.on_loss_begin(batch_y, prediction) - loss = self._compute_loss(prediction, batch_y) + loss = self._compute_loss(prediction, batch_y).mean() avg_loss += loss.item() loss = loss/self.update_every From 76f9bbf5f1fbe50dd7a541b6f9a67e3e808c6989 Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 14 Apr 2019 23:30:23 +0800 Subject: [PATCH 07/13] update advance_tutorial jupyter notebook --- .../advance_tutorial.ipynb | 171 ++++++++++-------- 1 file changed, 94 insertions(+), 77 deletions(-) diff --git a/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb b/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb index a787eeaf..64eb3462 100644 --- a/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb +++ b/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb @@ -20,16 +20,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/remote-home/ygxu/anaconda3/envs/no-fastnlp/lib/python3.7/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", - " \" (e.g. 
in jupyter console)\", TqdmExperimentalWarning)\n" - ] - } - ], + "outputs": [], "source": [ "# 声明部件\n", "import torch\n", @@ -179,11 +170,11 @@ { "data": { "text/plain": [ - "DataSet({'image': tensor([[ 2.1747, -1.0147, -1.3853, 0.0216, -0.4957],\n", - " [ 0.8138, -0.2933, -0.1217, -0.6027, 0.3932],\n", - " [ 0.6750, -1.1136, -1.3371, -0.0185, -0.3206],\n", - " [-0.5076, -0.3822, 0.1719, -0.6447, -0.5702],\n", - " [ 0.3804, 0.0889, 0.8027, -0.7121, -0.7320]]) type=torch.Tensor,\n", + "DataSet({'image': tensor([[ 4.7106e-01, -1.2246e+00, 3.1234e-01, -1.6781e+00, -8.7967e-01],\n", + " [ 1.1454e+00, 1.2236e-01, 3.0258e-01, -1.5454e+00, 8.9201e-01],\n", + " [-5.7143e-03, 3.9488e-01, 2.0287e-01, -1.5726e+00, 9.3171e-01],\n", + " [ 6.8914e-01, -2.6302e-01, -8.2694e-01, 9.5942e-01, -5.2589e-01],\n", + " [-5.7798e-03, -9.1621e-03, 1.0077e-03, 9.1716e-02, 1.0565e+00]]) type=torch.Tensor,\n", "'label': 0 type=int})" ] }, @@ -644,20 +635,20 @@ { "data": { "text/plain": [ - "({'premise': [2, 145, 146, 80, 147, 26, 148, 2, 104, 149, 150, 2, 151, 5, 55, 152, 105, 3] type=list,\n", - " 'hypothesis': [22, 80, 8, 1, 1, 20, 1, 3] type=list,\n", - " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", - " 'hypothesis_len': [1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", - " 'label': 2 type=int},\n", - " {'premise': [11, 5, 18, 5, 24, 6, 2, 10, 59, 52, 14, 9, 2, 53, 29, 60, 54, 45, 6, 46, 5, 7, 61, 3] type=list,\n", - " 'hypothesis': [22, 11, 1, 45, 3] type=list,\n", - " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", - " 'hypothesis_len': [1, 1, 1, 1, 1] type=list,\n", + "({'premise': [2, 10, 9, 2, 15, 115, 6, 11, 5, 132, 17, 2, 76, 9, 77, 55, 3] type=list,\n", + " 'hypothesis': [1, 2, 56, 17, 1, 4, 13, 49, 123, 12, 6, 11, 3] type=list,\n", + " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", + " 'hypothesis_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", + " 'label': 0 type=int},\n", + " {'premise': [50, 124, 10, 7, 68, 91, 92, 38, 2, 55, 3] type=list,\n", + " 'hypothesis': [21, 10, 5, 2, 55, 7, 99, 64, 48, 1, 22, 1, 3] type=list,\n", + " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", + " 'hypothesis_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", " 'label': 1 type=int},\n", - " {'premise': [2, 11, 8, 14, 16, 7, 15, 50, 2, 66, 4, 76, 2, 10, 8, 98, 9, 58, 67, 3] type=list,\n", - " 'hypothesis': [22, 27, 50, 3] type=list,\n", - " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", - " 'hypothesis_len': [1, 1, 1, 1] type=list,\n", + " {'premise': [13, 24, 4, 14, 29, 5, 25, 4, 8, 39, 9, 14, 34, 4, 40, 41, 4, 16, 12, 2, 11, 4, 30, 28, 2, 42, 8, 2, 43, 44, 17, 2, 45, 35, 26, 31, 27, 5, 6, 32, 3] type=list,\n", + " 'hypothesis': [37, 49, 123, 30, 28, 2, 55, 12, 2, 11, 3] type=list,\n", + " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", + " 'hypothesis_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", " 'label': 0 type=int})" ] }, @@ -718,15 +709,15 @@ { "data": { "text/plain": [ - "({'premise': [1037, 2210, 2223, 2136, 5363, 2000, 4608, 1037, 5479, 8058, 2046, 1037, 2918, 1999, 2019, 5027, 2208, 1012] type=list,\n", - " 'hypothesis': [100, 2136, 2003, 2652, 3598, 2006, 100, 1012] type=list,\n", - " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", - " 'hypothesis_len': [1, 1, 1, 
1, 1, 1, 1, 1] type=list,\n", - " 'label': 2 type=int},\n", - " {'premise': [2450, 1999, 2317, 1999, 100, 1998, 1037, 2158, 3621, 2369, 3788, 2007, 1037, 3696, 2005, 2198, 100, 10733, 1998, 100, 1999, 1996, 4281, 1012] type=list,\n", - " 'hypothesis': [100, 2450, 13063, 10733, 1012] type=list,\n", - " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", - " 'hypothesis_len': [1, 1, 1, 1, 1] type=list,\n", + "({'premise': [1037, 2158, 1998, 1037, 2450, 2892, 1996, 2395, 1999, 2392, 1997, 1037, 10733, 1998, 100, 4825, 1012] type=list,\n", + " 'hypothesis': [100, 1037, 3232, 1997, 7884, 1010, 2048, 2111, 3328, 2408, 1996, 2395, 1012] type=list,\n", + " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", + " 'hypothesis_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", + " 'label': 0 type=int},\n", + " {'premise': [2019, 3080, 2158, 2003, 5948, 4589, 10869, 2012, 1037, 4825, 1012] type=list,\n", + " 'hypothesis': [100, 2158, 1999, 1037, 4825, 2003, 3403, 2005, 2010, 7954, 2000, 7180, 1012] type=list,\n", + " 'premise_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", + " 'hypothesis_len': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] type=list,\n", " 'label': 1 type=int})" ] }, @@ -769,7 +760,7 @@ " 'num_classes': 3,\n", " 'gpu': True,\n", " 'batch_size': 32,\n", - " 'vocab_size': 165}" + " 'vocab_size': 156}" ] }, "execution_count": 26, @@ -797,7 +788,7 @@ "ESIM(\n", " (drop): Dropout(p=0.3)\n", " (embedding): Embedding(\n", - " (embed): Embedding(165, 300, padding_idx=0)\n", + " (embed): Embedding(156, 300, padding_idx=0)\n", " (dropout): Dropout(p=0.3)\n", " )\n", " (embedding_layer): Linear(\n", @@ -821,7 +812,6 @@ " )\n", " (output): Linear(in_features=300, out_features=3, bias=True)\n", " (dropout): Dropout(p=0.3)\n", - " (hidden_active): Tanh()\n", " )\n", ")" ] @@ -848,7 +838,7 @@ "text/plain": [ "CNNText(\n", " (embed): Embedding(\n", - " (embed): Embedding(165, 50, padding_idx=0)\n", + " (embed): Embedding(156, 50, padding_idx=0)\n", " (dropout): Dropout(p=0.0)\n", " )\n", " (conv_pool): ConvMaxpool(\n", @@ -1019,43 +1009,49 @@ "name": "stdout", "output_type": "stream", "text": [ - "training epochs started 2019-01-09 00-08-17\n", - "[tester] \n", - "AccuracyMetric: acc=0.206897\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/remote-home/ygxu/anaconda3/envs/no-fastnlp/lib/python3.7/site-packages/torch/nn/functional.py:1320: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.\n", - " warnings.warn(\"nn.functional.tanh is deprecated. Use torch.tanh instead.\")\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[tester] \n", - "AccuracyMetric: acc=0.206897\n", - "[tester] \n", - "AccuracyMetric: acc=0.206897\n", - "[tester] \n", - "AccuracyMetric: acc=0.206897\n", - "[tester] \n", - "AccuracyMetric: acc=0.206897\n", + "training epochs started 2019-04-14-23-22-28\n", + "[epoch: 1 step: 1] train loss: 1.51372 time: 0:00:00\n", + "[epoch: 1 step: 2] train loss: 1.26874 time: 0:00:00\n", + "[epoch: 1 step: 3] train loss: 1.49786 time: 0:00:00\n", + "[epoch: 1 step: 4] train loss: 1.37505 time: 0:00:00\n", + "Evaluation at Epoch 1/5. Step:4/20. 
AccuracyMetric: acc=0.344828\n", + "\n", + "[epoch: 2 step: 5] train loss: 1.21877 time: 0:00:00\n", + "[epoch: 2 step: 6] train loss: 1.14183 time: 0:00:00\n", + "[epoch: 2 step: 7] train loss: 1.15934 time: 0:00:00\n", + "[epoch: 2 step: 8] train loss: 1.55148 time: 0:00:00\n", + "Evaluation at Epoch 2/5. Step:8/20. AccuracyMetric: acc=0.344828\n", "\n", - "In Epoch:1/Step:4, got best dev performance:AccuracyMetric: acc=0.206897\n", + "[epoch: 3 step: 9] train loss: 1.1457 time: 0:00:00\n", + "[epoch: 3 step: 10] train loss: 1.0547 time: 0:00:00\n", + "[epoch: 3 step: 11] train loss: 1.40139 time: 0:00:00\n", + "[epoch: 3 step: 12] train loss: 0.551445 time: 0:00:00\n", + "Evaluation at Epoch 3/5. Step:12/20. AccuracyMetric: acc=0.275862\n", + "\n", + "[epoch: 4 step: 13] train loss: 1.07965 time: 0:00:00\n", + "[epoch: 4 step: 14] train loss: 1.04118 time: 0:00:00\n", + "[epoch: 4 step: 15] train loss: 1.11719 time: 0:00:00\n", + "[epoch: 4 step: 16] train loss: 1.09861 time: 0:00:00\n", + "Evaluation at Epoch 4/5. Step:16/20. AccuracyMetric: acc=0.275862\n", + "\n", + "[epoch: 5 step: 17] train loss: 1.10795 time: 0:00:00\n", + "[epoch: 5 step: 18] train loss: 1.26715 time: 0:00:00\n", + "[epoch: 5 step: 19] train loss: 1.19875 time: 0:00:00\n", + "[epoch: 5 step: 20] train loss: 1.09862 time: 0:00:00\n", + "Evaluation at Epoch 5/5. Step:20/20. AccuracyMetric: acc=0.37931\n", + "\n", + "\n", + "In Epoch:5/Step:20, got best dev performance:AccuracyMetric: acc=0.37931\n", "Reloaded the best model.\n" ] }, { "data": { "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.206897}},\n", - " 'best_epoch': 1,\n", - " 'best_step': 4,\n", - " 'seconds': 0.79}" + "{'best_eval': {'AccuracyMetric': {'acc': 0.37931}},\n", + " 'best_epoch': 5,\n", + " 'best_step': 20,\n", + " 'seconds': 0.5}" ] }, "execution_count": 29, @@ -1070,8 +1066,8 @@ "trainer = Trainer(\n", " train_data=train_data,\n", " model=model,\n", - " loss=CrossEntropyLoss(pred='pred', target='label'),\n", - " metrics=AccuracyMetric(),\n", + " loss=CrossEntropyLoss(pred='pred', target='label'), # 模型预测值通过'pred'来取得,目标值(ground truth)由'label'取得\n", + " metrics=AccuracyMetric(target='label'), # 目标值(ground truth)由'label'取得\n", " n_epochs=5,\n", " batch_size=16,\n", " print_every=-1,\n", @@ -1113,13 +1109,13 @@ "output_type": "stream", "text": [ "[tester] \n", - "AccuracyMetric: acc=0.263158\n" + "AccuracyMetric: acc=0.368421\n" ] }, { "data": { "text/plain": [ - "{'AccuracyMetric': {'acc': 0.263158}}" + "{'AccuracyMetric': {'acc': 0.368421}}" ] }, "execution_count": 30, @@ -1131,12 +1127,33 @@ "tester = Tester(\n", " data=test_data,\n", " model=model,\n", - " metrics=AccuracyMetric(),\n", + " metrics=AccuracyMetric(target='label'),\n", " batch_size=args[\"batch_size\"],\n", ")\n", "tester.test()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -1161,7 +1178,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.7.0" } }, "nbformat": 4, From b69f8985c8e74ab063645daa5524f04759a2d2b0 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 17 Apr 2019 20:24:09 +0800 Subject: [PATCH 08/13] =?UTF-8?q?1.=20=E5=9C=A8embedding=5Floader=E4=B8=AD?= 
=?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=96=B0=E7=9A=84=E8=AF=BB=E5=8F=96=E5=87=BD?= =?UTF-8?q?=E6=95=B0load=5Fwith=5Fvocab(),=20load=5Fwithout=5Fvocab,=20?= =?UTF-8?q?=E6=AF=94=E4=B9=8B=E5=89=8D=E7=9A=84=E5=87=BD=E6=95=B0=E6=94=B9?= =?UTF-8?q?=E5=8F=98=E4=B8=BB=E8=A6=81=E5=9C=A8(1)=E4=B8=8D=E5=86=8D?= =?UTF-8?q?=E9=9C=80=E8=A6=81=E4=BC=A0=E5=85=A5embed=5Fdim(2)=E8=87=AA?= =?UTF-8?q?=E5=8A=A8=E5=88=A4=E6=96=AD=E5=BD=93=E5=89=8D=E6=98=AFword2vec?= =?UTF-8?q?=E8=BF=98=E6=98=AFglove.=202.=20vocabulary=E5=A2=9E=E5=8A=A0fro?= =?UTF-8?q?m=5Fdataset(),=20index=5Fdataset()=E5=87=BD=E6=95=B0=E3=80=82?= =?UTF-8?q?=E9=81=BF=E5=85=8D=E9=9C=80=E8=A6=81=E5=A4=9A=E8=A1=8C=E5=86=99?= =?UTF-8?q?index=20dataset=E7=9A=84=E9=97=AE=E9=A2=98=E3=80=82=203.=20?= =?UTF-8?q?=E5=9C=A8utils=E4=B8=AD=E6=96=B0=E5=A2=9E=E4=B8=80=E4=B8=AAcach?= =?UTF-8?q?e=5Fresult()=E4=BF=AE=E9=A5=B0=E5=99=A8=EF=BC=8C=E7=94=A8?= =?UTF-8?q?=E4=BA=8Ecache=E5=87=BD=E6=95=B0=E7=9A=84=E8=BF=94=E5=9B=9E?= =?UTF-8?q?=E5=80=BC=E3=80=82=204.=20callback=E4=B8=AD=E6=96=B0=E5=A2=9Eup?= =?UTF-8?q?date=5Fevery=E5=B1=9E=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/__init__.py | 4 +- fastNLP/core/callback.py | 4 ++ fastNLP/core/dataset.py | 8 +-- fastNLP/core/utils.py | 58 ++++++++++++++++++ fastNLP/core/vocabulary.py | 64 ++++++++++++++++++- fastNLP/io/__init__.py | 1 + fastNLP/io/embed_loader.py | 96 +++++++++++++++++++++++++++++ fastNLP/io/logger.py | 35 ----------- test/core/test_utils.py | 115 +++++++++++++++++++++++++++++++++++ test/core/test_vocabulary.py | 38 ++++++++++++ test/io/test_embed_loader.py | 32 ++++++++++ 11 files changed, 410 insertions(+), 45 deletions(-) delete mode 100644 fastNLP/io/logger.py create mode 100644 test/core/test_utils.py diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 0bb6a2dd..dbe86953 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -1,5 +1,5 @@ from .batch import Batch -# from .dataset import DataSet +from .dataset import DataSet from .fieldarray import FieldArray from .instance import Instance from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward @@ -9,5 +9,5 @@ from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSample from .tester import Tester from .trainer import Trainer from .vocabulary import Vocabulary -from ..io.dataset_loader import DataSet from .callback import Callback +from .utils import cache_results \ No newline at end of file diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 01f6ce68..2ee5b3a6 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -61,6 +61,10 @@ class Callback(object): """If use_tqdm, return trainer's tqdm print bar, else return None.""" return self._trainer.pbar + @property + def update_every(self): + """The model in trainer will update parameters every `update_every` batches.""" + return self._trainer.update_every def on_train_begin(self): # before the main training loop pass diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 068afb38..7b0e3b9a 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -6,7 +6,6 @@ from fastNLP.core.fieldarray import AutoPadder from fastNLP.core.fieldarray import FieldArray from fastNLP.core.instance import Instance from fastNLP.core.utils import get_func_signature -from fastNLP.io.base_loader import DataLoaderRegister class DataSet(object): @@ -105,11 +104,6 @@ class DataSet(object): raise AttributeError if 
isinstance(item, str) and item in self.field_arrays:
             return self.field_arrays[item]
-        try:
-            reader = DataLoaderRegister.get_reader(item)
-            return reader
-        except AttributeError:
-            raise

     def __setstate__(self, state):
         self.__dict__ = state
@@ -369,7 +363,7 @@ class DataSet(object):

         :return dataset: the read data set
         """
-        with open(csv_path, "r") as f:
+        with open(csv_path, "r", encoding='utf-8') as f:
             start_idx = 0
             if headers is None:
                 headers = f.readline().rstrip('\r\n')
diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py
index 695efdfc..d9141412 100644
--- a/fastNLP/core/utils.py
+++ b/fastNLP/core/utils.py
@@ -11,6 +11,66 @@ import torch
 CheckRes = namedtuple('CheckRes', ['missing', 'unused', 'duplicated', 'required',
                                    'all_needed', 'varargs'])

+def _prepare_cache_filepath(filepath):
+    """
+    Check whether `filepath` can be used as a cache file; if it can, create the parent directory automatically.
+    :param filepath: str.
+    :return: None; if the path is invalid, this function raises an error
+    """
+    _cache_filepath = os.path.abspath(filepath)
+    if os.path.isdir(_cache_filepath):
+        raise RuntimeError("The cache_file_path must be a file, not a directory.")
+    cache_dir = os.path.dirname(_cache_filepath)
+    if not os.path.exists(cache_dir):
+        os.makedirs(cache_dir)
+
+
+def cache_results(cache_filepath, refresh=False, verbose=1):
+    def wrapper_(func):
+        signature = inspect.signature(func)
+        for key, _ in signature.parameters.items():
+            if key in ('cache_filepath', 'refresh', 'verbose'):
+                raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))
+        def wrapper(*args, **kwargs):
+            if 'cache_filepath' in kwargs:
+                _cache_filepath = kwargs.pop('cache_filepath')
+                assert isinstance(_cache_filepath, str), "cache_filepath can only be str."
+            else:
+                _cache_filepath = cache_filepath
+            if 'refresh' in kwargs:
+                _refresh = kwargs.pop('refresh')
+                assert isinstance(_refresh, bool), "refresh can only be bool."
+            else:
+                _refresh = refresh
+            if 'verbose' in kwargs:
+                _verbose = kwargs.pop('verbose')
+                assert isinstance(_verbose, int), "verbose can only be integer."
+            else:
+                _verbose = verbose
+            refresh_flag = True
+
+            if _cache_filepath is not None and _refresh is False:
+                # load data
+                if os.path.exists(_cache_filepath):
+                    with open(_cache_filepath, 'rb') as f:
+                        results = _pickle.load(f)
+                    if _verbose == 1:
+                        print("Read cache from {}.".format(_cache_filepath))
+                    refresh_flag = False
+
+            if refresh_flag:
+                results = func(*args, **kwargs)
+                if _cache_filepath is not None:
+                    if results is None:
+                        raise RuntimeError("The return value is None. Delete the decorator.")
+                    _prepare_cache_filepath(_cache_filepath)
+                    with open(_cache_filepath, 'wb') as f:
+                        _pickle.dump(results, f)
+                    print("Save cache to {}.".format(_cache_filepath))
+
+            return results
+        return wrapper
+    return wrapper_

 def save_pickle(obj, pickle_path, file_name):
     """Save an object into a pickle file.
diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py
index a1c8e678..a73ce2c7 100644
--- a/fastNLP/core/vocabulary.py
+++ b/fastNLP/core/vocabulary.py
@@ -1,5 +1,5 @@
 from collections import Counter
-
+from fastNLP.core.dataset import DataSet

 def check_build_vocab(func):
     """A decorator to make sure the indexing is built before used.
@@ -151,6 +151,68 @@ class Vocabulary(object):
         else:
             raise ValueError("word {} not in vocabulary".format(w))

+    @check_build_vocab
+    def index_dataset(self, *datasets, field_name, new_field_name=None):
+        """
+        example:
+            # remember to use `field_name`
+            vocab.index_dataset(tr_data, dev_data, te_data, field_name='words')
+
+        :param datasets: fastNLP Dataset type.
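A usage sketch for the cache_results decorator added above (file paths are illustrative; per-call overrides use the reserved cache_filepath/refresh/verbose keywords popped inside wrapper()):

    from fastNLP.core.utils import cache_results

    @cache_results('caches/preprocess.pkl')
    def preprocess(path):
        return [line for line in open(path, encoding='utf-8')]

    data = preprocess('train.txt')                                # first call runs and saves the cache
    data = preprocess('train.txt', refresh=True)                  # force recomputation
    data = preprocess('train.txt', cache_filepath='caches/alt.pkl')  # alternate cache file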
you can pass multiple datasets + :param field_name: str, what field to index. Only support 0,1,2 dimension. + :param new_field_name: str. What the indexed field should be named, default is to overwrite field_name + :return: + """ + def index_instance(ins): + """ + 有几种情况, str, 1d-list, 2d-list + :param ins: + :return: + """ + field = ins[field_name] + if isinstance(field, str): + return self.to_index(field) + elif isinstance(field, list): + if not isinstance(field[0], list): + return [self.to_index(w) for w in field] + else: + if isinstance(field[0][0], list): + raise RuntimeError("Only support field with 2 dimensions.") + return[[self.to_index(c) for c in w] for w in field] + + if new_field_name is None: + new_field_name = field_name + for dataset in datasets: + if isinstance(dataset, DataSet): + dataset.apply(index_instance, new_field_name=new_field_name) + else: + raise RuntimeError("Only DataSet type is allowed.") + + def from_dataset(self, *datasets, field_name): + """ + Construct vocab from dataset. + + :param datasets: DataSet. + :param field_name: str, what field is used to construct dataset. + :return: + """ + def construct_vocab(ins): + field = ins[field_name] + if isinstance(field, str): + self.add_word(field) + elif isinstance(field, list): + if not isinstance(field[0], list): + self.add_word_lst(field) + else: + if isinstance(field[0][0], list): + raise RuntimeError("Only support field with 2 dimensions.") + [self.add_word_lst(w) for w in field] + for dataset in datasets: + if isinstance(dataset, DataSet): + dataset.apply(construct_vocab) + else: + raise RuntimeError("Only DataSet type is allowed.") + def to_index(self, w): """ Turn a word to an index. If w is not in Vocabulary, return the unknown label. diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index e69de29b..a3b18aa5 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -0,0 +1 @@ +from .embed_loader import EmbedLoader diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 1615fb7f..08a55aa6 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,3 +1,5 @@ +import os + import numpy as np import torch @@ -124,3 +126,97 @@ class EmbedLoader(BaseLoader): size=(len(vocab) - np.sum(hit_flags), emb_dim)) embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors return embedding_matrix + + @staticmethod + def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True): + """ + load pretraining embedding in {embed_file} based on words in vocab. Words in vocab but not in the pretraining + embedding are initialized from a normal distribution which has the mean and std of the found words vectors. + The embedding type is determined automatically, support glove and word2vec(the first line only has two elements). + + :param embed_filepath: str, where to read pretrain embedding + :param vocab: Vocabulary. + :param dtype: the dtype of the embedding matrix + :param normalize: bool, whether to normalize each word vector so that every vector has norm 1. + :return: np.ndarray() will have the same [len(vocab), dimension], dimension is determined by the pretrain + embedding + """ + assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported." 
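The new Vocabulary helpers in one pass, sketched (field contents are illustrative):

    from fastNLP.core.dataset import DataSet
    from fastNLP.core.vocabulary import Vocabulary

    ds = DataSet({"words": [["this", "is", "fine"], ["so", "is", "this"]]})
    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')   # build the vocab in one call
    vocab.index_dataset(ds, field_name='words')  # replace tokens with indices in place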
+ if not os.path.exists(embed_filepath): + raise FileNotFoundError("`{}` does not exist.".format(embed_filepath)) + with open(embed_filepath, 'r', encoding='utf-8') as f: + hit_flags = np.zeros(len(vocab), dtype=bool) + line = f.readline().strip() + parts = line.split() + if len(parts)==2: + dim = int(parts[1]) + else: + dim = len(parts)-1 + f.seek(0) + matrix = np.random.randn(len(vocab), dim).astype(dtype) + for line in f: + parts = line.strip().split() + if parts[0] in vocab: + index = vocab.to_index(parts[0]) + matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) + hit_flags[index] = True + total_hits = sum(hit_flags) + print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) + found_vectors = matrix[hit_flags] + if len(found_vectors)!=0: + mean = np.mean(found_vectors, axis=1, keepdims=True) + std = np.std(found_vectors, axis=1, keepdims=True) + unfound_vec_num = len(vocab) - total_hits + r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype)*std + mean + matrix[hit_flags==False] = r_vecs + + if normalize: + matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) + + return matrix + + @staticmethod + def load_without_vocab(embed_filepath, dtype=np.float32, padding='', unknown='', normalize=True): + """ + load pretraining embedding in {embed_file}. And construct a Vocabulary based on the pretraining embedding. + The embedding type is determined automatically, support glove and word2vec(the first line only has two elements). + + :param embed_filepath: str, where to read pretrain embedding + :param dtype: the dtype of the embedding matrix + :param padding: the padding tag for vocabulary. + :param unknown: the unknown tag for vocabulary. + :param normalize: bool, whether to normalize each word vector so that every vector has norm 1. + :return: np.ndarray() is determined by the pretraining embeddings + Vocabulary: contain all pretraining words and two special tag[, ] + + """ + vocab = Vocabulary(padding=padding, unknown=unknown) + vec_dict = {} + + with open(embed_filepath, 'r', encoding='utf-8') as f: + line = f.readline() + start = 1 + dim = -1 + if len(line.strip().split())!=2: + f.seek(0) + start = 0 + for idx, line in enumerate(f, start=start): + parts = line.strip().split() + word = parts[0] + if dim==-1: + dim = len(parts)-1 + vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) + vec_dict[word] = vec + vocab.add_word(word) + if dim==-1: + raise RuntimeError("{} is an empty file.".format(embed_filepath)) + matrix = np.random.randn(len(vocab), dim).astype(dtype) + + for key, vec in vec_dict.items(): + index = vocab.to_index(key) + matrix[index] = vec + + if normalize: + matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) + + return matrix, vocab diff --git a/fastNLP/io/logger.py b/fastNLP/io/logger.py deleted file mode 100644 index 9e9730db..00000000 --- a/fastNLP/io/logger.py +++ /dev/null @@ -1,35 +0,0 @@ -import logging -import os - - -def create_logger(logger_name, log_path, log_format=None, log_level=logging.INFO): - """Create a logger. 
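The two loaders side by side, as a sketch (the embedding file path is illustrative; dimension and format, glove vs word2vec, are inferred from the file):

    from fastNLP.core.vocabulary import Vocabulary
    from fastNLP.io import EmbedLoader

    vocab = Vocabulary()
    vocab.add_word_lst(["the", "of"])
    matrix = EmbedLoader.load_with_vocab('embeddings/sample.txt', vocab)
    # or derive the vocabulary from the embedding file itself:
    matrix, vocab = EmbedLoader.load_without_vocab('embeddings/sample.txt', normalize=True)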
- - :param str logger_name: - :param str log_path: - :param log_format: - :param log_level: - :return: logger - - To use a logger:: - - logger.debug("this is a debug message") - logger.info("this is a info message") - logger.warning("this is a warning message") - logger.error("this is an error message") - """ - logger = logging.getLogger(logger_name) - logger.setLevel(log_level) - if log_path is None: - handler = logging.StreamHandler() - else: - os.stat(os.path.dirname(os.path.abspath(log_path))) - handler = logging.FileHandler(log_path) - handler.setLevel(log_level) - if log_format is None: - log_format = "[%(asctime)s %(name)-13s %(levelname)s %(process)d %(thread)d " \ - "%(filename)s:%(lineno)-5d] %(message)s" - formatter = logging.Formatter(log_format) - handler.setFormatter(formatter) - logger.addHandler(handler) - return logger diff --git a/test/core/test_utils.py b/test/core/test_utils.py new file mode 100644 index 00000000..5c325127 --- /dev/null +++ b/test/core/test_utils.py @@ -0,0 +1,115 @@ + +import unittest +import _pickle +from fastNLP import cache_results +from fastNLP.io.embed_loader import EmbedLoader +from fastNLP import DataSet +from fastNLP import Instance +import time +import os + +@cache_results('test/demo1.pkl') +def process_data_1(embed_file, cws_train): + embed, vocab = EmbedLoader.load_without_vocab(embed_file) + time.sleep(1) # 测试是否通过读取cache获得结果 + with open(cws_train, 'r', encoding='utf-8') as f: + d = DataSet() + for line in f: + line = line.strip() + if len(line)>0: + d.append(Instance(raw=line)) + return embed, vocab, d + + +class TestCache(unittest.TestCase): + def test_cache_save(self): + try: + start_time = time.time() + embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train') + end_time = time.time() + pre_time = end_time - start_time + with open('test/demo1.pkl', 'rb') as f: + _embed, _vocab, _d = _pickle.load(f) + self.assertEqual(embed.shape, _embed.shape) + for i in range(embed.shape[0]): + self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) + start_time = time.time() + embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train') + end_time = time.time() + read_time = end_time - start_time + print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time)) + self.assertGreater(pre_time-0.5, read_time) + finally: + os.remove('test/demo1.pkl') + + def test_cache_save_overwrite_path(self): + try: + start_time = time.time() + embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + cache_filepath='test/demo_overwrite.pkl') + end_time = time.time() + pre_time = end_time - start_time + with open('test/demo_overwrite.pkl', 'rb') as f: + _embed, _vocab, _d = _pickle.load(f) + self.assertEqual(embed.shape, _embed.shape) + for i in range(embed.shape[0]): + self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) + start_time = time.time() + embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + cache_filepath='test/demo_overwrite.pkl') + end_time = time.time() + read_time = end_time - start_time + print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time)) + self.assertGreater(pre_time-0.5, read_time) + finally: + os.remove('test/demo_overwrite.pkl') + + def test_cache_refresh(self): + try: + start_time = time.time() + embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 
'test/data_for_tests/cws_train', + refresh=True) + end_time = time.time() + pre_time = end_time - start_time + with open('test/demo1.pkl', 'rb') as f: + _embed, _vocab, _d = _pickle.load(f) + self.assertEqual(embed.shape, _embed.shape) + for i in range(embed.shape[0]): + self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) + start_time = time.time() + embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + refresh=True) + end_time = time.time() + read_time = end_time - start_time + print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time)) + self.assertGreater(0.1, pre_time-read_time) + finally: + os.remove('test/demo1.pkl') + + def test_duplicate_keyword(self): + with self.assertRaises(RuntimeError): + @cache_results(None) + def func_verbose(a, verbose): + pass + func_verbose(0, 1) + with self.assertRaises(RuntimeError): + @cache_results(None) + def func_cache(a, cache_filepath): + pass + func_cache(1, 2) + with self.assertRaises(RuntimeError): + @cache_results(None) + def func_refresh(a, refresh): + pass + func_refresh(1, 2) + + def test_create_cache_dir(self): + @cache_results('test/demo1/demo.pkl') + def cache(): + return 1, 2 + try: + results = cache() + print(results) + finally: + os.remove('test/demo1/demo.pkl') + os.rmdir('test/demo1') \ No newline at end of file diff --git a/test/core/test_vocabulary.py b/test/core/test_vocabulary.py index 2f9cd3b1..0f13b935 100644 --- a/test/core/test_vocabulary.py +++ b/test/core/test_vocabulary.py @@ -2,6 +2,8 @@ import unittest from collections import Counter from fastNLP.core.vocabulary import Vocabulary +from fastNLP.core.dataset import DataSet +from fastNLP.core.instance import Instance text = ["FastNLP", "works", "well", "in", "most", "cases", "and", "scales", "well", "in", "works", "well", "in", "most", "cases", "scales", "well"] @@ -31,6 +33,42 @@ class TestAdd(unittest.TestCase): vocab.update(text) self.assertEqual(vocab.word_count, counter) + def test_from_dataset(self): + start_char = 65 + num_samples = 10 + + # 0 dim + dataset = DataSet() + for i in range(num_samples): + ins = Instance(char=chr(start_char+i)) + dataset.append(ins) + vocab = Vocabulary() + vocab.from_dataset(dataset, field_name='char') + for i in range(num_samples): + self.assertEqual(vocab.to_index(chr(start_char+i)), i+2) + vocab.index_dataset(dataset, field_name='char') + + # 1 dim + dataset = DataSet() + for i in range(num_samples): + ins = Instance(char=[chr(start_char+i)]*6) + dataset.append(ins) + vocab = Vocabulary() + vocab.from_dataset(dataset, field_name='char') + for i in range(num_samples): + self.assertEqual(vocab.to_index(chr(start_char+i)), i+2) + vocab.index_dataset(dataset, field_name='char') + + # 2 dim + dataset = DataSet() + for i in range(num_samples): + ins = Instance(char=[[chr(start_char+i) for _ in range(6)] for _ in range(6)]) + dataset.append(ins) + vocab = Vocabulary() + vocab.from_dataset(dataset, field_name='char') + for i in range(num_samples): + self.assertEqual(vocab.to_index(chr(start_char+i)), i+2) + vocab.index_dataset(dataset, field_name='char') class TestIndexing(unittest.TestCase): def test_len(self): diff --git a/test/io/test_embed_loader.py b/test/io/test_embed_loader.py index 60e3710e..3f1fb5e7 100644 --- a/test/io/test_embed_loader.py +++ b/test/io/test_embed_loader.py @@ -1,4 +1,5 @@ import unittest +import numpy as np from fastNLP.core.vocabulary import Vocabulary from fastNLP.io.embed_loader import EmbedLoader @@ -10,3 +11,34 @@ class 
TestEmbedLoader(unittest.TestCase): vocab.update(["the", "in", "I", "to", "of", "hahaha"]) embedding = EmbedLoader().fast_load_embedding(50, "test/data_for_tests/glove.6B.50d_test.txt", vocab) self.assertEqual(tuple(embedding.shape), (len(vocab), 50)) + + def test_load_with_vocab(self): + vocab = Vocabulary() + glove = "test/data_for_tests/glove.6B.50d_test.txt" + word2vec = "test/data_for_tests/word2vec_test.txt" + vocab.add_word('the') + g_m = EmbedLoader.load_with_vocab(glove, vocab) + self.assertEqual(g_m.shape, (3, 50)) + w_m = EmbedLoader.load_with_vocab(word2vec, vocab, normalize=True) + self.assertEqual(w_m.shape, (3, 50)) + self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 3) + + def test_load_without_vocab(self): + words = ['the', 'of', 'in', 'a', 'to', 'and'] + glove = "test/data_for_tests/glove.6B.50d_test.txt" + word2vec = "test/data_for_tests/word2vec_test.txt" + g_m, vocab = EmbedLoader.load_without_vocab(glove) + self.assertEqual(g_m.shape, (8, 50)) + for word in words: + self.assertIn(word, vocab) + w_m, vocab = EmbedLoader.load_without_vocab(word2vec, normalize=True) + self.assertEqual(w_m.shape, (8, 50)) + self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 8) + for word in words: + self.assertIn(word, vocab) + # no unk + w_m, vocab = EmbedLoader.load_without_vocab(word2vec, normalize=True, unknown=None) + self.assertEqual(w_m.shape, (7, 50)) + self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 7) + for word in words: + self.assertIn(word, vocab) \ No newline at end of file From c1ee0b27dfb5daa8a0f83f161514f07d4075bb4f Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 21 Apr 2019 09:12:42 +0800 Subject: [PATCH 09/13] =?UTF-8?q?1.DataSet.apply()=E6=8A=A5=E9=94=99?= =?UTF-8?q?=E6=97=B6=E6=8F=90=E4=BE=9B=E9=94=99=E8=AF=AF=E7=9A=84index=202?= =?UTF-8?q?.Vocabulary.from=5Fdataset(),=20index=5Fdataset()=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=8A=A5=E9=94=99=E6=97=B6=E7=9A=84vocab=E9=A1=BA?= =?UTF-8?q?=E5=BA=8F=203.embedloader=E5=9C=A8embed=E8=AF=BB=E5=8F=96?= =?UTF-8?q?=E6=97=B6=E9=81=87=E5=88=B0=E4=B8=8D=E8=A7=84=E5=88=99=E7=9A=84?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=B7=B3=E8=BF=87=E8=BF=99=E4=B8=80=E8=A1=8C?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MANIFEST.in | 5 +++ fastNLP/core/dataset.py | 12 +++++- fastNLP/core/vocabulary.py | 17 ++++++-- fastNLP/io/embed_loader.py | 75 ++++++++++++++++++++++++++++-------- test/io/test_embed_loader.py | 7 ++-- 5 files changed, 91 insertions(+), 25 deletions(-) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..f04509c1 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include requirements.txt +include LICENSE +include README.md +prune test/ +prune reproduction/ diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 7b0e3b9a..76a34655 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -277,7 +277,17 @@ class DataSet(object): (2) is_target: boolean, will be ignored if new_field is None. If True, the new field will be as target. :return results: if new_field_name is not passed, returned values of the function over all instances. """ - results = [func(ins) for ins in self._inner_iter()] + assert len(self)!=0, "Null dataset cannot use .apply()." 
+ results = [] + idx = -1 + try: + for idx, ins in enumerate(self._inner_iter()): + results.append(func(ins)) + except Exception as e: + if idx!=-1: + print("Exception happens at the `{}`th instance.".format(idx)) + raise e + # results = [func(ins) for ins in self._inner_iter()] if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(get_func_signature(func=func))) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index a73ce2c7..c580dbec 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -182,9 +182,13 @@ class Vocabulary(object): if new_field_name is None: new_field_name = field_name - for dataset in datasets: + for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): - dataset.apply(index_instance, new_field_name=new_field_name) + try: + dataset.apply(index_instance, new_field_name=new_field_name) + except Exception as e: + print("When processing the `{}` dataset, the following error occurred.".format(idx)) + raise e else: raise RuntimeError("Only DataSet type is allowed.") @@ -207,11 +211,16 @@ class Vocabulary(object): if isinstance(field[0][0], list): raise RuntimeError("Only support field with 2 dimensions.") [self.add_word_lst(w) for w in field] - for dataset in datasets: + for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): - dataset.apply(construct_vocab) + try: + dataset.apply(construct_vocab) + except Exception as e: + print("When processing the `{}` dataset, the following error occurred.".format(idx)) + raise e else: raise RuntimeError("Only DataSet type is allowed.") + return self def to_index(self, w): """ Turn a word to an index. If w is not in Vocabulary, return the unknown label. diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 08a55aa6..5ad27c53 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -6,6 +6,7 @@ import torch from fastNLP.core.vocabulary import Vocabulary from fastNLP.io.base_loader import BaseLoader +import warnings class EmbedLoader(BaseLoader): """docstring for EmbedLoader""" @@ -128,7 +129,7 @@ class EmbedLoader(BaseLoader): return embedding_matrix @staticmethod - def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True): + def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): """ load pretraining embedding in {embed_file} based on words in vocab. Words in vocab but not in the pretraining embedding are initialized from a normal distribution which has the mean and std of the found words vectors. @@ -138,6 +139,8 @@ class EmbedLoader(BaseLoader): :param vocab: Vocabulary. :param dtype: the dtype of the embedding matrix :param normalize: bool, whether to normalize each word vector so that every vector has norm 1. + :param error: str, 'ignore', 'strict'; if 'ignore' errors will not raise. 
if 'strict', any badly formatted line will
+        raise an error.
    :return: np.ndarray() will have the shape [len(vocab), dimension], where the dimension is determined by the
        pretrained embedding
    """
@@ -148,24 +151,32 @@ class EmbedLoader(BaseLoader):
             hit_flags = np.zeros(len(vocab), dtype=bool)
             line = f.readline().strip()
             parts = line.split()
+            start_idx = 0
             if len(parts)==2:
                 dim = int(parts[1])
+                start_idx += 1
             else:
                 dim = len(parts)-1
                 f.seek(0)
             matrix = np.random.randn(len(vocab), dim).astype(dtype)
-            for line in f:
-                parts = line.strip().split()
-                if parts[0] in vocab:
-                    index = vocab.to_index(parts[0])
-                    matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
-                    hit_flags[index] = True
+            for idx, line in enumerate(f, start_idx):
+                try:
+                    parts = line.strip().split()
+                    if parts[0] in vocab:
+                        index = vocab.to_index(parts[0])
+                        matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
+                        hit_flags[index] = True
+                except Exception as e:
+                    if error == 'ignore':
+                        warnings.warn("Error occurred at line {}.".format(idx))
+                    else:
+                        raise e
             total_hits = sum(hit_flags)
             print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab)))
             found_vectors = matrix[hit_flags]
             if len(found_vectors)!=0:
-                mean = np.mean(found_vectors, axis=1, keepdims=True)
-                std = np.std(found_vectors, axis=1, keepdims=True)
+                mean = np.mean(found_vectors, axis=0, keepdims=True)
+                std = np.std(found_vectors, axis=0, keepdims=True)
                 unfound_vec_num = len(vocab) - total_hits
                 r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype)*std + mean
                 matrix[hit_flags==False] = r_vecs
@@ -176,7 +187,8 @@ class EmbedLoader(BaseLoader):
         return matrix
 
     @staticmethod
-    def load_without_vocab(embed_filepath, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True):
+    def load_without_vocab(embed_filepath, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True,
+                           error='ignore'):
         """
         Load the pretrained embedding from {embed_file}, and construct a Vocabulary based on it.
         The embedding type is determined automatically; both glove and word2vec are supported (in the word2vec format the first line has only two elements).
 
        :param embed_filepath: str, where to read the pretrained embedding
        :param dtype: the dtype of the embedding matrix
        :param padding: the padding tag for vocabulary.
        :param unknown: the unknown tag for vocabulary.
        :param normalize: bool, whether to normalize each word vector so that every vector has norm 1.
+    :param error: str, 'ignore', 'strict'; if 'ignore' errors will not raise.
if 'strict', any badly formatted line will
+        raise an error.
    :return: np.ndarray(), whose shape is determined by the pretrained embeddings
        Vocabulary: contains all pretrained words and two special tags [<pad>, <unk>]

    """
    vocab = Vocabulary(padding=padding, unknown=unknown)
    vec_dict = {}
+        found_unknown = False
+        found_pad = False
 
    with open(embed_filepath, 'r', encoding='utf-8') as f:
        line = f.readline()
@@ -201,16 +217,41 @@ class EmbedLoader(BaseLoader):
                 f.seek(0)
                 start = 0
             for idx, line in enumerate(f, start=start):
-                parts = line.strip().split()
-                word = parts[0]
-                if dim==-1:
-                    dim = len(parts)-1
-                vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
-                vec_dict[word] = vec
-                vocab.add_word(word)
+                try:
+                    parts = line.strip().split()
+                    word = parts[0]
+                    if dim==-1:
+                        dim = len(parts)-1
+                    vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
+                    vec_dict[word] = vec
+                    vocab.add_word(word)
+                    if unknown is not None and unknown==word:
+                        found_unknown = True
+                    if padding is not None and padding==word:
+                        found_pad = True
+                except Exception as e:
+                    if error=='ignore':
+                        warnings.warn("Error occurred at line {}.".format(idx))
+                        pass
+                    else:
+                        raise e
         if dim==-1:
             raise RuntimeError("{} is an empty file.".format(embed_filepath))
         matrix = np.random.randn(len(vocab), dim).astype(dtype)
+            # TODO should the unknown vector be drawn from the same distribution as the other data?
+            if (unknown is not None and not found_unknown) or (padding is not None and not found_pad):
+                start_idx = 0
+                if padding is not None:
+                    start_idx += 1
+                if unknown is not None:
+                    start_idx += 1
+
+                mean = np.mean(matrix[start_idx:], axis=0, keepdims=True)
+                std = np.std(matrix[start_idx:], axis=0, keepdims=True)
+                if (unknown is not None and not found_unknown):
+                    matrix[start_idx-1] = np.random.randn(1, dim).astype(dtype)*std + mean
+                if (padding is not None and not found_pad):
+                    matrix[0] = np.random.randn(1, dim).astype(dtype)*std + mean
 
    for key, vec in vec_dict.items():
        index = vocab.to_index(key)
diff --git a/test/io/test_embed_loader.py b/test/io/test_embed_loader.py
index 3f1fb5e7..9e325334 100644
--- a/test/io/test_embed_loader.py
+++ b/test/io/test_embed_loader.py
@@ -17,11 +17,12 @@ class TestEmbedLoader(unittest.TestCase):
         glove = "test/data_for_tests/glove.6B.50d_test.txt"
         word2vec = "test/data_for_tests/word2vec_test.txt"
         vocab.add_word('the')
+        vocab.add_word('none')
         g_m = EmbedLoader.load_with_vocab(glove, vocab)
-        self.assertEqual(g_m.shape, (3, 50))
+        self.assertEqual(g_m.shape, (4, 50))
         w_m = EmbedLoader.load_with_vocab(word2vec, vocab, normalize=True)
-        self.assertEqual(w_m.shape, (3, 50))
-        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 3)
+        self.assertEqual(w_m.shape, (4, 50))
+        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 4)
 
     def test_load_without_vocab(self):
         words = ['the', 'of', 'in', 'a', 'to', 'and']
From 9d43239fc17a8ec6029b5ef20f175cd4a6d9008b Mon Sep 17 00:00:00 2001
From: xuyige
Date: Sun, 21 Apr 2019 15:41:20 +0800
Subject: [PATCH 10/13] update attention

---
 fastNLP/models/snli.py                  | 38 ++++++++++++------------
 fastNLP/modules/aggregator/__init__.py  |  2 +-
 fastNLP/modules/aggregator/attention.py | 36 ++++++++++++++++-------
 fastNLP/modules/encoder/transformer.py  |  4 +--
 4 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py
index 6a7d8d84..5816d2af 100644
--- a/fastNLP/models/snli.py
+++ b/fastNLP/models/snli.py
@@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 from fastNLP.models.base_model import BaseModel
 from fastNLP.modules import
decoder as Decoder
@@ -40,7 +39,7 @@ class ESIM(BaseModel):
                             batch_first=self.batch_first,
                             bidirectional=True
                             )
 
-        self.bi_attention = Aggregator.Bi_Attention()
+        self.bi_attention = Aggregator.BiAttention()
         self.mean_pooling = Aggregator.MeanPoolWithMask()
         self.max_pooling = Aggregator.MaxPoolWithMask()
 
@@ -53,23 +52,23 @@ class ESIM(BaseModel):
 
         self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh', dropout=self.dropout)
 
-    def forward(self, premise, hypothesis, premise_len, hypothesis_len):
+    def forward(self, words1, words2, seq_len1, seq_len2):
         """ Forward function
 
-        :param premise: A Tensor represents premise: [batch size(B), premise seq len(PL)].
-        :param hypothesis: A Tensor represents hypothesis: [B, hypothesis seq len(HL)].
-        :param premise_len: A Tensor record which is a real word and which is a padding word in premise: [B, PL].
-        :param hypothesis_len: A Tensor record which is a real word and which is a padding word in hypothesis: [B, HL].
+        :param words1: A Tensor representing the premise: [batch size(B), premise seq len(PL)].
+        :param words2: A Tensor representing the hypothesis: [B, hypothesis seq len(HL)].
+        :param seq_len1: A Tensor recording which positions in the premise are real words and which are padding: [B, PL].
+        :param seq_len2: A Tensor recording which positions in the hypothesis are real words and which are padding: [B, HL].
        :return: prediction: A Dict with Tensor of classification result: [B, n_labels(N)].
        """
 
-        premise0 = self.embedding_layer(self.embedding(premise))
-        hypothesis0 = self.embedding_layer(self.embedding(hypothesis))
+        premise0 = self.embedding_layer(self.embedding(words1))
+        hypothesis0 = self.embedding_layer(self.embedding(words2))
 
         _BP, _PSL, _HP = premise0.size()
         _BH, _HSL, _HH = hypothesis0.size()
-        _BPL, _PLL = premise_len.size()
-        _HPL, _HLL = hypothesis_len.size()
+        _BPL, _PLL = seq_len1.size()
+        _HPL, _HLL = seq_len2.size()
 
         assert _BP == _BH and _BPL == _HPL and _BP == _BPL
         assert _HP == _HH
@@ -84,7 +83,7 @@ class ESIM(BaseModel):
         a = torch.mean(a0.view(B, PL, -1, H), dim=2)  # a: [B, PL, H]
         b = torch.mean(b0.view(B, HL, -1, H), dim=2)  # b: [B, HL, H]
 
-        ai, bi = self.bi_attention(a, b, premise_len, hypothesis_len)
+        ai, bi = self.bi_attention(a, b, seq_len1, seq_len2)
 
         ma = torch.cat((a, ai, a - ai, a * ai), dim=2)  # ma: [B, PL, 4 * H]
         mb = torch.cat((b, bi, b - bi, b * bi), dim=2)  # mb: [B, HL, 4 * H]
@@ -98,17 +97,18 @@ class ESIM(BaseModel):
         va = torch.mean(vat.view(B, PL, -1, H), dim=2)  # va: [B, PL, H]
         vb = torch.mean(vbt.view(B, HL, -1, H), dim=2)  # vb: [B, HL, H]
 
-        va_ave = self.mean_pooling(va, premise_len, dim=1)  # va_ave: [B, H]
-        va_max, va_arg_max = self.max_pooling(va, premise_len, dim=1)  # va_max: [B, H]
-        vb_ave = self.mean_pooling(vb, hypothesis_len, dim=1)  # vb_ave: [B, H]
-        vb_max, vb_arg_max = self.max_pooling(vb, hypothesis_len, dim=1)  # vb_max: [B, H]
+        va_ave = self.mean_pooling(va, seq_len1, dim=1)  # va_ave: [B, H]
+        va_max, va_arg_max = self.max_pooling(va, seq_len1, dim=1)  # va_max: [B, H]
+        vb_ave = self.mean_pooling(vb, seq_len2, dim=1)  # vb_ave: [B, H]
+        vb_max, vb_arg_max = self.max_pooling(vb, seq_len2, dim=1)  # vb_max: [B, H]
 
         v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1)  # v: [B, 4 * H]
 
-        prediction = F.tanh(self.output(v))  # prediction: [B, N]
+        prediction = torch.tanh(self.output(v))  # prediction: [B, N]
 
        return {'pred': prediction}
 
-    def predict(self, premise, hypothesis, premise_len, hypothesis_len):
-        return self.forward(premise, hypothesis, premise_len, hypothesis_len)
+    def predict(self, words1, words2,
seq_len1, seq_len2): + prediction = self.forward(words1, words2, seq_len1, seq_len2)['pred'] + return torch.argmax(prediction, dim=-1) diff --git a/fastNLP/modules/aggregator/__init__.py b/fastNLP/modules/aggregator/__init__.py index 2fabb89e..43d60cac 100644 --- a/fastNLP/modules/aggregator/__init__.py +++ b/fastNLP/modules/aggregator/__init__.py @@ -5,6 +5,6 @@ from .avg_pool import MeanPoolWithMask from .kmax_pool import KMaxPool from .attention import Attention -from .attention import Bi_Attention +from .attention import BiAttention from .self_attention import SelfAttention diff --git a/fastNLP/modules/aggregator/attention.py b/fastNLP/modules/aggregator/attention.py index ef9d159d..33d73a07 100644 --- a/fastNLP/modules/aggregator/attention.py +++ b/fastNLP/modules/aggregator/attention.py @@ -23,9 +23,9 @@ class Attention(torch.nn.Module): raise NotImplementedError -class DotAtte(nn.Module): +class DotAttention(nn.Module): def __init__(self, key_size, value_size, dropout=0.1): - super(DotAtte, self).__init__() + super(DotAttention, self).__init__() self.key_size = key_size self.value_size = value_size self.scale = math.sqrt(key_size) @@ -48,7 +48,7 @@ class DotAtte(nn.Module): return torch.matmul(output, V) -class MultiHeadAtte(nn.Module): +class MultiHeadAttention(nn.Module): def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1): """ @@ -58,7 +58,7 @@ class MultiHeadAtte(nn.Module): :param num_head: int,head的数量。 :param dropout: float。 """ - super(MultiHeadAtte, self).__init__() + super(MultiHeadAttention, self).__init__() self.input_size = input_size self.key_size = key_size self.value_size = value_size @@ -68,7 +68,7 @@ class MultiHeadAtte(nn.Module): self.q_in = nn.Linear(input_size, in_size) self.k_in = nn.Linear(input_size, in_size) self.v_in = nn.Linear(input_size, in_size) - self.attention = DotAtte(key_size=key_size, value_size=value_size) + self.attention = DotAttention(key_size=key_size, value_size=value_size) self.out = nn.Linear(value_size * num_head, input_size) self.drop = TimestepDropout(dropout) self.reset_parameters() @@ -109,16 +109,30 @@ class MultiHeadAtte(nn.Module): return output -class Bi_Attention(nn.Module): +class BiAttention(nn.Module): + """Bi Attention module + Calculate Bi Attention matrix `e` + .. 
math::
+
+        \begin{array}{ll} \\
+            e_{ij} = {a}^{\mathbf{T}}_{i}{b}_{j} \\
+            \hat{a}_i = \sum_{j=1}^{l_b}{\frac{\exp(e_{ij})}{\sum_{k=1}^{l_b}{\exp(e_{ik})}}}{b}_{j} \\
+            \hat{b}_j = \sum_{i=1}^{l_a}{\frac{\exp(e_{ij})}{\sum_{k=1}^{l_a}{\exp(e_{ik})}}}{a}_{i} \\
+        \end{array}
+    """
+
     def __init__(self):
-        super(Bi_Attention, self).__init__()
+        super(BiAttention, self).__init__()
         self.inf = 10e12
 
     def forward(self, in_x1, in_x2, x1_len, x2_len):
-        # in_x1: [batch_size, x1_seq_len, hidden_size]
-        # in_x2: [batch_size, x2_seq_len, hidden_size]
-        # x1_len: [batch_size, x1_seq_len]
-        # x2_len: [batch_size, x2_seq_len]
+        """
+        :param torch.Tensor in_x1: [batch_size, x1_seq_len, hidden_size] feature representation of the first sentence
+        :param torch.Tensor in_x2: [batch_size, x2_seq_len, hidden_size] feature representation of the second sentence
+        :param torch.Tensor x1_len: [batch_size, x1_seq_len] 0/1 mask matrix for the first sentence
+        :param torch.Tensor x2_len: [batch_size, x2_seq_len] 0/1 mask matrix for the second sentence
+        :return: torch.Tensor out_x1: [batch_size, x1_seq_len, hidden_size] attended representation of the first sentence
+                 torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] attended representation of the second sentence
+        """
 
         assert in_x1.size()[0] == in_x2.size()[0]
         assert in_x1.size()[2] == in_x2.size()[2]
diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py
index d7b8c544..d1262141 100644
--- a/fastNLP/modules/encoder/transformer.py
+++ b/fastNLP/modules/encoder/transformer.py
@@ -1,6 +1,6 @@
 from torch import nn
 
-from ..aggregator.attention import MultiHeadAtte
+from ..aggregator.attention import MultiHeadAttention
 from ..dropout import TimestepDropout
 
 
@@ -18,7 +18,7 @@ class TransformerEncoder(nn.Module):
     class SubLayer(nn.Module):
         def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1):
             super(TransformerEncoder.SubLayer, self).__init__()
-            self.atte = MultiHeadAtte(model_size, key_size, value_size, num_head, dropout)
+            self.atte = MultiHeadAttention(model_size, key_size, value_size, num_head, dropout)
             self.norm1 = nn.LayerNorm(model_size)
             self.ffn = nn.Sequential(nn.Linear(model_size, inner_size),
                                      nn.ReLU(),
From 967e5e568389db8f98fa27c43c2c065470b307f3 Mon Sep 17 00:00:00 2001
From: ChenXin
Date: Mon, 22 Apr 2019 01:31:41 +0800
Subject: [PATCH 11/13] doc tools

---
 docs/Makefile                              |   4 +
 docs/source/conf.py                        |   6 +-
 docs/source/fastNLP.api.rst                |  52 +++++--
 docs/source/fastNLP.core.rst               |  98 ++++++++----
 docs/source/fastNLP.io.rst                 |  48 +++---
 docs/source/fastNLP.models.rst             |  96 ++++++++++--
 docs/source/fastNLP.modules.aggregator.rst |  42 ++++--
 docs/source/fastNLP.modules.decoder.rst    |  24 ++-
 docs/source/fastNLP.modules.encoder.rst    |  74 ++++++---
 docs/source/fastNLP.modules.rst            |  33 ++++-
 docs/source/fastNLP.rst                    |  13 +-
 fastNLP/api/__init__.py                    |   3 +
 fastNLP/api/api.py                         |  26 ++--
 fastNLP/automl/enas_trainer.py             |  15 +-
 fastNLP/core/dataset.py                    |   2 +-
 fastNLP/core/fieldarray.py                 |  16 +-
 fastNLP/core/instance.py                   |  15 +-
 fastNLP/core/losses.py                     |   4 +-
 fastNLP/core/metrics.py                    | 165 ++++++++++++---------
 fastNLP/core/trainer.py                    | 139 ++++++++---------
 fastNLP/core/utils.py                      |   9 +-
 fastNLP/models/char_language_model.py      |  13 +-
 fastNLP/models/enas_trainer.py             |  15 +-
 23 files changed, 599 insertions(+), 313 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index e978dfe6..6a5c7375 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -3,6 +3,7 @@
 # You can set these variables from the command line.
SPHINXOPTS = +SPHINXAPIDOC = sphinx-apidoc SPHINXBUILD = sphinx-build SPHINXPROJ = fastNLP SOURCEDIR = source @@ -12,6 +13,9 @@ BUILDDIR = build help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +apidoc: + @$(SPHINXAPIDOC) -f -o source ../fastNLP + .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/docs/source/conf.py b/docs/source/conf.py index e449a9f8..96f7f437 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,9 +23,9 @@ copyright = '2018, xpqiu' author = 'xpqiu' # The short X.Y version -version = '0.2' +version = '0.4' # The full version, including alpha/beta/rc tags -release = '0.2' +release = '0.4' # -- General configuration --------------------------------------------------- @@ -67,7 +67,7 @@ language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path . -exclude_patterns = [] +exclude_patterns = ['modules.rst'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' diff --git a/docs/source/fastNLP.api.rst b/docs/source/fastNLP.api.rst index eb9192da..ee2413fb 100644 --- a/docs/source/fastNLP.api.rst +++ b/docs/source/fastNLP.api.rst @@ -1,36 +1,62 @@ -fastNLP.api -============ +fastNLP.api package +=================== -fastNLP.api.api ----------------- +Submodules +---------- + +fastNLP.api.api module +---------------------- .. automodule:: fastNLP.api.api :members: + :undoc-members: + :show-inheritance: -fastNLP.api.converter ----------------------- +fastNLP.api.converter module +---------------------------- .. automodule:: fastNLP.api.converter :members: + :undoc-members: + :show-inheritance: -fastNLP.api.model\_zoo ------------------------ +fastNLP.api.examples module +--------------------------- -.. automodule:: fastNLP.api.model_zoo +.. automodule:: fastNLP.api.examples :members: + :undoc-members: + :show-inheritance: -fastNLP.api.pipeline ---------------------- +fastNLP.api.pipeline module +--------------------------- .. automodule:: fastNLP.api.pipeline :members: + :undoc-members: + :show-inheritance: -fastNLP.api.processor ----------------------- +fastNLP.api.processor module +---------------------------- .. automodule:: fastNLP.api.processor :members: + :undoc-members: + :show-inheritance: + +fastNLP.api.utils module +------------------------ + +.. automodule:: fastNLP.api.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. automodule:: fastNLP.api :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index b9f6c89f..79d26c76 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -1,84 +1,126 @@ -fastNLP.core -============= +fastNLP.core package +==================== -fastNLP.core.batch -------------------- +Submodules +---------- + +fastNLP.core.batch module +------------------------- .. automodule:: fastNLP.core.batch :members: + :undoc-members: + :show-inheritance: + +fastNLP.core.callback module +---------------------------- -fastNLP.core.dataset ---------------------- +.. automodule:: fastNLP.core.callback + :members: + :undoc-members: + :show-inheritance: + +fastNLP.core.dataset module +--------------------------- .. 
automodule:: fastNLP.core.dataset :members: + :undoc-members: + :show-inheritance: -fastNLP.core.fieldarray ------------------------- +fastNLP.core.fieldarray module +------------------------------ .. automodule:: fastNLP.core.fieldarray :members: + :undoc-members: + :show-inheritance: -fastNLP.core.instance ----------------------- +fastNLP.core.instance module +---------------------------- .. automodule:: fastNLP.core.instance :members: + :undoc-members: + :show-inheritance: -fastNLP.core.losses --------------------- +fastNLP.core.losses module +-------------------------- .. automodule:: fastNLP.core.losses :members: + :undoc-members: + :show-inheritance: -fastNLP.core.metrics ---------------------- +fastNLP.core.metrics module +--------------------------- .. automodule:: fastNLP.core.metrics :members: + :undoc-members: + :show-inheritance: -fastNLP.core.optimizer ------------------------ +fastNLP.core.optimizer module +----------------------------- .. automodule:: fastNLP.core.optimizer :members: + :undoc-members: + :show-inheritance: -fastNLP.core.predictor ------------------------ +fastNLP.core.predictor module +----------------------------- .. automodule:: fastNLP.core.predictor :members: + :undoc-members: + :show-inheritance: -fastNLP.core.sampler ---------------------- +fastNLP.core.sampler module +--------------------------- .. automodule:: fastNLP.core.sampler :members: + :undoc-members: + :show-inheritance: -fastNLP.core.tester --------------------- +fastNLP.core.tester module +-------------------------- .. automodule:: fastNLP.core.tester :members: + :undoc-members: + :show-inheritance: -fastNLP.core.trainer ---------------------- +fastNLP.core.trainer module +--------------------------- .. automodule:: fastNLP.core.trainer :members: + :undoc-members: + :show-inheritance: -fastNLP.core.utils -------------------- +fastNLP.core.utils module +------------------------- .. automodule:: fastNLP.core.utils :members: + :undoc-members: + :show-inheritance: -fastNLP.core.vocabulary ------------------------- +fastNLP.core.vocabulary module +------------------------------ .. automodule:: fastNLP.core.vocabulary :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. automodule:: fastNLP.core :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index d91e0d1c..bb30c5e7 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -1,42 +1,54 @@ -fastNLP.io -=========== +fastNLP.io package +================== -fastNLP.io.base\_loader ------------------------- +Submodules +---------- + +fastNLP.io.base\_loader module +------------------------------ .. automodule:: fastNLP.io.base_loader :members: + :undoc-members: + :show-inheritance: -fastNLP.io.config\_io ----------------------- +fastNLP.io.config\_io module +---------------------------- .. automodule:: fastNLP.io.config_io :members: + :undoc-members: + :show-inheritance: -fastNLP.io.dataset\_loader ---------------------------- +fastNLP.io.dataset\_loader module +--------------------------------- .. automodule:: fastNLP.io.dataset_loader :members: + :undoc-members: + :show-inheritance: -fastNLP.io.embed\_loader -------------------------- +fastNLP.io.embed\_loader module +------------------------------- .. automodule:: fastNLP.io.embed_loader :members: + :undoc-members: + :show-inheritance: -fastNLP.io.logger ------------------- - -.. 
automodule:: fastNLP.io.logger - :members: - -fastNLP.io.model\_io ---------------------- +fastNLP.io.model\_io module +--------------------------- .. automodule:: fastNLP.io.model_io :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. automodule:: fastNLP.io :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 7452fdf6..3ebf9608 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -1,42 +1,110 @@ -fastNLP.models -=============== +fastNLP.models package +====================== -fastNLP.models.base\_model ---------------------------- +Submodules +---------- + +fastNLP.models.base\_model module +--------------------------------- .. automodule:: fastNLP.models.base_model :members: + :undoc-members: + :show-inheritance: + +fastNLP.models.bert module +-------------------------- -fastNLP.models.biaffine\_parser --------------------------------- +.. automodule:: fastNLP.models.bert + :members: + :undoc-members: + :show-inheritance: + +fastNLP.models.biaffine\_parser module +-------------------------------------- .. automodule:: fastNLP.models.biaffine_parser :members: + :undoc-members: + :show-inheritance: -fastNLP.models.char\_language\_model -------------------------------------- +fastNLP.models.char\_language\_model module +------------------------------------------- .. automodule:: fastNLP.models.char_language_model :members: + :undoc-members: + :show-inheritance: -fastNLP.models.cnn\_text\_classification ------------------------------------------ +fastNLP.models.cnn\_text\_classification module +----------------------------------------------- .. automodule:: fastNLP.models.cnn_text_classification :members: + :undoc-members: + :show-inheritance: + +fastNLP.models.enas\_controller module +-------------------------------------- + +.. automodule:: fastNLP.models.enas_controller + :members: + :undoc-members: + :show-inheritance: + +fastNLP.models.enas\_model module +--------------------------------- + +.. automodule:: fastNLP.models.enas_model + :members: + :undoc-members: + :show-inheritance: -fastNLP.models.sequence\_modeling ----------------------------------- +fastNLP.models.enas\_trainer module +----------------------------------- + +.. automodule:: fastNLP.models.enas_trainer + :members: + :undoc-members: + :show-inheritance: + +fastNLP.models.enas\_utils module +--------------------------------- + +.. automodule:: fastNLP.models.enas_utils + :members: + :undoc-members: + :show-inheritance: + +fastNLP.models.sequence\_modeling module +---------------------------------------- .. automodule:: fastNLP.models.sequence_modeling :members: + :undoc-members: + :show-inheritance: -fastNLP.models.snli --------------------- +fastNLP.models.snli module +-------------------------- .. automodule:: fastNLP.models.snli :members: + :undoc-members: + :show-inheritance: + +fastNLP.models.star\_transformer module +--------------------------------------- + +.. automodule:: fastNLP.models.star_transformer + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. 
automodule:: fastNLP.models :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.modules.aggregator.rst b/docs/source/fastNLP.modules.aggregator.rst index 073da4a5..63d351e4 100644 --- a/docs/source/fastNLP.modules.aggregator.rst +++ b/docs/source/fastNLP.modules.aggregator.rst @@ -1,36 +1,54 @@ -fastNLP.modules.aggregator -=========================== +fastNLP.modules.aggregator package +================================== -fastNLP.modules.aggregator.attention -------------------------------------- +Submodules +---------- + +fastNLP.modules.aggregator.attention module +------------------------------------------- .. automodule:: fastNLP.modules.aggregator.attention :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.aggregator.avg\_pool -------------------------------------- +fastNLP.modules.aggregator.avg\_pool module +------------------------------------------- .. automodule:: fastNLP.modules.aggregator.avg_pool :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.aggregator.kmax\_pool --------------------------------------- +fastNLP.modules.aggregator.kmax\_pool module +-------------------------------------------- .. automodule:: fastNLP.modules.aggregator.kmax_pool :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.aggregator.max\_pool -------------------------------------- +fastNLP.modules.aggregator.max\_pool module +------------------------------------------- .. automodule:: fastNLP.modules.aggregator.max_pool :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.aggregator.self\_attention -------------------------------------------- +fastNLP.modules.aggregator.self\_attention module +------------------------------------------------- .. automodule:: fastNLP.modules.aggregator.self_attention :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. automodule:: fastNLP.modules.aggregator :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst index 6844543a..25602b2c 100644 --- a/docs/source/fastNLP.modules.decoder.rst +++ b/docs/source/fastNLP.modules.decoder.rst @@ -1,18 +1,30 @@ -fastNLP.modules.decoder -======================== +fastNLP.modules.decoder package +=============================== -fastNLP.modules.decoder.CRF ----------------------------- +Submodules +---------- + +fastNLP.modules.decoder.CRF module +---------------------------------- .. automodule:: fastNLP.modules.decoder.CRF :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.decoder.MLP ----------------------------- +fastNLP.modules.decoder.MLP module +---------------------------------- .. automodule:: fastNLP.modules.decoder.MLP :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. automodule:: fastNLP.modules.decoder :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst index ea8fc699..ab93a169 100644 --- a/docs/source/fastNLP.modules.encoder.rst +++ b/docs/source/fastNLP.modules.encoder.rst @@ -1,60 +1,94 @@ -fastNLP.modules.encoder -======================== +fastNLP.modules.encoder package +=============================== -fastNLP.modules.encoder.char\_embedding ----------------------------------------- +Submodules +---------- + +fastNLP.modules.encoder.char\_embedding module +---------------------------------------------- .. 
automodule:: fastNLP.modules.encoder.char_embedding :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.conv ------------------------------ +fastNLP.modules.encoder.conv module +----------------------------------- .. automodule:: fastNLP.modules.encoder.conv :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.conv\_maxpool --------------------------------------- +fastNLP.modules.encoder.conv\_maxpool module +-------------------------------------------- .. automodule:: fastNLP.modules.encoder.conv_maxpool :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.embedding ----------------------------------- +fastNLP.modules.encoder.embedding module +---------------------------------------- .. automodule:: fastNLP.modules.encoder.embedding :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.linear -------------------------------- +fastNLP.modules.encoder.linear module +------------------------------------- .. automodule:: fastNLP.modules.encoder.linear :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.lstm ------------------------------ +fastNLP.modules.encoder.lstm module +----------------------------------- .. automodule:: fastNLP.modules.encoder.lstm :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.masked\_rnn ------------------------------------- +fastNLP.modules.encoder.masked\_rnn module +------------------------------------------ .. automodule:: fastNLP.modules.encoder.masked_rnn :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.transformer ------------------------------------- +fastNLP.modules.encoder.star\_transformer module +------------------------------------------------ + +.. automodule:: fastNLP.modules.encoder.star_transformer + :members: + :undoc-members: + :show-inheritance: + +fastNLP.modules.encoder.transformer module +------------------------------------------ .. automodule:: fastNLP.modules.encoder.transformer :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.encoder.variational\_rnn ------------------------------------------ +fastNLP.modules.encoder.variational\_rnn module +----------------------------------------------- .. automodule:: fastNLP.modules.encoder.variational_rnn :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. automodule:: fastNLP.modules.encoder :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 965fb27d..57858176 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -1,5 +1,8 @@ -fastNLP.modules -================ +fastNLP.modules package +======================= + +Subpackages +----------- .. toctree:: @@ -7,24 +10,38 @@ fastNLP.modules fastNLP.modules.decoder fastNLP.modules.encoder -fastNLP.modules.dropout ------------------------- +Submodules +---------- + +fastNLP.modules.dropout module +------------------------------ .. automodule:: fastNLP.modules.dropout :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.other\_modules -------------------------------- +fastNLP.modules.other\_modules module +------------------------------------- .. automodule:: fastNLP.modules.other_modules :members: + :undoc-members: + :show-inheritance: -fastNLP.modules.utils ----------------------- +fastNLP.modules.utils module +---------------------------- .. 
automodule:: fastNLP.modules.utils :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- .. automodule:: fastNLP.modules :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index 61882359..6348c9a6 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -1,13 +1,22 @@ -fastNLP -======== +fastNLP package +=============== + +Subpackages +----------- .. toctree:: fastNLP.api + fastNLP.automl fastNLP.core fastNLP.io fastNLP.models fastNLP.modules +Module contents +--------------- + .. automodule:: fastNLP :members: + :undoc-members: + :show-inheritance: diff --git a/fastNLP/api/__init__.py b/fastNLP/api/__init__.py index a21a4c42..ae31b80b 100644 --- a/fastNLP/api/__init__.py +++ b/fastNLP/api/__init__.py @@ -1 +1,4 @@ +""" + 这是 API 部分的注释 +""" from .api import CWS, POS, Parser diff --git a/fastNLP/api/api.py b/fastNLP/api/api.py index 53a80131..b001629c 100644 --- a/fastNLP/api/api.py +++ b/fastNLP/api/api.py @@ -1,3 +1,7 @@ +""" +API.API 的文档 + +""" import warnings import torch @@ -184,17 +188,17 @@ class CWS(API): """ 传入一个分词文件路径,返回该数据集上分词f1, precision, recall。 分词文件应该为: - 1 编者按 编者按 NN O 11 nmod:topic - 2 : : PU O 11 punct - 3 7月 7月 NT DATE 4 compound:nn - 4 12日 12日 NT DATE 11 nmod:tmod - 5 , , PU O 11 punct - - 1 这 这 DT O 3 det - 2 款 款 M O 1 mark:clf - 3 飞行 飞行 NN O 8 nsubj - 4 从 从 P O 5 case - 5 外型 外型 NN O 8 nmod:prep + 1 编者按 编者按 NN O 11 nmod:topic + 2 : : PU O 11 punct + 3 7月 7月 NT DATE 4 compound:nn + 4 12日 12日 NT DATE 11 nmod:tmod + 5 , , PU O 11 punct + + 1 这 这 DT O 3 det + 2 款 款 M O 1 mark:clf + 3 飞行 飞行 NN O 8 nsubj + 4 从 从 P O 5 case + 5 外型 外型 NN O 8 nmod:prep 以空行分割两个句子,有内容的每行有7列。 :param filepath: str, 文件路径路径。 diff --git a/fastNLP/automl/enas_trainer.py b/fastNLP/automl/enas_trainer.py index 7c0da752..061d604c 100644 --- a/fastNLP/automl/enas_trainer.py +++ b/fastNLP/automl/enas_trainer.py @@ -62,13 +62,14 @@ class ENASTrainer(fastNLP.Trainer): """ :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 最好的模型参数。 - :return results: 返回一个字典类型的数据, 内含以下内容:: - - seconds: float, 表示训练时长 - 以下三个内容只有在提供了dev_data的情况下会有。 - best_eval: Dict of Dict, 表示evaluation的结果 - best_epoch: int,在第几个epoch取得的最佳值 - best_step: int, 在第几个step(batch)更新取得的最佳值 + :return results: 返回一个字典类型的数据, + 内含以下内容:: + + seconds: float, 表示训练时长 + 以下三个内容只有在提供了dev_data的情况下会有。 + best_eval: Dict of Dict, 表示evaluation的结果 + best_epoch: int,在第几个epoch取得的最佳值 + best_step: int, 在第几个step(batch)更新取得的最佳值 """ results = {} diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 76a34655..6cbfc20f 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -272,7 +272,7 @@ class DataSet(object): :param func: a function that takes an instance as input. :param str new_field_name: If not None, results of the function will be stored as a new field. - :param **kwargs: Accept parameters will be + :param kwargs: Accept parameters will be (1) is_input: boolean, will be ignored if new_field is None. If True, the new field will be as input. (2) is_target: boolean, will be ignored if new_field is None. If True, the new field will be as target. :return results: if new_field_name is not passed, returned values of the function over all instances. 
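The apply() contract documented in the hunk above can be exercised as in the following minimal sketch; the field names `raw` and `words` are illustrative only, echoing the tests added earlier in this series::

    from fastNLP import DataSet, Instance

    ds = DataSet()
    ds.append(Instance(raw='fastNLP is a toolkit'))
    # with new_field_name, the per-instance results are stored as a new field (here marked as input)
    ds.apply(lambda ins: ins['raw'].split(), new_field_name='words', is_input=True)
    # without new_field_name, apply() simply returns the per-instance results
    lengths = ds.apply(lambda ins: len(ins['words']))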
diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 10fbbebe..caf2a1cf 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -48,12 +48,16 @@ class PadderBase: class AutoPadder(PadderBase): """ 根据contents的数据自动判定是否需要做padding。 - (1) 如果元素类型(元素类型是指field中最里层List的元素的数据类型, 可以通过FieldArray.dtype查看,比如['This', 'is', ...]的元素类 - 型为np.str, [[1,2], ...]的元素类型为np.int64)的数据不为(np.int64, np.float64)则不会进行padding - (2) 如果元素类型为(np.int64, np.float64), - (2.1) 如果该field的内容只有一个,比如为sequence_length, 则不进行padding - (2.2) 如果该field的内容为List, 那么会将Batch中的List pad为一样长。若该List下还有里层的List需要padding,请使用其它padder。 - 如果某个instance中field为[1, 2, 3],则可以pad; 若为[[1,2], [3,4, ...]]则不能进行pad + + 1 如果元素类型(元素类型是指field中最里层List的元素的数据类型, 可以通过FieldArray.dtype查看,比如['This', 'is', ...]的元素类 + 型为np.str, [[1,2], ...]的元素类型为np.int64)的数据不为(np.int64, np.float64)则不会进行padding + + 2 如果元素类型为(np.int64, np.float64), + + 2.1 如果该field的内容只有一个,比如为sequence_length, 则不进行padding + + 2.2 如果该field的内容为List, 那么会将Batch中的List pad为一样长。若该List下还有里层的List需要padding,请使用其它padder。 + 如果某个instance中field为[1, 2, 3],则可以pad; 若为[[1,2], [3,4, ...]]则不能进行pad """ def __init__(self, pad_val=0): """ diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 5ac52e3f..fff992cc 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -1,13 +1,12 @@ class Instance(object): """An Instance is an example of data. - Example:: - ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2]) - ins["field_1"] - >>[1, 1, 1] - ins.add_field("field_3", [3, 3, 3]) - - :param fields: a dict of (str: list). - + Example:: + + ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2]) + ins["field_1"] + >>[1, 1, 1] + ins.add_field("field_3", [3, 3, 3]) + """ def __init__(self, **fields): diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index b52244e5..6b0b4460 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -272,7 +272,7 @@ def squash(predict, truth, **kwargs): :param predict: Tensor, model output :param truth: Tensor, truth from dataset - :param **kwargs: extra arguments + :param kwargs: extra arguments :return predict , truth: predict & truth after processing """ return predict.view(-1, predict.size()[-1]), truth.view(-1, ) @@ -316,7 +316,7 @@ def mask(predict, truth, **kwargs): :param predict: Tensor, [batch_size , max_len , tag_size] :param truth: Tensor, [batch_size , max_len] - :param **kwargs: extra arguments, kwargs["mask"]: ByteTensor, [batch_size , max_len], the mask Tensor. The position that is 1 will be selected. + :param kwargs: extra arguments, kwargs["mask"]: ByteTensor, [batch_size , max_len], the mask Tensor. The position that is 1 will be selected. :return predict , truth: predict & truth after processing """ diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 5687cc85..314be0d9 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -17,66 +17,72 @@ class MetricBase(object): """Base class for all metrics. 
所有的传入到Trainer, Tester的Metric需要继承自该对象。需要覆盖写入evaluate(), get_metric()方法。 + evaluate(xxx)中传入的是一个batch的数据。 + get_metric(xxx)当所有数据处理完毕,调用该方法得到最终的metric值 + 以分类问题中,Accuracy计算为例 - 假设model的forward返回dict中包含'pred'这个key, 并且该key需要用于Accuracy - class Model(nn.Module): - def __init__(xxx): - # do something - def forward(self, xxx): - # do something - return {'pred': pred, 'other_keys':xxx} # pred's shape: batch_size x num_classes + 假设model的forward返回dict中包含'pred'这个key, 并且该key需要用于Accuracy:: + + class Model(nn.Module): + def __init__(xxx): + # do something + def forward(self, xxx): + # do something + return {'pred': pred, 'other_keys':xxx} # pred's shape: batch_size x num_classes + 假设dataset中'label'这个field是需要预测的值,并且该field被设置为了target - 对应的AccMetric可以按如下的定义 - # version1, 只使用这一次 - class AccMetric(MetricBase): - def __init__(self): - super().__init__() - - # 根据你的情况自定义指标 - self.corr_num = 0 - self.total = 0 - - def evaluate(self, label, pred): # 这里的名称需要和dataset中target field与model返回的key是一样的,不然找不到对应的value - # dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric - self.total += label.size(0) - self.corr_num += label.eq(pred).sum().item() - - def get_metric(self, reset=True): # 在这里定义如何计算metric - acc = self.corr_num/self.total - if reset: # 是否清零以便重新计算 + 对应的AccMetric可以按如下的定义, version1, 只使用这一次:: + + class AccMetric(MetricBase): + def __init__(self): + super().__init__() + + # 根据你的情况自定义指标 self.corr_num = 0 self.total = 0 - return {'acc': acc} # 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中 - - - # version2,如果需要复用Metric,比如下一次使用AccMetric时,dataset中目标field不叫label而叫y,或者model的输出不是pred - class AccMetric(MetricBase): - def __init__(self, label=None, pred=None): - # 假设在另一场景使用时,目标field叫y,model给出的key为pred_y。则只需要在初始化AccMetric时, - # acc_metric = AccMetric(label='y', pred='pred_y')即可。 - # 当初始化为acc_metric = AccMetric(),即label=None, pred=None, fastNLP会直接使用'label', 'pred'作为key去索取对 - # 应的的值 - super().__init__() - self._init_param_map(label=label, pred=pred) # 该方法会注册label和pred. 仅需要注册evaluate()方法会用到的参数名即可 - # 如果没有注册该则效果与version1就是一样的 - - # 根据你的情况自定义指标 - self.corr_num = 0 - self.total = 0 - - def evaluate(self, label, pred): # 这里的参数名称需要和self._init_param_map()注册时一致。 - # dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric - self.total += label.size(0) - self.corr_num += label.eq(pred).sum().item() - - def get_metric(self, reset=True): # 在这里定义如何计算metric - acc = self.corr_num/self.total - if reset: # 是否清零以便重新计算 + + def evaluate(self, label, pred): # 这里的名称需要和dataset中target field与model返回的key是一样的,不然找不到对应的value + # dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric + self.total += label.size(0) + self.corr_num += label.eq(pred).sum().item() + + def get_metric(self, reset=True): # 在这里定义如何计算metric + acc = self.corr_num/self.total + if reset: # 是否清零以便重新计算 + self.corr_num = 0 + self.total = 0 + return {'acc': acc} # 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中 + + + version2,如果需要复用Metric,比如下一次使用AccMetric时,dataset中目标field不叫label而叫y,或者model的输出不是pred:: + + class AccMetric(MetricBase): + def __init__(self, label=None, pred=None): + # 假设在另一场景使用时,目标field叫y,model给出的key为pred_y。则只需要在初始化AccMetric时, + # acc_metric = AccMetric(label='y', pred='pred_y')即可。 + # 当初始化为acc_metric = AccMetric(),即label=None, pred=None, fastNLP会直接使用'label', 'pred'作为key去索取对 + # 应的的值 + super().__init__() + self._init_param_map(label=label, pred=pred) # 该方法会注册label和pred. 
仅需要注册evaluate()方法会用到的参数名即可 + # 如果没有注册该则效果与version1就是一样的 + + # 根据你的情况自定义指标 self.corr_num = 0 self.total = 0 - return {'acc': acc} # 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中 + + def evaluate(self, label, pred): # 这里的参数名称需要和self._init_param_map()注册时一致。 + # dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric + self.total += label.size(0) + self.corr_num += label.eq(pred).sum().item() + + def get_metric(self, reset=True): # 在这里定义如何计算metric + acc = self.corr_num/self.total + if reset: # 是否清零以便重新计算 + self.corr_num = 0 + self.total = 0 + return {'acc': acc} # 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中 ``MetricBase`` handles validity check of its input dictionaries - ``pred_dict`` and ``target_dict``. @@ -84,12 +90,12 @@ class MetricBase(object): ``target_dict`` is the ground truth from DataSet where ``is_target`` is set ``True``. ``MetricBase`` will do the following type checks: - 1. whether self.evaluate has varargs, which is not supported. - 2. whether params needed by self.evaluate is not included in ``pred_dict``, ``target_dict``. - 3. whether params needed by self.evaluate duplicate in ``pred_dict``, ``target_dict``. + 1. whether self.evaluate has varargs, which is not supported. + 2. whether params needed by self.evaluate is not included in ``pred_dict``, ``target_dict``. + 3. whether params needed by self.evaluate duplicate in ``pred_dict``, ``target_dict``. Besides, before passing params into self.evaluate, this function will filter out params from output_dict and - target_dict which are not used in self.evaluate. (but if **kwargs presented in self.evaluate, no filtering + target_dict which are not used in self.evaluate. (but if kwargs presented in self.evaluate, no filtering will be conducted.) """ @@ -388,23 +394,26 @@ class SpanFPreRecMetric(MetricBase): """ 在序列标注问题中,以span的方式计算F, pre, rec. 比如中文Part of speech中,会以character的方式进行标注,句子'中国在亚洲'对应的POS可能为(以BMES为例) - ['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 - 最后得到的metric结果为 - { - 'f': xxx, # 这里使用f考虑以后可以计算f_beta值 - 'pre': xxx, - 'rec':xxx - } - 若only_gross=False, 即还会返回各个label的metric统计值 + ['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 + 最后得到的metric结果为:: + { - 'f': xxx, - 'pre': xxx, - 'rec':xxx, - 'f-label': xxx, - 'pre-label': xxx, - 'rec-label':xxx, - ... - } + 'f': xxx, # 这里使用f考虑以后可以计算f_beta值 + 'pre': xxx, + 'rec':xxx + } + + 若only_gross=False, 即还会返回各个label的metric统计值:: + + { + 'f': xxx, + 'pre': xxx, + 'rec':xxx, + 'f-label': xxx, + 'pre-label': xxx, + 'rec-label':xxx, + ... 
+ } """ def __init__(self, tag_vocab, pred=None, target=None, seq_lens=None, encoding_type='bio', ignore_labels=None, @@ -573,13 +582,21 @@ class BMESF1PreRecMetric(MetricBase): """ 按照BMES标注方式计算f1, precision, recall。由于可能存在非法tag,比如"BS",所以需要用以下的表格做转换,cur_B意思是当前tag是B, next_B意思是后一个tag是B。则cur_B=S,即将当前被predict是B的tag标为S;next_M=B, 即将后一个被predict是M的tag标为B + + +-------+---------+----------+----------+---------+---------+ | | next_B | next_M | next_E | next_S | end | - |:-----:|:-------:|:--------:|:--------:|:-------:|:-------:| - | start | 合法 | next_M=B | next_E=S | 合法 | - | + +=======+=========+==========+==========+=========+=========+ + | start | 合法 | next_M=B | next_E=S | 合法 | -- | + +-------+---------+----------+----------+---------+---------+ | cur_B | cur_B=S | 合法 | 合法 | cur_B=S | cur_B=S | + +-------+---------+----------+----------+---------+---------+ | cur_M | cur_M=E | 合法 | 合法 | cur_M=E | cur_M=E | + +-------+---------+----------+----------+---------+---------+ | cur_E | 合法 | next_M=B | next_E=S | 合法 | 合法 | + +-------+---------+----------+----------+---------+---------+ | cur_S | 合法 | next_M=B | next_E=S | 合法 | 合法 | + +-------+---------+----------+----------+---------+---------+ + 举例: prediction为BSEMS,会被认为是SSSSS. diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index b45dd148..250cfdb0 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -66,28 +66,28 @@ class Trainer(object): 不足,通过设置batch_size=32, update_every=4达到目的 """ super(Trainer, self).__init__() - + if not isinstance(train_data, DataSet): raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.") if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") - + # check metrics and dev_data if (not metrics) and dev_data is not None: raise ValueError("No metric for dev_data evaluation.") if metrics and (dev_data is None): raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ") - + # check update every - assert update_every>=1, "update_every must be no less than 1." + assert update_every >= 1, "update_every must be no less than 1." self.update_every = int(update_every) - + # check save_path if not (save_path is None or isinstance(save_path, str)): raise ValueError("save_path can only be None or `str`.") # prepare evaluate metrics = _prepare_metrics(metrics) - + # parse metric_key # increase_better is True. It means the exp result gets better if the indicator increases. # It is true by default. @@ -97,19 +97,19 @@ class Trainer(object): self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key elif len(metrics) > 0: self.metric_key = metrics[0].__class__.__name__.lower().strip('metric') - + # prepare loss losser = _prepare_losser(loss) - + # sampler check if sampler is not None and not isinstance(sampler, BaseSampler): raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) - + if check_code_level > -1: _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, metric_key=metric_key, check_level=check_code_level, batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) - + self.train_data = train_data self.dev_data = dev_data # If None, No validation. 
self.model = model @@ -120,7 +120,7 @@ class Trainer(object): self.use_cuda = bool(use_cuda) self.save_path = save_path self.print_every = int(print_every) - self.validate_every = int(validate_every) if validate_every!=0 else -1 + self.validate_every = int(validate_every) if validate_every != 0 else -1 self.best_metric_indicator = None self.best_dev_epoch = None self.best_dev_step = None @@ -129,19 +129,19 @@ class Trainer(object): self.prefetch = prefetch self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) self.n_steps = (len(self.train_data) // self.batch_size + int( - len(self.train_data) % self.batch_size != 0)) * self.n_epochs - + len(self.train_data) % self.batch_size != 0)) * self.n_epochs + if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer else: if optimizer is None: optimizer = Adam(lr=0.01, weight_decay=0) self.optimizer = optimizer.construct_from_pytorch(self.model.parameters()) - + self.use_tqdm = use_tqdm self.pbar = None self.print_every = abs(self.print_every) - + if self.dev_data is not None: self.tester = Tester(model=self.model, data=self.dev_data, @@ -149,14 +149,13 @@ class Trainer(object): batch_size=self.batch_size, use_cuda=self.use_cuda, verbose=0) - + self.step = 0 self.start_time = None # start timestamp - + self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) - - + def train(self, load_best_model=True): """ @@ -185,14 +184,15 @@ class Trainer(object): 根据metrics进行evaluation,并根据是否提供了save_path判断是否存储模型 :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 - 最好的模型参数。 - :return results: 返回一个字典类型的数据, 内含以下内容:: + 最好的模型参数。 + :return results: 返回一个字典类型的数据, + 内含以下内容:: - seconds: float, 表示训练时长 - 以下三个内容只有在提供了dev_data的情况下会有。 - best_eval: Dict of Dict, 表示evaluation的结果 - best_epoch: int,在第几个epoch取得的最佳值 - best_step: int, 在第几个step(batch)更新取得的最佳值 + seconds: float, 表示训练时长 + 以下三个内容只有在提供了dev_data的情况下会有。 + best_eval: Dict of Dict, 表示evaluation的结果 + best_epoch: int,在第几个epoch取得的最佳值 + best_step: int, 在第几个step(batch)更新取得的最佳值 """ results = {} @@ -205,21 +205,22 @@ class Trainer(object): self.model = self.model.cuda() self._model_device = self.model.parameters().__next__().device self._mode(self.model, is_test=False) - + self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() print("training epochs started " + self.start_time, flush=True) - + try: self.callback_manager.on_train_begin() self._train() self.callback_manager.on_train_end() except (CallbackException, KeyboardInterrupt) as e: self.callback_manager.on_exception(e) - + if self.dev_data is not None and hasattr(self, 'best_dev_perf'): - print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf),) + print( + "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + + self.tester._format_eval_results(self.best_dev_perf), ) results['best_eval'] = self.best_dev_perf results['best_epoch'] = self.best_dev_epoch results['best_step'] = self.best_dev_step @@ -233,9 +234,9 @@ class Trainer(object): finally: pass results['seconds'] = round(time.time() - start_time, 2) - + return results - + def _train(self): if not self.use_tqdm: from fastNLP.core.utils import pseudo_tqdm as inner_tqdm @@ -244,13 +245,13 @@ class Trainer(object): self.step = 0 self.epoch = 0 start = time.time() - + with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', 
leave=False, dynamic_ncols=True) as pbar: self.pbar = pbar if isinstance(pbar, tqdm) else None avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, prefetch=self.prefetch) - for epoch in range(1, self.n_epochs+1): + for epoch in range(1, self.n_epochs + 1): self.epoch = epoch pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) # early stopping @@ -262,22 +263,22 @@ class Trainer(object): # negative sampling; replace unknown; re-weight batch_y self.callback_manager.on_batch_begin(batch_x, batch_y, indices) prediction = self._data_forward(self.model, batch_x) - + # edit prediction self.callback_manager.on_loss_begin(batch_y, prediction) loss = self._compute_loss(prediction, batch_y).mean() avg_loss += loss.item() - loss = loss/self.update_every - + loss = loss / self.update_every + # Is loss NaN or inf? requires_grad = False self.callback_manager.on_backward_begin(loss) self._grad_backward(loss) self.callback_manager.on_backward_end() - + self._update() self.callback_manager.on_step_end() - - if (self.step+1) % self.print_every == 0: + + if (self.step + 1) % self.print_every == 0: avg_loss = avg_loss / self.print_every if self.use_tqdm: print_output = "loss:{0:<6.5f}".format(avg_loss) @@ -290,34 +291,34 @@ class Trainer(object): pbar.set_postfix_str(print_output) avg_loss = 0 self.callback_manager.on_batch_end() - + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: eval_res = self._do_validation(epoch=epoch, step=self.step) eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, self.n_steps) + \ - self.tester._format_eval_results(eval_res) + self.tester._format_eval_results(eval_res) pbar.write(eval_str + '\n') - + # ================= mini-batch end ==================== # - + # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # pbar.close() self.pbar = None # ============ tqdm end ============== # - + def _do_validation(self, epoch, step): self.callback_manager.on_valid_begin() res = self.tester.test() - + is_better_eval = False if self._better_eval_result(res): if self.save_path is not None: self._save_model(self.model, - "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) + "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) else: self._best_model_states = {name: param.cpu().clone() for name, param in self.model.named_parameters()} self.best_dev_perf = res @@ -327,7 +328,7 @@ class Trainer(object): # get validation results; adjust optimizer self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer, is_better_eval) return res - + def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. @@ -339,21 +340,21 @@ class Trainer(object): model.eval() else: model.train() - + def _update(self): """Perform weight update on a model. """ - if (self.step+1)%self.update_every==0: + if (self.step + 1) % self.update_every == 0: self.optimizer.step() - + def _data_forward(self, network, x): x = _build_args(network.forward, **x) y = network(**x) if not isinstance(y, dict): raise TypeError(f"The return value of {get_func_signature(network.forward)} should be dict, got {type(y)}.") return y - + def _grad_backward(self, loss): """Compute gradient with link rules. 
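Read together, ``_update`` above and ``_grad_backward`` below implement gradient accumulation: gradients are zeroed at the start of each ``update_every`` window, the per-batch losses are scaled and accumulated, and the optimizer steps once per window. A minimal sketch of that contract (``batches``, ``criterion``, ``model`` and ``optimizer`` are illustrative stand-ins, not fastNLP names)::

    update_every = 4                           # e.g. batch_size=32, update_every=4
    for step, (batch_x, batch_y) in enumerate(batches):
        if step % update_every == 0:           # as in _grad_backward: zero at window start
            model.zero_grad()
        loss = criterion(model(batch_x), batch_y) / update_every  # scaled as in _train
        loss.backward()                        # gradients accumulate across the window
        if (step + 1) % update_every == 0:     # as in _update: one step per window
            optimizer.step()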
@@ -361,10 +362,10 @@ class Trainer(object): For PyTorch, just do "loss.backward()" """ - if self.step%self.update_every==0: + if self.step % self.update_every == 0: self.model.zero_grad() loss.backward() - + def _compute_loss(self, predict, truth): """Compute loss given prediction and ground truth. @@ -373,7 +374,7 @@ class Trainer(object): :return: a scalar """ return self.losser(predict, truth) - + def _save_model(self, model, model_name, only_param=False): """ 存储不含有显卡信息的state_dict或model :param model: @@ -394,7 +395,7 @@ class Trainer(object): model.cpu() torch.save(model, model_path) model.to(self._model_device) - + def _load_model(self, model, model_name, only_param=False): # 返回bool值指示是否成功reload模型 if self.save_path is not None: @@ -409,7 +410,7 @@ class Trainer(object): else: return False return True - + def _better_eval_result(self, metrics): """Check if the current epoch yields better validation results. @@ -437,6 +438,7 @@ class Trainer(object): DEFAULT_CHECK_BATCH_SIZE = 2 DEFAULT_CHECK_NUM_BATCH = 2 + def _get_value_info(_dict): # given a dict value, return information about this dict's value. Return list of str strs = [] @@ -453,27 +455,28 @@ def _get_value_info(_dict): strs.append(_str) return strs + def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, metric_key=None, check_level=0): # check get_loss 方法 model_devcie = model.parameters().__next__().device - + batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_count, (batch_x, batch_y) in enumerate(batch): _move_dict_value_to_device(batch_x, batch_y, device=model_devcie) # forward check - if batch_count==0: + if batch_count == 0: info_str = "" input_fields = _get_value_info(batch_x) target_fields = _get_value_info(batch_y) - if len(input_fields)>0: + if len(input_fields) > 0: info_str += "input fields after batch(if batch size is {}):\n".format(batch_size) info_str += "\n".join(input_fields) info_str += '\n' else: raise RuntimeError("There is no input field.") - if len(target_fields)>0: + if len(target_fields) > 0: info_str += "target fields after batch(if batch size is {}):\n".format(batch_size) info_str += "\n".join(target_fields) info_str += '\n' @@ -481,14 +484,14 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ info_str += 'There is no target field.' 
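        # The report assembled above is printed once before training so that
        # field/shape problems surface early. A minimal illustration of the
        # same per-field summary (hypothetical field name; the exact format
        # string is an assumption, not fastNLP's verbatim output):
        _example_batch_x = {'words': torch.zeros(2, 20, dtype=torch.long)}
        for _name, _value in _example_batch_x.items():
            print("\t{}: type:{}, dtype:{}, shape:{}".format(
                _name, type(_value).__name__, _value.dtype, tuple(_value.shape)))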
print(info_str) _check_forward_error(forward_func=model.forward, dataset=dataset, - batch_x=batch_x, check_level=check_level) - + batch_x=batch_x, check_level=check_level) + refined_batch_x = _build_args(model.forward, **batch_x) pred_dict = model(**refined_batch_x) func_signature = get_func_signature(model.forward) if not isinstance(pred_dict, dict): raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.") - + # loss check try: loss = losser(pred_dict, batch_y) @@ -512,7 +515,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ model.zero_grad() if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH: break - + if dev_data is not None: tester = Tester(data=dev_data[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, batch_size=batch_size, verbose=-1) @@ -526,7 +529,7 @@ def _check_eval_results(metrics, metric_key, metric_list): # metric_list: 多个用来做评价的指标,来自Trainer的初始化 if isinstance(metrics, tuple): loss, metrics = metrics - + if isinstance(metrics, dict): if len(metrics) == 1: # only single metric, just use it @@ -537,7 +540,7 @@ def _check_eval_results(metrics, metric_key, metric_list): if metrics_name not in metrics: raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}") metric_dict = metrics[metrics_name] - + if len(metric_dict) == 1: indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0] elif len(metric_dict) > 1 and metric_key is None: diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index d9141412..fc15166e 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -197,17 +197,22 @@ def get_func_signature(func): Given a function or method, return its signature. For example: - (1) function + + 1 function:: + def func(a, b='a', *args): xxxx get_func_signature(func) # 'func(a, b='a', *args)' - (2) method + + 2 method:: + class Demo: def __init__(self): xxx def forward(self, a, b='a', **args) demo = Demo() get_func_signature(demo.forward) # 'Demo.forward(self, a, b='a', **args)' + :param func: a function or a method :return: str or None """ diff --git a/fastNLP/models/char_language_model.py b/fastNLP/models/char_language_model.py index 5fbde3cc..d5e3359d 100644 --- a/fastNLP/models/char_language_model.py +++ b/fastNLP/models/char_language_model.py @@ -20,16 +20,23 @@ class Highway(nn.Module): class CharLM(nn.Module): """CNN + highway network + LSTM - # Input: + + # Input:: + 4D tensor with shape [batch_size, in_channel, height, width] - # Output: + + # Output:: + 2D Tensor with shape [batch_size, vocab_size] - # Arguments: + + # Arguments:: + char_emb_dim: the size of each character's attention word_emb_dim: the size of each word's attention vocab_size: num of unique words num_char: num of characters use_gpu: True or False + """ def __init__(self, char_emb_dim, word_emb_dim, diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py index 6b51c897..26b7cd49 100644 --- a/fastNLP/models/enas_trainer.py +++ b/fastNLP/models/enas_trainer.py @@ -65,13 +65,14 @@ class ENASTrainer(fastNLP.Trainer): """ :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 最好的模型参数。 - :return results: 返回一个字典类型的数据, 内含以下内容:: - - seconds: float, 表示训练时长 - 以下三个内容只有在提供了dev_data的情况下会有。 - best_eval: Dict of Dict, 表示evaluation的结果 - best_epoch: int,在第几个epoch取得的最佳值 - best_step: int, 在第几个step(batch)更新取得的最佳值 + :return results: 返回一个字典类型的数据, + 内含以下内容:: + + seconds: float, 表示训练时长 + 以下三个内容只有在提供了dev_data的情况下会有。 + 
best_eval: Dict of Dict, 表示evaluation的结果 + best_epoch: int,在第几个epoch取得的最佳值 + best_step: int, 在第几个step(batch)更新取得的最佳值 """ results = {} From 13d8978953026bcb6fb4046c7f6e0ce500458efb Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 22 Apr 2019 01:49:44 +0800 Subject: [PATCH 12/13] fix some doc errors --- fastNLP/io/config_io.py | 8 +++--- fastNLP/io/dataset_loader.py | 47 +++++++++++++++++++++--------------- fastNLP/io/embed_loader.py | 2 +- fastNLP/io/model_io.py | 12 +++++---- 4 files changed, 39 insertions(+), 30 deletions(-) diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index 5a64b96c..c0ffe53e 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -26,10 +26,10 @@ class ConfigLoader(BaseLoader): :param str file_path: the path of config file :param dict sections: the dict of ``{section_name(string): ConfigSection object}`` - Example:: - - test_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) + Example:: + + test_args = ConfigSection() + ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) """ assert isinstance(sections, dict) diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index e33384a8..87127cf8 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -9,7 +9,7 @@ from fastNLP.io.base_loader import DataLoaderRegister def convert_seq_dataset(data): """Create an DataSet instance that contains no labels. - :param data: list of list of strings, [num_examples, *]. + :param data: list of list of strings, [num_examples, \*]. Example:: [ @@ -28,7 +28,7 @@ def convert_seq_dataset(data): def convert_seq2tag_dataset(data): """Convert list of data into DataSet. - :param data: list of list of strings, [num_examples, *]. + :param data: list of list of strings, [num_examples, \*]. Example:: [ @@ -48,7 +48,7 @@ def convert_seq2tag_dataset(data): def convert_seq2seq_dataset(data): """Convert list of data into DataSet. - :param data: list of list of strings, [num_examples, *]. + :param data: list of list of strings, [num_examples, \*]. Example:: [ @@ -177,18 +177,18 @@ DataLoaderRegister.set_reader(RawDataSetLoader, 'read_rawdata') class DummyPOSReader(DataSetLoader): """A simple reader for a dummy POS tagging dataset. - In these datasets, each line are divided by "\t". The first Col is the vocabulary and the second + In these datasets, each line are divided by "\\\\t". The first Col is the vocabulary and the second Col is the label. Different sentence are divided by an empty line. - E.g:: + E.g:: - Tom label1 - and label2 - Jerry label1 - . label3 - (separated by an empty line) - Hello label4 - world label5 - ! label3 + Tom label1 + and label2 + Jerry label1 + . label3 + (separated by an empty line) + Hello label4 + world label5 + ! label3 In this example, there are two sentences "Tom and Jerry ." and "Hello world !". Each word has its own label. """ @@ -200,11 +200,13 @@ class DummyPOSReader(DataSetLoader): """ :return data: three-level list Example:: + [ [ [word_11, word_12, ...], [label_1, label_1, ...] ], [ [word_21, word_22, ...], [label_2, label_1, ...] ], ... ] + """ with open(data_path, "r", encoding="utf-8") as f: lines = f.readlines() @@ -550,6 +552,7 @@ class SNLIDataSetReader(DataSetLoader): :param data: A 3D tensor. 
Example:: + [ [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ], [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ], @@ -647,7 +650,7 @@ class NaiveCWSReader(DataSetLoader): 例如:: 这是 fastNLP , 一个 非常 good 的 包 . - + 或者,即每个part后面还有一个pos tag 例如:: @@ -661,12 +664,15 @@ class NaiveCWSReader(DataSetLoader): def load(self, filepath, in_word_splitter=None, cut_long_sent=False): """ - 允许使用的情况有(默认以\t或空格作为seg) + 允许使用的情况有(默认以\\\\t或空格作为seg):: + 这是 fastNLP , 一个 非常 good 的 包 . - 和 + + 和:: + 也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY + 如果splitter不为None则认为是第二种情况, 且我们会按splitter分割"也/D", 然后取第一部分. 例如"也/D".split('/')[0] - :param filepath: :param in_word_splitter: :param cut_long_sent: @@ -737,11 +743,12 @@ class ZhConllPOSReader(object): def load(self, path): """ - 返回的DataSet, 包含以下的field + 返回的DataSet, 包含以下的field:: + words:list of str, tag: list of str, 被加入了BMES tag, 比如原来的序列为['VP', 'NN', 'NN', ..],会被认为是["S-VP", "B-NN", "M-NN",..] - 假定了输入为conll的格式,以空行隔开两个句子,每行共7列,即 - :: + + 假定了输入为conll的格式,以空行隔开两个句子,每行共7列,即:: 1 编者按 编者按 NN O 11 nmod:topic 2 : : PU O 11 punct diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 5ad27c53..16ea0339 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -132,7 +132,7 @@ class EmbedLoader(BaseLoader): def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): """ load pretraining embedding in {embed_file} based on words in vocab. Words in vocab but not in the pretraining - embedding are initialized from a normal distribution which has the mean and std of the found words vectors. + embedding are initialized from a normal distribution which has the mean and std of the found words vectors. The embedding type is determined automatically, support glove and word2vec(the first line only has two elements). :param embed_filepath: str, where to read pretrain embedding diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index 422eb919..53bdc7ce 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -31,16 +31,18 @@ class ModelLoader(BaseLoader): class ModelSaver(object): """Save a model + Example:: - :param str save_path: the path to the saving directory. - Example:: - - saver = ModelSaver("./save/model_ckpt_100.pkl") - saver.save_pytorch(model) + saver = ModelSaver("./save/model_ckpt_100.pkl") + saver.save_pytorch(model) """ def __init__(self, save_path): + """ + + :param save_path: the path to the saving directory. 
+ """ self.save_path = save_path def save_pytorch(self, model, param_only=True): From 15cdee827a3a6788e4b16127b000c5cd60c72047 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 22 Apr 2019 11:34:45 +0800 Subject: [PATCH 13/13] =?UTF-8?q?=E6=A0=B7=E4=BE=8B=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/Makefile | 3 + fastNLP/api/__init__.py | 3 - fastNLP/api/api.py | 83 ++++++++++++++++++------- fastNLP/modules/aggregator/attention.py | 6 +- 4 files changed, 70 insertions(+), 25 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 6a5c7375..6f2f2821 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,6 +16,9 @@ help: apidoc: @$(SPHINXAPIDOC) -f -o source ../fastNLP +server: + cd build/html && python -m http.server + .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/fastNLP/api/__init__.py b/fastNLP/api/__init__.py index ae31b80b..a21a4c42 100644 --- a/fastNLP/api/__init__.py +++ b/fastNLP/api/__init__.py @@ -1,4 +1 @@ -""" - 这是 API 部分的注释 -""" from .api import CWS, POS, Parser diff --git a/fastNLP/api/api.py b/fastNLP/api/api.py index b001629c..f088b121 100644 --- a/fastNLP/api/api.py +++ b/fastNLP/api/api.py @@ -1,5 +1,39 @@ """ -API.API 的文档 +api.api的介绍文档 + 直接缩进会把上面的文字变成标题 + +空行缩进的写法比较合理 + + 比较合理 + +*这里是斜体内容* + +**这里是粗体内容** + +数学公式块 + +.. math:: + E = mc^2 + +.. note:: + 注解型提示。 + +.. warning:: + 警告型提示。 + +.. seealso:: + `参考与超链接 `_ + +普通代码块需要空一行, Example:: + + from fitlog import fitlog + fitlog.commit() + +普通下标和上标: + +H\ :sub:`2`\ O + +E = mc\ :sup:`2` """ import warnings @@ -28,6 +62,9 @@ model_urls = { class API: + """ + 这是 API 类的文档 + """ def __init__(self): self.pipeline = None self._dict = None @@ -73,8 +110,9 @@ class POS(API): self.load(model_path, device) def predict(self, content): - """ - + """predict函数的介绍, + 函数介绍的第二句,这句话不会换行 + :param content: list of list of str. Each string is a token(word). :return answer: list of list of str. Each string is a tag. 
""" @@ -140,13 +178,14 @@ class POS(API): class CWS(API): - def __init__(self, model_path=None, device='cpu'): - """ - 中文分词高级接口。 + """ + 中文分词高级接口。 - :param model_path: 当model_path为None,使用默认位置的model。如果默认位置不存在,则自动下载模型 - :param device: str,可以为'cpu', 'cuda'或'cuda:0'等。会将模型load到相应device进行推断。 - """ + :param model_path: 当model_path为None,使用默认位置的model。如果默认位置不存在,则自动下载模型 + :param device: str,可以为'cpu', 'cuda'或'cuda:0'等。会将模型load到相应device进行推断。 + """ + def __init__(self, model_path=None, device='cpu'): + super(CWS, self).__init__() if model_path is None: model_path = model_urls['cws'] @@ -187,18 +226,20 @@ class CWS(API): def test(self, filepath): """ 传入一个分词文件路径,返回该数据集上分词f1, precision, recall。 - 分词文件应该为: - 1 编者按 编者按 NN O 11 nmod:topic - 2 : : PU O 11 punct - 3 7月 7月 NT DATE 4 compound:nn - 4 12日 12日 NT DATE 11 nmod:tmod - 5 , , PU O 11 punct - - 1 这 这 DT O 3 det - 2 款 款 M O 1 mark:clf - 3 飞行 飞行 NN O 8 nsubj - 4 从 从 P O 5 case - 5 外型 外型 NN O 8 nmod:prep + 分词文件应该为:: + + 1 编者按 编者按 NN O 11 nmod:topic + 2 : : PU O 11 punct + 3 7月 7月 NT DATE 4 compound:nn + 4 12日 12日 NT DATE 11 nmod:tmod + 5 , , PU O 11 punct + + 1 这 这 DT O 3 det + 2 款 款 M O 1 mark:clf + 3 飞行 飞行 NN O 8 nsubj + 4 从 从 P O 5 case + 5 外型 外型 NN O 8 nmod:prep + 以空行分割两个句子,有内容的每行有7列。 :param filepath: str, 文件路径路径。 diff --git a/fastNLP/modules/aggregator/attention.py b/fastNLP/modules/aggregator/attention.py index 33d73a07..4155fdd6 100644 --- a/fastNLP/modules/aggregator/attention.py +++ b/fastNLP/modules/aggregator/attention.py @@ -112,12 +112,15 @@ class MultiHeadAttention(nn.Module): class BiAttention(nn.Module): """Bi Attention module Calculate Bi Attention matrix `e` + .. math:: + \begin{array}{ll} \\ e_ij = {a}^{\mathbf{T}}_{i}{b}_{j} \\ a_i = b_j = \end{array} + """ def __init__(self): @@ -131,7 +134,8 @@ class BiAttention(nn.Module): :param torch.Tensor x1_len: [batch_size, x1_seq_len] 第一句的0/1mask矩阵 :param torch.Tensor x2_len: [batch_size, x2_seq_len] 第二句的0/1mask矩阵 :return: torch.Tensor out_x1: [batch_size, x1_seq_len, hidden_size] 第一句attend到的特征表示 - torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] 第一句attend到的特征表示 + torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] 第一句attend到的特征表示 + """ assert in_x1.size()[0] == in_x2.size()[0]