From 9c078500198e550d72a8b13eb8206aed82a18803 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 16 May 2019 20:32:10 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=20models=20?= =?UTF-8?q?=E9=83=A8=E5=88=86=20import=20=E7=9A=84=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=EF=BC=8C=5F=5Fall=5F=5F=20=E6=9A=B4=E9=9C=B2=E7=9A=84=E5=86=85?= =?UTF-8?q?=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/fastNLP.models.base_model.rst | 7 - docs/source/fastNLP.models.bert.rst | 7 - .../source/fastNLP.models.enas_controller.rst | 7 - docs/source/fastNLP.models.enas_model.rst | 7 - docs/source/fastNLP.models.enas_trainer.rst | 7 - docs/source/fastNLP.models.enas_utils.rst | 7 - docs/source/fastNLP.models.rst | 6 - fastNLP/core/batch.py | 4 +- fastNLP/core/callback.py | 1 + fastNLP/core/dataset.py | 3 +- fastNLP/core/field.py | 4 +- fastNLP/core/losses.py | 4 +- fastNLP/core/metrics.py | 4 +- fastNLP/core/predictor.py | 4 +- fastNLP/core/sampler.py | 4 +- fastNLP/core/tester.py | 1 + fastNLP/core/trainer.py | 6 +- fastNLP/core/utils.py | 9 +- fastNLP/io/__init__.py | 9 +- fastNLP/io/base_loader.py | 19 ++- fastNLP/io/config_io.py | 64 ++++--- fastNLP/io/dataset_loader.py | 11 +- fastNLP/io/embed_loader.py | 56 +++--- fastNLP/io/model_io.py | 15 +- fastNLP/models/__init__.py | 20 ++- fastNLP/models/base_model.py | 10 +- fastNLP/models/biaffine_parser.py | 159 ++++++++++-------- fastNLP/models/cnn_text_classification.py | 17 +- fastNLP/models/enas_controller.py | 1 + fastNLP/models/enas_model.py | 139 +++++++-------- fastNLP/models/enas_trainer.py | 141 ++++++++-------- fastNLP/models/enas_utils.py | 2 - fastNLP/models/sequence_labeling.py | 10 +- fastNLP/models/snli.py | 61 +++---- fastNLP/models/star_transformer.py | 67 +++++--- 35 files changed, 465 insertions(+), 428 deletions(-) delete mode 100644 docs/source/fastNLP.models.base_model.rst delete mode 100644 docs/source/fastNLP.models.bert.rst delete mode 100644 docs/source/fastNLP.models.enas_controller.rst delete mode 100644 docs/source/fastNLP.models.enas_model.rst delete mode 100644 docs/source/fastNLP.models.enas_trainer.rst delete mode 100644 docs/source/fastNLP.models.enas_utils.rst diff --git a/docs/source/fastNLP.models.base_model.rst b/docs/source/fastNLP.models.base_model.rst deleted file mode 100644 index e1d4d64f..00000000 --- a/docs/source/fastNLP.models.base_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.base\_model -========================== - -.. automodule:: fastNLP.models.base_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.bert.rst b/docs/source/fastNLP.models.bert.rst deleted file mode 100644 index bba323df..00000000 --- a/docs/source/fastNLP.models.bert.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.bert -=================== - -.. automodule:: fastNLP.models.bert - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_controller.rst b/docs/source/fastNLP.models.enas_controller.rst deleted file mode 100644 index 28655bd7..00000000 --- a/docs/source/fastNLP.models.enas_controller.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_controller -=============================== - -.. 
automodule:: fastNLP.models.enas_controller - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_model.rst b/docs/source/fastNLP.models.enas_model.rst deleted file mode 100644 index 35fbe495..00000000 --- a/docs/source/fastNLP.models.enas_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_model -========================== - -.. automodule:: fastNLP.models.enas_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_trainer.rst b/docs/source/fastNLP.models.enas_trainer.rst deleted file mode 100644 index 7e0ef462..00000000 --- a/docs/source/fastNLP.models.enas_trainer.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_trainer -============================ - -.. automodule:: fastNLP.models.enas_trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_utils.rst b/docs/source/fastNLP.models.enas_utils.rst deleted file mode 100644 index 0a049706..00000000 --- a/docs/source/fastNLP.models.enas_utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_utils -========================== - -.. automodule:: fastNLP.models.enas_utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 57592bf4..5858ebcd 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -12,14 +12,8 @@ fastNLP.models .. toctree:: :titlesonly: - fastNLP.models.base_model - fastNLP.models.bert fastNLP.models.biaffine_parser fastNLP.models.cnn_text_classification - fastNLP.models.enas_controller - fastNLP.models.enas_model - fastNLP.models.enas_trainer - fastNLP.models.enas_utils fastNLP.models.sequence_labeling fastNLP.models.snli fastNLP.models.star_transformer diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 90f0fc8c..b031d051 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -3,12 +3,12 @@ batch 模块实现了 fastNLP 所需的 Batch 类。 """ import atexit +from queue import Empty, Full + import numpy as np import torch import torch.multiprocessing as mp -from queue import Empty, Full - from .sampler import RandomSampler __all__ = [ diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 0a5ddc52..51495f23 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -50,6 +50,7 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: """ import os + import torch try: diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 63f66019..f20dd1f8 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -273,9 +273,10 @@ """ import _pickle as pickle -import numpy as np import warnings +import numpy as np + from .field import AutoPadder from .field import FieldArray from .instance import Instance diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 4029a4ca..14e2538d 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -3,10 +3,10 @@ field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fas 原理部分请参考 :doc:`fastNLP.core.dataset` """ -import numpy as np - from copy import deepcopy +import numpy as np + __all__ = [ "FieldArray", "Padder", diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index b98c5ac7..797b557d 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -3,11 +3,11 @@ losses 模块定义了 fastNLP 中所需的各种损失函数,一般做为 :cl """ import inspect +from collections import defaultdict + import torch import torch.nn.functional as F -from collections import defaultdict - from 
.utils import _CheckError from .utils import _CheckRes from .utils import _build_args diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index df85a318..5ea2a5f1 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -3,11 +3,11 @@ metrics 模块实现了 fastNLP 所需的各种常用衡量指标,一般做为 """ import inspect +from collections import defaultdict + import numpy as np import torch -from collections import defaultdict - from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index a9ef7924..4f37e105 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -2,10 +2,10 @@ ..todo:: 检查这个类是否需要 """ -import torch - from collections import defaultdict +import torch + from . import Batch from . import DataSet from . import SequentialSampler diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index 0900e733..c8577722 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -1,10 +1,10 @@ """ sampler 子类实现了 fastNLP 所需的各种采样器。 """ -import numpy as np - from itertools import chain +import numpy as np + __all__ = [ "Sampler", "BucketSampler", diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 47aef46e..883e0d01 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -33,6 +33,7 @@ Tester在验证进行之前会调用model.eval()提示当前进入了evaluation """ import warnings + import torch import torch.nn as nn diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 87d57f12..7efa5d28 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -297,13 +297,13 @@ Example2.3 """ import os -import numpy as np import time +from datetime import datetime, timedelta + +import numpy as np import torch import torch.nn as nn -from datetime import datetime, timedelta - try: from tqdm.auto import tqdm except: diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index a7ad3326..6e2f99ff 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -3,14 +3,13 @@ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户 """ import _pickle import inspect -import numpy as np import os -import torch -import torch.nn as nn import warnings +from collections import Counter, namedtuple -from collections import Counter -from collections import namedtuple +import numpy as np +import torch +import torch.nn as nn __all__ = [ "cache_results", diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 3baf878c..6ce7ebc3 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -9,6 +9,11 @@ 这些类的使用方法如下: """ +from .embed_loader import EmbedLoader +from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ + PeopleDailyCorpusLoader, Conll2003Loader +from .model_io import ModelLoader, ModelSaver + __all__ = [ 'EmbedLoader', @@ -24,7 +29,3 @@ __all__ = [ 'ModelLoader', 'ModelSaver', ] -from .embed_loader import EmbedLoader -from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ - PeopleDailyCorpusLoader, Conll2003Loader -from .model_io import ModelLoader as ModelLoader, ModelSaver as ModelSaver \ No newline at end of file diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 051de281..33f59fe5 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -1,15 +1,20 @@ import _pickle as pickle import os +__all__ = [ + "BaseLoader" +] + class BaseLoader(object): """ 各个 Loader 的基类,提供了 API 的参考。 """ + def __init__(self): super(BaseLoader, 
self).__init__() - + @staticmethod def load_lines(data_path): """ @@ -20,7 +25,7 @@ class BaseLoader(object): with open(data_path, "r", encoding="utf=8") as f: text = f.readlines() return [line.strip() for line in text] - + @classmethod def load(cls, data_path): """ @@ -31,7 +36,7 @@ class BaseLoader(object): with open(data_path, "r", encoding="utf-8") as f: text = f.readlines() return [[word for word in sent.strip()] for sent in text] - + @classmethod def load_with_cache(cls, data_path, cache_path): """缓存版的load @@ -48,16 +53,18 @@ class BaseLoader(object): class DataLoaderRegister: _readers = {} - + @classmethod def set_reader(cls, reader_cls, read_fn_name): # def wrapper(reader_cls): if read_fn_name in cls._readers: - raise KeyError('duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, read_fn_name)) + raise KeyError( + 'duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, + read_fn_name)) if hasattr(reader_cls, 'load'): cls._readers[read_fn_name] = reader_cls().load return reader_cls - + @classmethod def get_reader(cls, read_fn_name): if read_fn_name in cls._readers: diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index 8fa30dd4..e67511ee 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -1,14 +1,20 @@ """ - 用于读入和处理和保存 config 文件 + .. todo:: + 这个模块中的类可能被抛弃? """ -__all__ = ["ConfigLoader","ConfigSection","ConfigSaver"] import configparser import json import os from .base_loader import BaseLoader +__all__ = [ + "ConfigLoader", + "ConfigSection", + "ConfigSaver" +] + class ConfigLoader(BaseLoader): """ @@ -19,15 +25,16 @@ class ConfigLoader(BaseLoader): :param str data_path: 配置文件的路径 """ + def __init__(self, data_path=None): super(ConfigLoader, self).__init__() if data_path is not None: self.config = self.parse(super(ConfigLoader, self).load(data_path)) - + @staticmethod def parse(string): raise NotImplementedError - + @staticmethod def load_config(file_path, sections): """ @@ -81,10 +88,10 @@ class ConfigSection(object): ConfigSection是一个存储了一个section中所有键值对的数据结构,推荐使用此类的实例来配合 :meth:`ConfigLoader.load_config` 使用 """ - + def __init__(self): super(ConfigSection, self).__init__() - + def __getitem__(self, key): """ :param key: str, the name of the attribute @@ -97,7 +104,7 @@ class ConfigSection(object): if key in self.__dict__.keys(): return getattr(self, key) raise AttributeError("do NOT have attribute %s" % key) - + def __setitem__(self, key, value): """ :param key: str, the name of the attribute @@ -112,14 +119,14 @@ class ConfigSection(object): raise AttributeError("attr %s except %s but got %s" % (key, str(type(getattr(self, key))), str(type(value)))) setattr(self, key, value) - + def __contains__(self, item): """ :param item: The key of item. :return: True if the key in self.__dict__.keys() else False. 
""" return item in self.__dict__.keys() - + def __eq__(self, other): """Overwrite the == operator @@ -131,15 +138,15 @@ class ConfigSection(object): return False if getattr(self, k) != getattr(self, k): return False - + for k in other.__dict__.keys(): if k not in self.__dict__.keys(): return False if getattr(self, k) != getattr(self, k): return False - + return True - + def __ne__(self, other): """Overwrite the != operator @@ -147,7 +154,7 @@ class ConfigSection(object): :return: """ return not self.__eq__(other) - + @property def data(self): return self.__dict__ @@ -162,11 +169,12 @@ class ConfigSaver(object): :param str file_path: 配置文件的路径 """ + def __init__(self, file_path): self.file_path = file_path if not os.path.exists(self.file_path): raise FileNotFoundError("file {} NOT found!".__format__(self.file_path)) - + def _get_section(self, sect_name): """ This is the function to get the section with the section name. @@ -177,7 +185,7 @@ class ConfigSaver(object): sect = ConfigSection() ConfigLoader().load_config(self.file_path, {sect_name: sect}) return sect - + def _read_section(self): """ This is the function to read sections from the config file. @@ -187,16 +195,16 @@ class ConfigSaver(object): sect_key_list: A list of names in sect_list. """ sect_name = None - + sect_list = {} sect_key_list = [] - + single_section = {} single_section_key = [] - + with open(self.file_path, 'r') as f: lines = f.readlines() - + for line in lines: if line.startswith('[') and line.endswith(']\n'): if sect_name is None: @@ -208,29 +216,29 @@ class ConfigSaver(object): sect_key_list.append(sect_name) sect_name = line[1: -2] continue - + if line.startswith('#'): single_section[line] = '#' single_section_key.append(line) continue - + if line.startswith('\n'): single_section_key.append('\n') continue - + if '=' not in line: raise RuntimeError("can NOT load config file {}".__format__(self.file_path)) - + key = line.split('=', maxsplit=1)[0].strip() value = line.split('=', maxsplit=1)[1].strip() + '\n' single_section[key] = value single_section_key.append(key) - + if sect_name is not None: sect_list[sect_name] = single_section, single_section_key sect_key_list.append(sect_name) return sect_list, sect_key_list - + def _write_section(self, sect_list, sect_key_list): """ This is the function to write config file with section list and name list. @@ -252,7 +260,7 @@ class ConfigSaver(object): continue f.write(key + ' = ' + single_section[key]) f.write('\n') - + def save_config_file(self, section_name, section): """ 这个方法可以用来修改并保存配置文件中单独的一个 section @@ -284,11 +292,11 @@ class ConfigSaver(object): break if not change_file: return - + sect_list, sect_key_list = self._read_section() if section_name not in sect_key_list: raise AttributeError() - + sect, sect_key = sect_list[section_name] for k in section.__dict__.keys(): if k not in sect_key: diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 3cd475a5..a4b233ad 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -10,6 +10,12 @@ dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的 # ... 
do stuff """ +from nltk.tree import Tree + +from ..core.dataset import DataSet +from ..core.instance import Instance +from .file_reader import _read_csv, _read_json, _read_conll + __all__ = [ 'DataSetLoader', 'CSVLoader', @@ -20,11 +26,6 @@ __all__ = [ 'PeopleDailyCorpusLoader', 'Conll2003Loader', ] -from nltk.tree import Tree - -from ..core.dataset import DataSet -from ..core.instance import Instance -from .file_reader import _read_csv, _read_json, _read_conll def _download_from_url(url, path): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 9f3a73dd..7a845366 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,11 +1,15 @@ import os +import warnings import numpy as np from ..core.vocabulary import Vocabulary from .base_loader import BaseLoader -import warnings +__all__ = [ + "EmbedLoader" +] + class EmbedLoader(BaseLoader): """ @@ -13,10 +17,10 @@ class EmbedLoader(BaseLoader): 用于读取预训练的embedding, 读取结果可直接载入为模型参数。 """ - + def __init__(self): super(EmbedLoader, self).__init__() - + @staticmethod def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): """ @@ -40,11 +44,11 @@ class EmbedLoader(BaseLoader): line = f.readline().strip() parts = line.split() start_idx = 0 - if len(parts)==2: + if len(parts) == 2: dim = int(parts[1]) start_idx += 1 else: - dim = len(parts)-1 + dim = len(parts) - 1 f.seek(0) matrix = np.random.randn(len(vocab), dim).astype(dtype) for idx, line in enumerate(f, start_idx): @@ -63,21 +67,21 @@ class EmbedLoader(BaseLoader): total_hits = sum(hit_flags) print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) found_vectors = matrix[hit_flags] - if len(found_vectors)!=0: + if len(found_vectors) != 0: mean = np.mean(found_vectors, axis=0, keepdims=True) std = np.std(found_vectors, axis=0, keepdims=True) unfound_vec_num = len(vocab) - total_hits - r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype)*std + mean - matrix[hit_flags==False] = r_vecs - + r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean + matrix[hit_flags == False] = r_vecs + if normalize: matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) - + return matrix - + @staticmethod def load_without_vocab(embed_filepath, dtype=np.float32, padding='', unknown='', normalize=True, - error='ignore'): + error='ignore'): """ 从embed_filepath中读取预训练的word vector。根据预训练的词表读取embedding并生成一个对应的Vocabulary。 @@ -96,35 +100,35 @@ class EmbedLoader(BaseLoader): vec_dict = {} found_unknown = False found_pad = False - + with open(embed_filepath, 'r', encoding='utf-8') as f: line = f.readline() start = 1 dim = -1 - if len(line.strip().split())!=2: + if len(line.strip().split()) != 2: f.seek(0) start = 0 for idx, line in enumerate(f, start=start): try: parts = line.strip().split() word = parts[0] - if dim==-1: - dim = len(parts)-1 + if dim == -1: + dim = len(parts) - 1 vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) vec_dict[word] = vec vocab.add_word(word) - if unknown is not None and unknown==word: + if unknown is not None and unknown == word: found_unknown = True - if found_pad is not None and padding==word: + if found_pad is not None and padding == word: found_pad = True except Exception as e: - if error=='ignore': + if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) pass else: print("Error occurred at the {} line.".format(idx)) raise e - if dim==-1: + if dim == -1: raise RuntimeError("{} is an empty 
file.".format(embed_filepath)) matrix = np.random.randn(len(vocab), dim).astype(dtype) if (unknown is not None and not found_unknown) or (padding is not None and not found_pad): @@ -133,19 +137,19 @@ class EmbedLoader(BaseLoader): start_idx += 1 if unknown is not None: start_idx += 1 - + mean = np.mean(matrix[start_idx:], axis=0, keepdims=True) std = np.std(matrix[start_idx:], axis=0, keepdims=True) if (unknown is not None and not found_unknown): - matrix[start_idx-1] = np.random.randn(1, dim).astype(dtype)*std + mean + matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean if (padding is not None and not found_pad): - matrix[0] = np.random.randn(1, dim).astype(dtype)*std + mean - + matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean + for key, vec in vec_dict.items(): index = vocab.to_index(key) matrix[index] = vec - + if normalize: matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) - + return matrix, vocab diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index 48e53ab3..36393cd4 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -5,6 +5,11 @@ import torch from .base_loader import BaseLoader +__all__ = [ + "ModelLoader", + "ModelSaver" +] + class ModelLoader(BaseLoader): """ @@ -12,10 +17,10 @@ class ModelLoader(BaseLoader): 用于读取模型 """ - + def __init__(self): super(ModelLoader, self).__init__() - + @staticmethod def load_pytorch(empty_model, model_path): """ @@ -25,7 +30,7 @@ class ModelLoader(BaseLoader): :param str model_path: 模型保存的路径 """ empty_model.load_state_dict(torch.load(model_path)) - + @staticmethod def load_pytorch_model(model_path): """ @@ -48,14 +53,14 @@ class ModelSaver(object): saver.save_pytorch(model) """ - + def __init__(self, save_path): """ :param save_path: 模型保存的路径 """ self.save_path = save_path - + def save_pytorch(self, model, param_only=True): """ 把 PyTorch 模型存入 ".pkl" 文件 diff --git a/fastNLP/models/__init__.py b/fastNLP/models/__init__.py index 66af3a46..f9ade153 100644 --- a/fastNLP/models/__init__.py +++ b/fastNLP/models/__init__.py @@ -7,7 +7,6 @@ fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models """ -__all__ = ["CNNText", "SeqLabeling", "ESIM", "STSeqLabel", "AdvSeqLabel", "STNLICls", "STSeqCls"] from .base_model import BaseModel from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ BertForTokenClassification @@ -15,4 +14,21 @@ from .biaffine_parser import BiaffineParser, GraphParser from .cnn_text_classification import CNNText from .sequence_labeling import SeqLabeling, AdvSeqLabel from .snli import ESIM -from .star_transformer import STSeqCls, STNLICls, STSeqLabel +from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel + +__all__ = [ + "CNNText", + + "SeqLabeling", + "AdvSeqLabel", + + "ESIM", + + "StarTransEnc", + "STSeqLabel", + "STNLICls", + "STSeqCls", + + "BiaffineParser", + "GraphParser" +] diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 39ac99a0..d27f1d21 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -6,13 +6,13 @@ from ..modules.decoder.MLP import MLP class BaseModel(torch.nn.Module): """Base PyTorch model for all models. 
""" - + def __init__(self): super(BaseModel, self).__init__() - + def fit(self, train_data, dev_data=None, **train_args): pass - + def predict(self, *args, **kwargs): raise NotImplementedError @@ -21,9 +21,9 @@ class NaiveClassifier(BaseModel): def __init__(self, in_feature_dim, out_feature_dim): super(NaiveClassifier, self).__init__() self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) - + def forward(self, x): return {"predict": torch.sigmoid(self.mlp(x))} - + def predict(self, x): return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 100bfb72..7f16202d 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -1,11 +1,12 @@ -"""Biaffine Dependency Parser 的 Pytorch 实现. """ -from collections import defaultdict - +Biaffine Dependency Parser 的 Pytorch 实现. +""" import numpy as np import torch -from torch import nn -from torch.nn import functional as F +import torch.nn as nn +import torch.nn.functional as F + +from collections import defaultdict from ..core.const import Const as C from ..core.losses import LossFunc @@ -18,6 +19,12 @@ from ..modules.utils import get_embeddings from .base_model import BaseModel from ..core.utils import seq_len_to_mask +__all__ = [ + "BiaffineParser", + "GraphParser" +] + + def _mst(scores): """ with some modification to support parser output for MST decoding @@ -44,7 +51,7 @@ def _mst(scores): scores[roots, new_heads] / root_scores)] heads[roots] = new_heads heads[new_root] = 0 - + edges = defaultdict(set) vertices = set((0,)) for dep, head in enumerate(heads[tokens]): @@ -73,7 +80,7 @@ def _mst(scores): heads[changed_cycle] = new_head edges[new_head].add(changed_cycle) edges[old_head].remove(changed_cycle) - + return heads @@ -88,7 +95,7 @@ def _find_cycle(vertices, edges): _lowlinks = {} _onstack = defaultdict(lambda: False) _SCCs = [] - + def _strongconnect(v): nonlocal _index _indices[v] = _index @@ -96,28 +103,28 @@ def _find_cycle(vertices, edges): _index += 1 _stack.append(v) _onstack[v] = True - + for w in edges[v]: if w not in _indices: _strongconnect(w) _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) elif _onstack[w]: _lowlinks[v] = min(_lowlinks[v], _indices[w]) - + if _lowlinks[v] == _indices[v]: SCC = set() while True: w = _stack.pop() _onstack[w] = False SCC.add(w) - if not(w != v): + if not (w != v): break _SCCs.append(SCC) - + for v in vertices: if v not in _indices: _strongconnect(v) - + return [SCC for SCC in _SCCs if len(SCC) > 1] @@ -125,9 +132,10 @@ class GraphParser(BaseModel): """ 基于图的parser base class, 支持贪婪解码和最大生成树解码 """ + def __init__(self): super(GraphParser, self).__init__() - + @staticmethod def greedy_decoder(arc_matrix, mask=None): """ @@ -146,7 +154,7 @@ class GraphParser(BaseModel): if mask is not None: heads *= mask.long() return heads - + @staticmethod def mst_decoder(arc_matrix, mask=None): """ @@ -176,6 +184,7 @@ class ArcBiaffine(nn.Module): :param hidden_size: 输入的特征维度 :param bias: 是否使用bias. Default: ``True`` """ + def __init__(self, hidden_size, bias=True): super(ArcBiaffine, self).__init__() self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True) @@ -185,7 +194,7 @@ class ArcBiaffine(nn.Module): else: self.register_parameter("bias", None) initial_parameter(self) - + def forward(self, head, dep): """ @@ -209,11 +218,12 @@ class LabelBilinear(nn.Module): :param num_label: 边类别的个数 :param bias: 是否使用bias. 
Default: ``True`` """ + def __init__(self, in1_features, in2_features, num_label, bias=True): super(LabelBilinear, self).__init__() self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias) self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False) - + def forward(self, x1, x2): """ @@ -225,13 +235,13 @@ class LabelBilinear(nn.Module): output += self.lin(torch.cat([x1, x2], dim=2)) return output + class BiaffineParser(GraphParser): """ 别名::class:`fastNLP.models.BiaffineParser` :class:`fastNLP.models.baffine_parser.BiaffineParser` Biaffine Dependency Parser 实现. - 论文参考 ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) - `_ . + 论文参考 `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) `_ . :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, @@ -248,18 +258,19 @@ class BiaffineParser(GraphParser): :param use_greedy_infer: 是否在inference时使用贪心算法. 若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False`` """ + def __init__(self, - init_embed, - pos_vocab_size, - pos_emb_dim, - num_label, - rnn_layers=1, - rnn_hidden_size=200, - arc_mlp_size=100, - label_mlp_size=100, - dropout=0.3, - encoder='lstm', - use_greedy_infer=False): + init_embed, + pos_vocab_size, + pos_emb_dim, + num_label, + rnn_layers=1, + rnn_hidden_size=200, + arc_mlp_size=100, + label_mlp_size=100, + dropout=0.3, + encoder='lstm', + use_greedy_infer=False): super(BiaffineParser, self).__init__() rnn_out_size = 2 * rnn_hidden_size word_hid_dim = pos_hid_dim = rnn_hidden_size @@ -295,20 +306,20 @@ class BiaffineParser(GraphParser): if (d_k * n_head) != rnn_out_size: raise ValueError('unsupported rnn_out_size: {} for transformer'.format(rnn_out_size)) self.position_emb = nn.Embedding(num_embeddings=self.max_len, - embedding_dim=rnn_out_size,) + embedding_dim=rnn_out_size, ) self.encoder = TransformerEncoder(num_layers=rnn_layers, model_size=rnn_out_size, inner_size=1024, key_size=d_k, value_size=d_v, num_head=n_head, - dropout=dropout,) + dropout=dropout, ) else: raise ValueError('unsupported encoder type: {}'.format(encoder)) - + self.mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size * 2 + label_mlp_size * 2), - nn.ELU(), - TimestepDropout(p=dropout),) + nn.ELU(), + TimestepDropout(p=dropout), ) self.arc_mlp_size = arc_mlp_size self.label_mlp_size = label_mlp_size self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True) @@ -316,7 +327,7 @@ class BiaffineParser(GraphParser): self.use_greedy_infer = use_greedy_infer self.reset_parameters() self.dropout = dropout - + def reset_parameters(self): for m in self.modules(): if isinstance(m, nn.Embedding): @@ -327,7 +338,7 @@ class BiaffineParser(GraphParser): else: for p in m.parameters(): nn.init.normal_(p, 0, 0.1) - + def forward(self, words1, words2, seq_len, target1=None): """模型forward阶段 @@ -337,50 +348,52 @@ class BiaffineParser(GraphParser): :param target1: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效, 用于训练label分类器. 
若为 ``None`` , 使用预测的heads输入到label分类器 Default: ``None`` - :return dict: parsing结果:: + :return dict: parsing + 结果:: + + pred1: [batch_size, seq_len, seq_len] 边预测logits + pred2: [batch_size, seq_len, num_label] label预测logits + pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测 - pred1: [batch_size, seq_len, seq_len] 边预测logits - pred2: [batch_size, seq_len, num_label] label预测logits - pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测 """ # prepare embeddings batch_size, length = words1.shape # print('forward {} {}'.format(batch_size, seq_len)) - + # get sequence mask mask = seq_len_to_mask(seq_len).long() - - word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] - pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] - + + word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] + pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] + word, pos = self.word_fc(word), self.pos_fc(pos) word, pos = self.word_norm(word), self.pos_norm(pos) - x = torch.cat([word, pos], dim=2) # -> [N,L,C] - + x = torch.cat([word, pos], dim=2) # -> [N,L,C] + # encoder, extract features if self.encoder_name.endswith('lstm'): sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True) x = x[sort_idx] x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True) - feat, _ = self.encoder(x) # -> [N,L,C] + feat, _ = self.encoder(x) # -> [N,L,C] feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True) _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) feat = feat[unsort_idx] else: - seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None,:] + seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None, :] x = x + self.position_emb(seq_range) feat = self.encoder(x, mask.float()) - + # for arc biaffine # mlp, reduce dim feat = self.mlp(feat) arc_sz, label_sz = self.arc_mlp_size, self.label_mlp_size - arc_dep, arc_head = feat[:,:,:arc_sz], feat[:,:,arc_sz:2*arc_sz] - label_dep, label_head = feat[:,:,2*arc_sz:2*arc_sz+label_sz], feat[:,:,2*arc_sz+label_sz:] - + arc_dep, arc_head = feat[:, :, :arc_sz], feat[:, :, arc_sz:2 * arc_sz] + label_dep, label_head = feat[:, :, 2 * arc_sz:2 * arc_sz + label_sz], feat[:, :, 2 * arc_sz + label_sz:] + # biaffine arc classifier - arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] - + arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] + # use gold or predicted arc to predict label if target1 is None or not self.training: # use greedy decoding in training @@ -390,22 +403,22 @@ class BiaffineParser(GraphParser): heads = self.mst_decoder(arc_pred, mask) head_pred = heads else: - assert self.training # must be training mode + assert self.training # must be training mode if target1 is None: heads = self.greedy_decoder(arc_pred, mask) head_pred = heads else: head_pred = None heads = target1 - + batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1) label_head = label_head[batch_range, heads].contiguous() - label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] + label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] res_dict = {C.OUTPUTS(0): arc_pred, C.OUTPUTS(1): label_pred} if head_pred is not None: res_dict[C.OUTPUTS(2)] = head_pred return res_dict - + @staticmethod def loss(pred1, pred2, target1, target2, seq_len): """ @@ -418,7 +431,7 @@ class BiaffineParser(GraphParser): :param seq_len: [batch_size, seq_len] 真实目标的长度 :return loss: scalar """ - + batch_size, length, _ = pred1.shape mask = 
seq_len_to_mask(seq_len) flip_mask = (mask == 0) @@ -430,24 +443,26 @@ class BiaffineParser(GraphParser): child_index = torch.arange(length, device=arc_logits.device, dtype=torch.long).unsqueeze(0) arc_loss = arc_logits[batch_index, child_index, target1] label_loss = label_logits[batch_index, child_index, target2] - + byte_mask = flip_mask.byte() arc_loss.masked_fill_(byte_mask, 0) label_loss.masked_fill_(byte_mask, 0) arc_nll = -arc_loss.mean() label_nll = -label_loss.mean() return arc_nll + label_nll - + def predict(self, words1, words2, seq_len): """模型预测API :param words1: [batch_size, seq_len] 输入word序列 :param words2: [batch_size, seq_len] 输入pos序列 :param seq_len: [batch_size, seq_len] 输入序列长度 - :return dict: parsing结果:: + :return dict: parsing + 结果:: + + pred1: [batch_size, seq_len] heads的预测结果 + pred2: [batch_size, seq_len, num_label] label预测logits - pred1: [batch_size, seq_len] heads的预测结果 - pred2: [batch_size, seq_len, num_label] label预测logits """ res = self(words1, words2, seq_len) output = {} @@ -470,6 +485,7 @@ class ParserLoss(LossFunc): :param seq_len: [batch_size, seq_len] 真实目标的长度 :return loss: scalar """ + def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): @@ -497,9 +513,10 @@ class ParserMetric(MetricBase): UAS: 不带label时, 边预测的准确率 LAS: 同时预测边和label的准确率 """ + def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): - + super().__init__() self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2, @@ -507,13 +524,13 @@ class ParserMetric(MetricBase): self.num_arc = 0 self.num_label = 0 self.num_sample = 0 - + def get_metric(self, reset=True): - res = {'UAS': self.num_arc*1.0 / self.num_sample, 'LAS': self.num_label*1.0 / self.num_sample} + res = {'UAS': self.num_arc * 1.0 / self.num_sample, 'LAS': self.num_label * 1.0 / self.num_sample} if reset: self.num_sample = self.num_label = self.num_arc = 0 return res - + def evaluate(self, pred1, pred2, target1, target2, seq_len=None): """Evaluate the performance of prediction. 
""" @@ -522,7 +539,7 @@ class ParserMetric(MetricBase): else: seq_mask = seq_len_to_mask(seq_len.long()).long() # mask out tag - seq_mask[:,0] = 0 + seq_mask[:, 0] = 0 head_pred_correct = (pred1 == target1).long() * seq_mask label_pred_correct = (pred2 == target2).long() * head_pred_correct self.num_arc += head_pred_correct.sum().item() diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 01b03b9f..a9ccc568 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -1,12 +1,13 @@ -# python: 3.6 -# encoding: utf-8 - import torch import torch.nn as nn -from ..core.const import Const as C +from ..core.const import Const as C from ..modules import encoder +__all__ = [ + "CNNText" +] + class CNNText(torch.nn.Module): """ @@ -23,7 +24,7 @@ class CNNText(torch.nn.Module): :param int padding: 对句子前后的pad的大小, 用0填充。 :param float dropout: Dropout的大小 """ - + def __init__(self, init_embed, num_classes, kernel_nums=(3, 4, 5), @@ -31,7 +32,7 @@ class CNNText(torch.nn.Module): padding=0, dropout=0.5): super(CNNText, self).__init__() - + # no support for pre-trained embedding currently self.embed = encoder.Embedding(init_embed) self.conv_pool = encoder.ConvMaxpool( @@ -41,7 +42,7 @@ class CNNText(torch.nn.Module): padding=padding) self.dropout = nn.Dropout(dropout) self.fc = nn.Linear(sum(kernel_nums), num_classes) - + def forward(self, words, seq_len=None): """ @@ -54,7 +55,7 @@ class CNNText(torch.nn.Module): x = self.dropout(x) x = self.fc(x) # [N,C] -> [N, N_class] return {C.OUTPUT: x} - + def predict(self, words, seq_len=None): """ :param torch.LongTensor words: [batch_size, seq_len],句子中word的index diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py index 16b970e6..e83c6b51 100644 --- a/fastNLP/models/enas_controller.py +++ b/fastNLP/models/enas_controller.py @@ -5,6 +5,7 @@ import os import torch import torch.nn.functional as F + from . import enas_utils as utils from .enas_utils import Node diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py index 5c667927..b6b683c0 100644 --- a/fastNLP/models/enas_model.py +++ b/fastNLP/models/enas_model.py @@ -1,17 +1,19 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -"""Module containing the shared RNN model.""" -import numpy as np +""" +Module containing the shared RNN model. +Code Modified from https://github.com/carpedm20/ENAS-pytorch +""" import collections +import numpy as np import torch -from torch import nn +import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable from . import enas_utils as utils from .base_model import BaseModel + def _get_dropped_weights(w_raw, dropout_p, is_training): """Drops out weights to implement DropConnect. @@ -35,12 +37,13 @@ def _get_dropped_weights(w_raw, dropout_p, is_training): The above TODO is the reason for the hacky check for `torch.nn.Parameter`. """ dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) - + if isinstance(dropped_w, torch.nn.Parameter): dropped_w = dropped_w.clone() - + return dropped_w + class EmbeddingDropout(torch.nn.Embedding): """Class for dropping out embeddings by zero'ing out parameters in the embedding matrix. @@ -53,6 +56,7 @@ class EmbeddingDropout(torch.nn.Embedding): See 'A Theoretically Grounded Application of Dropout in Recurrent Neural Networks', (Gal and Ghahramani, 2016). 
""" + def __init__(self, num_embeddings, embedding_dim, @@ -83,14 +87,14 @@ class EmbeddingDropout(torch.nn.Embedding): assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' 'and < 1.0') self.scale = scale - + def forward(self, inputs): # pylint:disable=arguments-differ """Embeds `inputs` with the dropped out embedding weight matrix.""" if self.training: dropout = self.dropout else: dropout = 0 - + if dropout: mask = self.weight.data.new(self.weight.size(0), 1) mask.bernoulli_(1 - dropout) @@ -101,7 +105,7 @@ class EmbeddingDropout(torch.nn.Embedding): masked_weight = self.weight if self.scale and self.scale != 1: masked_weight = masked_weight * self.scale - + return F.embedding(inputs, masked_weight, max_norm=self.max_norm, @@ -114,7 +118,7 @@ class LockedDropout(nn.Module): # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py def __init__(self): super().__init__() - + def forward(self, x, dropout=0.5): if not self.training or not dropout: return x @@ -126,11 +130,12 @@ class LockedDropout(nn.Module): class ENASModel(BaseModel): """Shared RNN model.""" + def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): super(ENASModel, self).__init__() - + self.use_cuda = cuda - + self.shared_hid = shared_hid self.num_blocks = num_blocks self.decoder = nn.Linear(self.shared_hid, num_classes) @@ -139,16 +144,16 @@ class ENASModel(BaseModel): dropout=0.1) self.lockdrop = LockedDropout() self.dag = None - + # Tie weights # self.decoder.weight = self.encoder.weight - + # Since W^{x, c} and W^{h, c} are always summed, there # is no point duplicating their bias offset parameter. Likewise for # W^{x, h} and W^{h, h}. self.w_xc = nn.Linear(shared_embed, self.shared_hid) self.w_xh = nn.Linear(shared_embed, self.shared_hid) - + # The raw weights are stored here because the hidden-to-hidden weights # are weight dropped on the forward pass. 
self.w_hc_raw = torch.nn.Parameter( @@ -157,10 +162,10 @@ class ENASModel(BaseModel): torch.Tensor(self.shared_hid, self.shared_hid)) self.w_hc = None self.w_hh = None - + self.w_h = collections.defaultdict(dict) self.w_c = collections.defaultdict(dict) - + for idx in range(self.num_blocks): for jdx in range(idx + 1, self.num_blocks): self.w_h[idx][jdx] = nn.Linear(self.shared_hid, @@ -169,48 +174,47 @@ class ENASModel(BaseModel): self.w_c[idx][jdx] = nn.Linear(self.shared_hid, self.shared_hid, bias=False) - + self._w_h = nn.ModuleList([self.w_h[idx][jdx] for idx in self.w_h for jdx in self.w_h[idx]]) self._w_c = nn.ModuleList([self.w_c[idx][jdx] for idx in self.w_c for jdx in self.w_c[idx]]) - + self.batch_norm = None # if args.mode == 'train': # self.batch_norm = nn.BatchNorm1d(self.shared_hid) # else: # self.batch_norm = None - + self.reset_parameters() self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - + def setDAG(self, dag): if self.dag is None: self.dag = dag - + def forward(self, word_seq, hidden=None): inputs = torch.transpose(word_seq, 0, 1) - + time_steps = inputs.size(0) batch_size = inputs.size(1) - - + self.w_hh = _get_dropped_weights(self.w_hh_raw, 0.5, self.training) self.w_hc = _get_dropped_weights(self.w_hc_raw, 0.5, self.training) - + # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden hidden = self.static_init_hidden[batch_size] - + embed = self.encoder(inputs) - + embed = self.lockdrop(embed, 0.65 if self.training else 0) - + # The norm of hidden states are clipped here because # otherwise ENAS is especially prone to exploding activations on the # forward pass. This could probably be fixed in a more elegant way, but @@ -226,7 +230,7 @@ class ENASModel(BaseModel): for step in range(time_steps): x_t = embed[step] logit, hidden = self.cell(x_t, hidden, self.dag) - + hidden_norms = hidden.norm(dim=-1) max_norm = 25.0 if hidden_norms.data.max() > max_norm: @@ -237,60 +241,60 @@ class ENASModel(BaseModel): # because the PyTorch slicing and slice assignment is too # flaky. 
hidden_norms = hidden_norms.data.cpu().numpy() - + clipped_num += 1 if hidden_norms.max() > max_clipped_norm: max_clipped_norm = hidden_norms.max() - + clip_select = hidden_norms > max_norm clip_norms = hidden_norms[clip_select] - + mask = np.ones(hidden.size()) - normalizer = max_norm/clip_norms + normalizer = max_norm / clip_norms normalizer = normalizer[:, np.newaxis] - + mask[clip_select] = normalizer - + if self.use_cuda: hidden *= torch.autograd.Variable( torch.FloatTensor(mask).cuda(), requires_grad=False) else: hidden *= torch.autograd.Variable( - torch.FloatTensor(mask), requires_grad=False) + torch.FloatTensor(mask), requires_grad=False) logits.append(logit) h1tohT.append(hidden) - + h1tohT = torch.stack(h1tohT) output = torch.stack(logits) raw_output = output - + output = self.lockdrop(output, 0.4 if self.training else 0) - - #Pooling + + # Pooling output = torch.mean(output, 0) - + decoded = self.decoder(output) - + extra_out = {'dropped': decoded, 'hiddens': h1tohT, 'raw': raw_output} return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} - + def cell(self, x, h_prev, dag): """Computes a single pass through the discovered RNN cell.""" c = {} h = {} f = {} - + f[0] = self.get_f(dag[-1][0].name) c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) - h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + - (1 - c[0])*h_prev) - + h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + + (1 - c[0]) * h_prev) + leaf_node_ids = [] q = collections.deque() q.append(0) - + # Computes connections from the parent nodes `node_id` # to their child nodes `next_id` recursively, skipping leaf nodes. A # leaf node is a node whose id == `self.num_blocks`. @@ -306,10 +310,10 @@ class ENASModel(BaseModel): while True: if len(q) == 0: break - + node_id = q.popleft() nodes = dag[node_id] - + for next_node in nodes: next_id = next_node.id if next_id == self.num_blocks: @@ -317,38 +321,38 @@ class ENASModel(BaseModel): assert len(nodes) == 1, ('parent of leaf node should have ' 'only one child') continue - + w_h = self.w_h[node_id][next_id] w_c = self.w_c[node_id][next_id] - + f[next_id] = self.get_f(next_node.name) c[next_id] = torch.sigmoid(w_c(h[node_id])) - h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + - (1 - c[next_id])*h[node_id]) - + h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) + + (1 - c[next_id]) * h[node_id]) + q.append(next_id) - + # Instead of averaging loose ends, perhaps there should # be a set of separate unshared weights for each "loose" connection # between each node in a cell and the output. # # As it stands, all weights W^h_{ij} are doing double duty by # connecting both from i to j, as well as from i to the output. 
- + # average all the loose ends leaf_nodes = [h[node_id] for node_id in leaf_node_ids] output = torch.mean(torch.stack(leaf_nodes, 2), -1) - + # stabilizing the Updates of omega if self.batch_norm is not None: output = self.batch_norm(output) - + return output, h[self.num_blocks - 1] - + def init_hidden(self, batch_size): zeros = torch.zeros(batch_size, self.shared_hid) return utils.get_variable(zeros, self.use_cuda, requires_grad=False) - + def get_f(self, name): name = name.lower() if name == 'relu': @@ -360,22 +364,21 @@ class ENASModel(BaseModel): elif name == 'sigmoid': f = torch.sigmoid return f - - + @property def num_parameters(self): def size(p): return np.prod(p.size()) + return sum([size(param) for param in self.parameters()]) - - + def reset_parameters(self): init_range = 0.025 # init_range = 0.025 if self.args.mode == 'train' else 0.04 for param in self.parameters(): param.data.uniform_(-init_range, init_range) self.decoder.bias.data.fill_(0) - + def predict(self, word_seq): """ diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py index 9cd7d8d0..ef596b03 100644 --- a/fastNLP/models/enas_trainer.py +++ b/fastNLP/models/enas_trainer.py @@ -1,12 +1,12 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch - -import time -from datetime import datetime -from datetime import timedelta - +import math import numpy as np +import time import torch -import math + +from datetime import datetime, timedelta + +from torch.optim import Adam try: from tqdm.auto import tqdm @@ -21,8 +21,6 @@ from ..core.utils import _move_dict_value_to_device from . import enas_utils as utils from ..core.utils import _build_args -from torch.optim import Adam - def _get_no_grad_ctx_mgr(): """Returns a the `torch.no_grad` context manager for PyTorch version >= @@ -33,6 +31,7 @@ def _get_no_grad_ctx_mgr(): class ENASTrainer(Trainer): """A class to wrap training code.""" + def __init__(self, train_data, model, controller, **kwargs): """Constructor for training algorithm. 
:param DataSet train_data: the training data @@ -45,19 +44,19 @@ class ENASTrainer(Trainer): self.controller_step = 0 self.shared_step = 0 self.max_length = 35 - + self.shared = model self.controller = controller - + self.shared_optim = Adam( self.shared.parameters(), lr=20.0, weight_decay=1e-7) - + self.controller_optim = Adam( self.controller.parameters(), lr=3.5e-4) - + def train(self, load_best_model=True): """ :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 @@ -82,21 +81,22 @@ class ENASTrainer(Trainer): self.model = self.model.cuda() self._model_device = self.model.parameters().__next__().device self._mode(self.model, is_test=False) - + self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() print("training epochs started " + self.start_time, flush=True) - + try: self.callback_manager.on_train_begin() self._train() self.callback_manager.on_train_end() except (CallbackException, KeyboardInterrupt) as e: self.callback_manager.on_exception(e) - + if self.dev_data is not None: - print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf),) + print( + "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + + self.tester._format_eval_results(self.best_dev_perf), ) results['best_eval'] = self.best_dev_perf results['best_epoch'] = self.best_dev_epoch results['best_step'] = self.best_dev_step @@ -110,9 +110,9 @@ class ENASTrainer(Trainer): finally: pass results['seconds'] = round(time.time() - start_time, 2) - + return results - + def _train(self): if not self.use_tqdm: from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm @@ -126,21 +126,21 @@ class ENASTrainer(Trainer): avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, prefetch=self.prefetch) - for epoch in range(1, self.n_epochs+1): + for epoch in range(1, self.n_epochs + 1): pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) if epoch == self.n_epochs + 1 - self.final_epochs: print('Entering the final stage. (Only train the selected structure)') # early stopping self.callback_manager.on_epoch_begin() - + # 1. Training the shared parameters omega of the child models self.train_shared(pbar) - + # 2. Training the controller parameters theta if not last_stage: self.train_controller() - + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: @@ -149,16 +149,15 @@ class ENASTrainer(Trainer): eval_res = self._do_validation(epoch=epoch, step=self.step) eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \ - self.tester._format_eval_results(eval_res) + self.tester._format_eval_results(eval_res) pbar.write(eval_str) - + # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # pbar.close() # ============ tqdm end ============== # - - + def get_loss(self, inputs, targets, hidden, dags): """Computes the loss for the same batch for M models. 
@@ -167,7 +166,7 @@ class ENASTrainer(Trainer): """ if not isinstance(dags, list): dags = [dags] - + loss = 0 for dag in dags: self.shared.setDAG(dag) @@ -175,14 +174,14 @@ class ENASTrainer(Trainer): inputs['hidden'] = hidden result = self.shared(**inputs) output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] - + self.callback_manager.on_loss_begin(targets, result) sample_loss = self._compute_loss(result, targets) loss += sample_loss - + assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' return loss, hidden, extra_out - + def train_shared(self, pbar=None, max_step=None, dag=None): """Train the language model for 400 steps of minibatches of 64 examples. @@ -200,9 +199,9 @@ class ENASTrainer(Trainer): model = self.shared model.train() self.controller.eval() - + hidden = self.shared.init_hidden(self.batch_size) - + abs_max_grad = 0 abs_max_hidden_norm = 0 step = 0 @@ -211,15 +210,15 @@ class ENASTrainer(Trainer): train_idx = 0 avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - + prefetch=self.prefetch) + for batch_x, batch_y in data_iterator: _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) indices = data_iterator.get_batch_indices() # negative sampling; replace unknown; re-weight batch_y self.callback_manager.on_batch_begin(batch_x, batch_y, indices) # prediction = self._data_forward(self.model, batch_x) - + dags = self.controller.sample(1) inputs, targets = batch_x, batch_y # self.callback_manager.on_loss_begin(batch_y, prediction) @@ -228,18 +227,18 @@ class ENASTrainer(Trainer): hidden, dags) hidden.detach_() - + avg_loss += loss.item() - + # Is loss NaN or inf? requires_grad = False self.callback_manager.on_backward_begin(loss) self._grad_backward(loss) self.callback_manager.on_backward_end() - + self._update() self.callback_manager.on_step_end() - - if (self.step+1) % self.print_every == 0: + + if (self.step + 1) % self.print_every == 0: if self.use_tqdm: print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) pbar.update(self.print_every) @@ -255,30 +254,29 @@ class ENASTrainer(Trainer): self.shared_step += 1 self.callback_manager.on_batch_end() # ================= mini-batch end ==================== # - - + def get_reward(self, dag, entropies, hidden, valid_idx=0): """Computes the perplexity of a single sampled model on a minibatch of validation data. """ if not isinstance(entropies, np.ndarray): entropies = entropies.data.cpu().numpy() - + data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - + prefetch=self.prefetch) + for inputs, targets in data_iterator: valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) valid_loss = utils.to_item(valid_loss.data) - + valid_ppl = math.exp(valid_loss) - + R = 80 / valid_ppl - + rewards = R + 1e-4 * entropies - + return rewards, hidden - + def train_controller(self): """Fixes the shared parameters and updates the controller parameters. @@ -296,13 +294,13 @@ class ENASTrainer(Trainer): # Why can't we call shared.eval() here? Leads to loss # being uniformly zero for the controller. 
# self.shared.eval() - + avg_reward_base = None baseline = None adv_history = [] entropy_history = [] reward_history = [] - + hidden = self.shared.init_hidden(self.batch_size) total_loss = 0 valid_idx = 0 @@ -310,7 +308,7 @@ class ENASTrainer(Trainer): # sample models dags, log_probs, entropies = self.controller.sample( with_details=True) - + # calculate reward np_entropies = entropies.data.cpu().numpy() # No gradients should be backpropagated to the @@ -320,40 +318,39 @@ class ENASTrainer(Trainer): np_entropies, hidden, valid_idx) - - + reward_history.extend(rewards) entropy_history.extend(np_entropies) - + # moving average baseline if baseline is None: baseline = rewards else: decay = 0.95 baseline = decay * baseline + (1 - decay) * rewards - + adv = rewards - baseline adv_history.extend(adv) - + # policy loss - loss = -log_probs*utils.get_variable(adv, - 'cuda' in self.device, - requires_grad=False) - + loss = -log_probs * utils.get_variable(adv, + 'cuda' in self.device, + requires_grad=False) + loss = loss.sum() # or loss.mean() - + # update self.controller_optim.zero_grad() loss.backward() - + self.controller_optim.step() - + total_loss += utils.to_item(loss.data) - + if ((step % 50) == 0) and (step > 0): reward_history, adv_history, entropy_history = [], [], [] total_loss = 0 - + self.controller_step += 1 # prev_valid_idx = valid_idx # valid_idx = ((valid_idx + self.max_length) % @@ -362,16 +359,16 @@ class ENASTrainer(Trainer): # # validation data, we reset the hidden states. # if prev_valid_idx > valid_idx: # hidden = self.shared.init_hidden(self.batch_size) - + def derive(self, sample_num=10, valid_idx=0): """We are always deriving based on the very first batch of validation data? This seems wrong... """ hidden = self.shared.init_hidden(self.batch_size) - + dags, _, entropies = self.controller.sample(sample_num, with_details=True) - + max_R = 0 best_dag = None for dag in dags: @@ -379,5 +376,5 @@ class ENASTrainer(Trainer): if R.max() > max_R: max_R = R.max() best_dag = dag - + self.model.setDAG(best_dag) diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py index aafcb3a7..68c170ed 100644 --- a/fastNLP/models/enas_utils.py +++ b/fastNLP/models/enas_utils.py @@ -1,12 +1,10 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch from __future__ import print_function - from collections import defaultdict import collections import numpy as np - import torch from torch.autograd import Variable diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 39f4c3fe..17f02298 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -1,11 +1,19 @@ +""" + 本模块实现了两种序列标注模型 +""" import torch +import torch.nn as nn from .base_model import BaseModel from ..modules import decoder, encoder from ..modules.decoder.CRF import allowed_transitions from ..core.utils import seq_len_to_mask from ..core.const import Const as C -from torch import nn + +__all__ = [ + "SeqLabeling", + "AdvSeqLabel" +] class SeqLabeling(BaseModel): diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 34b54302..606bcc42 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -8,6 +8,9 @@ from ..modules import encoder as Encoder from ..modules import aggregator as Aggregator from ..core.utils import seq_len_to_mask +__all__ = [ + "ESIM" +] my_inf = 10e12 @@ -26,7 +29,7 @@ class ESIM(BaseModel): :param int num_classes: 标签数目,默认为3 :param numpy.array init_embedding: 初始词嵌入矩阵,形状为(vocab_size, 
embed_dim),默认为None,即随机初始化词嵌入矩阵 """ - + def __init__(self, vocab_size, embed_dim, hidden_size, dropout=0.0, num_classes=3, init_embedding=None): super(ESIM, self).__init__() @@ -35,35 +38,36 @@ class ESIM(BaseModel): self.hidden_size = hidden_size self.dropout = dropout self.n_labels = num_classes - + self.drop = nn.Dropout(self.dropout) - + self.embedding = Encoder.Embedding( (self.vocab_size, self.embed_dim), dropout=self.dropout, ) - + self.embedding_layer = nn.Linear(self.embed_dim, self.hidden_size) - + self.encoder = Encoder.LSTM( input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=True, bidirectional=True ) - + self.bi_attention = Aggregator.BiAttention() self.mean_pooling = Aggregator.AvgPoolWithMask() self.max_pooling = Aggregator.MaxPoolWithMask() - + self.inference_layer = nn.Linear(self.hidden_size * 4, self.hidden_size) - + self.decoder = Encoder.LSTM( input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=True, bidirectional=True ) - + self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh', dropout=self.dropout) - + def forward(self, words1, words2, seq_len1=None, seq_len2=None, target=None): """ Forward function + :param torch.Tensor words1: [batch size(B), premise seq len(PL)] premise的token表示 :param torch.Tensor words2: [B, hypothesis seq len(HL)] hypothesis的token表示 :param torch.LongTensor seq_len1: [B] premise的长度 @@ -71,10 +75,10 @@ class ESIM(BaseModel): :param torch.LongTensor target: [B] 真实目标值 :return: dict prediction: [B, n_labels(N)] 预测结果 """ - + premise0 = self.embedding_layer(self.embedding(words1)) hypothesis0 = self.embedding_layer(self.embedding(words2)) - + if seq_len1 is not None: seq_len1 = seq_len_to_mask(seq_len1) else: @@ -85,55 +89,55 @@ class ESIM(BaseModel): else: seq_len2 = torch.ones(hypothesis0.size(0), hypothesis0.size(1)) seq_len2 = (seq_len2.long()).to(device=hypothesis0.device) - + _BP, _PSL, _HP = premise0.size() _BH, _HSL, _HH = hypothesis0.size() _BPL, _PLL = seq_len1.size() _HPL, _HLL = seq_len2.size() - + assert _BP == _BH and _BPL == _HPL and _BP == _BPL assert _HP == _HH assert _PSL == _PLL and _HSL == _HLL - + B, PL, H = premise0.size() B, HL, H = hypothesis0.size() - + a0 = self.encoder(self.drop(premise0)) # a0: [B, PL, H * 2] b0 = self.encoder(self.drop(hypothesis0)) # b0: [B, HL, H * 2] - + a = torch.mean(a0.view(B, PL, -1, H), dim=2) # a: [B, PL, H] b = torch.mean(b0.view(B, HL, -1, H), dim=2) # b: [B, HL, H] - + ai, bi = self.bi_attention(a, b, seq_len1, seq_len2) - + ma = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 4 * H] mb = torch.cat((b, bi, b - bi, b * bi), dim=2) # mb: [B, HL, 4 * H] - + f_ma = self.inference_layer(ma) f_mb = self.inference_layer(mb) - + vat = self.decoder(self.drop(f_ma)) vbt = self.decoder(self.drop(f_mb)) - + va = torch.mean(vat.view(B, PL, -1, H), dim=2) # va: [B, PL, H] vb = torch.mean(vbt.view(B, HL, -1, H), dim=2) # vb: [B, HL, H] - + va_ave = self.mean_pooling(va, seq_len1, dim=1) # va_ave: [B, H] va_max, va_arg_max = self.max_pooling(va, seq_len1, dim=1) # va_max: [B, H] vb_ave = self.mean_pooling(vb, seq_len2, dim=1) # vb_ave: [B, H] vb_max, vb_arg_max = self.max_pooling(vb, seq_len2, dim=1) # vb_max: [B, H] - + v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1) # v: [B, 4 * H] - + prediction = torch.tanh(self.output(v)) # prediction: [B, N] - + if target is not None: func = nn.CrossEntropyLoss() loss = func(prediction, target) return {Const.OUTPUT: prediction, 
Const.LOSS: loss} - + return {Const.OUTPUT: prediction} - + def predict(self, words1, words2, seq_len1=None, seq_len2=None, target=None): """ Predict function @@ -146,4 +150,3 @@ class ESIM(BaseModel): """ prediction = self.forward(words1, words2, seq_len1, seq_len2)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(prediction, dim=-1)} - diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index cdd1f321..2e55f7e4 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -1,17 +1,25 @@ -"""Star-Transformer 的 一个 Pytorch 实现. """ +Star-Transformer 的 Pytorch 实现。 +""" +import torch +from torch import nn + from ..modules.encoder.star_transformer import StarTransformer from ..core.utils import seq_len_to_mask from ..modules.utils import get_embeddings from ..core.const import Const -import torch -from torch import nn +__all__ = [ + "StarTransEnc", + "STNLICls", + "STSeqCls", + "STSeqLabel", +] class StarTransEnc(nn.Module): """ - 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.start_transformer.StarTransEnc` + 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.star_transformer.StarTransEnc` 带word embedding的Star-Transformer Encoder @@ -28,6 +36,7 @@ class StarTransEnc(nn.Module): :param emb_dropout: 词嵌入的dropout概率. :param dropout: 模型除词嵌入外的dropout概率. """ + def __init__(self, init_embed, hidden_size, num_layers, @@ -47,7 +56,7 @@ class StarTransEnc(nn.Module): head_dim=head_dim, dropout=dropout, max_len=max_len) - + def forward(self, x, mask): """ :param FloatTensor data: [batch, length, hidden] 输入的序列 @@ -72,7 +81,7 @@ class _Cls(nn.Module): nn.Dropout(dropout), nn.Linear(hid_dim, num_cls), ) - + def forward(self, x): h = self.fc(x) return h @@ -83,20 +92,21 @@ class _NLICls(nn.Module): super(_NLICls, self).__init__() self.fc = nn.Sequential( nn.Dropout(dropout), - nn.Linear(in_dim*4, hid_dim), #4 + nn.Linear(in_dim * 4, hid_dim), # 4 nn.LeakyReLU(), nn.Dropout(dropout), nn.Linear(hid_dim, num_cls), ) - + def forward(self, x1, x2): - x = torch.cat([x1, x2, torch.abs(x1-x2), x1*x2], 1) + x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1) h = self.fc(x) return h + class STSeqLabel(nn.Module): """ - 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.start_transformer.STSeqLabel` + 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.star_transformer.STSeqLabel` 用于序列标注的Star-Transformer模型 @@ -112,6 +122,7 @@ class STSeqLabel(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1 """ + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -120,7 +131,7 @@ class STSeqLabel(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STSeqLabel, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -131,7 +142,7 @@ class STSeqLabel(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words, seq_len): """ @@ -142,9 +153,9 @@ class STSeqLabel(nn.Module): mask = seq_len_to_mask(seq_len) nodes, _ = self.enc(words, mask) output = self.cls(nodes) - output = output.transpose(1,2) # make hidden to be dim 1 - return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] - + output = output.transpose(1, 2) # make hidden to be dim 1 + return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] + def predict(self, words, seq_len): """ @@ -159,7 +170,7 @@ class STSeqLabel(nn.Module): class STSeqCls(nn.Module): """ - 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.start_transformer.STSeqCls` + 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.star_transformer.STSeqCls` 用于分类任务的Star-Transformer @@ -175,7 +186,7 @@ class STSeqCls(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -184,7 +195,7 @@ class STSeqCls(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STSeqCls, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -195,7 +206,7 @@ class STSeqCls(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words, seq_len): """ @@ -206,9 +217,9 @@ class STSeqCls(nn.Module): mask = seq_len_to_mask(seq_len) nodes, relay = self.enc(words, mask) y = 0.5 * (relay + nodes.max(1)[0]) - output = self.cls(y) # [bsz, n_cls] + output = self.cls(y) # [bsz, n_cls] return {Const.OUTPUT: output} - + def predict(self, words, seq_len): """ @@ -223,7 +234,7 @@ class STSeqCls(nn.Module): class STNLICls(nn.Module): """ - 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.start_transformer.STNLICls` + 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.star_transformer.STNLICls` 用于自然语言推断(NLI)的Star-Transformer @@ -239,7 +250,7 @@ class STNLICls(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1 """ - + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -248,7 +259,7 @@ class STNLICls(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STNLICls, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -259,7 +270,7 @@ class STNLICls(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words1, words2, seq_len1, seq_len2): """ @@ -271,14 +282,16 @@ class STNLICls(nn.Module): """ mask1 = seq_len_to_mask(seq_len1) mask2 = seq_len_to_mask(seq_len2) + def enc(seq, mask): nodes, relay = self.enc(seq, mask) return 0.5 * (relay + nodes.max(1)[0]) + y1 = enc(words1, mask1) y2 = enc(words2, mask2) - output = self.cls(y1, y2) # [bsz, n_cls] + output = self.cls(y1, y2) # [bsz, n_cls] return {Const.OUTPUT: output} - + def predict(self, words1, words2, seq_len1, seq_len2): """
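
The star_transformer.py hunks above only reorder imports, add an __all__ list, and normalize whitespace; the public model API is unchanged. A minimal usage sketch for STSeqCls, based solely on the constructor defaults and docstrings shown in the diff, is below. The (vocab_size, embed_dim) tuple passed as init_embed, the vocabulary size, the batch shapes, and the top-level fastNLP.models import path (per the alias noted in the docstring) are illustrative assumptions, not part of this patch.

    import torch
    from fastNLP.models import STSeqCls          # alias documented as fastNLP.models.STSeqCls
    from fastNLP.core.const import Const

    # Hypothetical sizes: vocabulary of 1000 tokens, 300-d embeddings
    # (StarTransEnc projects embeddings to hidden_size internally), 5 classes.
    model = STSeqCls(init_embed=(1000, 300), num_cls=5)

    words = torch.randint(0, 1000, (2, 7))       # [batch=2, seq_len=7] token ids
    seq_len = torch.tensor([7, 5])               # true lengths, used to build the mask

    out = model(words, seq_len)[Const.OUTPUT]    # [2, 5] unnormalized class scores
    pred = out.argmax(dim=-1)                    # predicted label index per sample

The same pattern applies to STSeqLabel (whose output is transposed to [bsz, n_cls, seq_len]) and to STNLICls, which takes two word/length pairs as shown in its forward signature.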