diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 7ac6ed65..633a748f 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -231,22 +231,29 @@ class Vocabulary(object): vocab.from_dataset(train_data1, train_data2, field_name='words') :param DataSet datasets: 需要转index的 DataSet, 支持一个或多个. - :param str field_name: 构建词典所使用的 field. - 若有多个 DataSet, 每个DataSet都必须有此 field. - 目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))`` + :param field_name: 可为 ``str`` 或 ``list(str)`` . + 构建词典所使用的 field(s), 支持一个或多个field + 若有多个 DataSet, 每个DataSet都必须有这些field. + 目前仅支持的field结构: ``str`` , ``list(str)`` , ``list(list(str))`` :return self: """ + if isinstance(field_name, str): + field_name = [field_name] + elif not isinstance(field_name, list): + raise TypeError('invalid argument field_name: {}'.format(field_name)) + def construct_vocab(ins): - field = ins[field_name] - if isinstance(field, str): - self.add_word(field) - elif isinstance(field, list): - if not isinstance(field[0], list): - self.add_word_lst(field) - else: - if isinstance(field[0][0], list): - raise RuntimeError("Only support field with 2 dimensions.") - [self.add_word_lst(w) for w in field] + for fn in field_name: + field = ins[fn] + if isinstance(field, str): + self.add_word(field) + elif isinstance(field, list): + if not isinstance(field[0], list): + self.add_word_lst(field) + else: + if isinstance(field[0][0], list): + raise RuntimeError("Only support field with 2 dimensions.") + [self.add_word_lst(w) for w in field] for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): try: diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index a3b18aa5..e8ccca30 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -1 +1,37 @@ +""" +用于IO的模块, 具体包括: + +1. 用于读入 embedding 的 :ref:`EmbedLoader ` 类, + +2. 用于读入数据的 :ref:`DataSetLoader ` 类 + +3. 用于读写config文件的类, 参考 :ref:`Config-io ` + +4. 用于保存和载入模型的类, 参考 :ref:`Model-io ` + +这些类的使用方法可以在对应module的文档下查看. +""" from .embed_loader import EmbedLoader +from .dataset_loader import * +from .config_io import * +from .model_io import * + +__all__ = [ + 'EmbedLoader', + + 'DataSetLoader', + 'CSVLoader', + 'JsonLoader', + 'ConllLoader', + 'SNLILoader', + 'SSTLoader', + 'PeopleDailyCorpusLoader', + 'Conll2003Loader', + + 'ConfigLoader', + 'ConfigSection', + 'ConfigSaver', + + 'ModelLoader', + 'ModelSaver', +] \ No newline at end of file diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index c0ffe53e..f303f0e9 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -1,3 +1,8 @@ +""" +.. _config-io: + +用于读入和处理和保存 config 文件 +""" import configparser import json import os diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 039c4242..bb5e2f64 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -1,3 +1,18 @@ +""" +.. _dataset-loader: + +DataSetLoader 的 API, 用于读取不同格式的数据, 并返回 `DataSet` , +得到的 `DataSet` 对象可以直接传入 `Trainer`, `Tester`, 用于模型的训练和测试 + +Example:: + + loader = SNLILoader() + train_ds = loader.load('path/to/train') + dev_ds = loader.load('path/to/dev') + test_ds = loader.load('path/to/test') + + # ... 
do stuff +""" import os import json from nltk.tree import Tree @@ -55,8 +70,9 @@ def _uncompress(src, dst): class DataSetLoader: - """所有`DataSetLoader`的接口 + """ + 所有`DataSetLoader`的接口 """ def load(self, path): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 31e590da..39d93fab 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,3 +1,8 @@ +""" +.. _embed-loader: + +用于读取预训练的embedding, 读取结果可直接载入为模型参数 +""" import os import numpy as np diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index 53bdc7ce..d28034c8 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -1,3 +1,8 @@ +""" +.. _model-io: + +用于载入和保存模型 +""" import torch from fastNLP.io.base_loader import BaseLoader diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 9a070c92..f2329dca 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -1,3 +1,5 @@ +"""Biaffine Dependency Parser 的 Pytorch 实现. +""" from collections import defaultdict import numpy as np @@ -14,7 +16,7 @@ from fastNLP.modules.encoder.transformer import TransformerEncoder from fastNLP.modules.encoder.variational_rnn import VarLSTM from fastNLP.modules.utils import initial_parameter from fastNLP.modules.utils import seq_mask - +from fastNLP.modules.utils import get_embeddings def _mst(scores): """ @@ -228,8 +230,9 @@ class BiaffineParser(GraphParser): 论文参考 ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) `_ . - :param word_vocab_size: 单词词典大小 - :param word_emb_dim: 单词词嵌入向量的维度 + :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, + 此时就以传入的对象作为embedding :param pos_vocab_size: part-of-speech 词典大小 :param pos_emb_dim: part-of-speech 向量维度 :param num_label: 边的类别个数 @@ -243,8 +246,7 @@ class BiaffineParser(GraphParser): 若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False`` """ def __init__(self, - word_vocab_size, - word_emb_dim, + init_embed, pos_vocab_size, pos_emb_dim, num_label, @@ -258,7 +260,8 @@ class BiaffineParser(GraphParser): super(BiaffineParser, self).__init__() rnn_out_size = 2 * rnn_hidden_size word_hid_dim = pos_hid_dim = rnn_hidden_size - self.word_embedding = nn.Embedding(num_embeddings=word_vocab_size, embedding_dim=word_emb_dim) + self.word_embedding = get_embeddings(init_embed) + word_emb_dim = self.word_embedding.embedding_dim self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim) self.word_fc = nn.Linear(word_emb_dim, word_hid_dim) self.pos_fc = nn.Linear(pos_emb_dim, pos_hid_dim) diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 37551e14..86848d0c 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -14,8 +14,7 @@ class CNNText(torch.nn.Module): 'Yoon Kim. 2014. Convolution Neural Networks for Sentence Classification.' 
""" - def __init__(self, vocab_size, - embed_dim, + def __init__(self, init_embed, num_classes, kernel_nums=(3, 4, 5), kernel_sizes=(3, 4, 5), @@ -23,8 +22,8 @@ class CNNText(torch.nn.Module): dropout=0.5): """ - :param int vocab_size: 词表的大小 - :param int embed_dim: 词embedding的维度大小 + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding :param int num_classes: 一共有多少类 :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 @@ -34,9 +33,9 @@ class CNNText(torch.nn.Module): super(CNNText, self).__init__() # no support for pre-trained embedding currently - self.embed = encoder.Embedding(vocab_size, embed_dim) + self.embed = encoder.Embedding(init_embed) self.conv_pool = encoder.ConvMaxpool( - in_channels=embed_dim, + in_channels=self.embed.embedding_dim, out_channels=kernel_nums, kernel_sizes=kernel_sizes, padding=padding) diff --git a/fastNLP/models/sequence_modeling.py b/fastNLP/models/sequence_modeling.py index bd04a803..b9b0677d 100644 --- a/fastNLP/models/sequence_modeling.py +++ b/fastNLP/models/sequence_modeling.py @@ -11,19 +11,19 @@ class SeqLabeling(BaseModel): 一个基础的Sequence labeling的模型 """ - def __init__(self, vocab_size, embed_dim, hidden_size, num_classes): + def __init__(self, init_embed, hidden_size, num_classes): """ 用于做sequence labeling的基础类。结构包含一层Embedding,一层LSTM(单向,一层),一层FC,以及一层CRF。 - :param int vocab_size: 词表大小。 - :param int embed_dim: embedding的维度 + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding :param int hidden_size: LSTM隐藏层的大小 :param int num_classes: 一共有多少类 """ super(SeqLabeling, self).__init__() - self.Embedding = encoder.embedding.Embedding(vocab_size, embed_dim) - self.Rnn = encoder.lstm.LSTM(embed_dim, hidden_size) + self.Embedding = encoder.embedding.Embedding(init_embed) + self.Rnn = encoder.lstm.LSTM(self.Embedding.embedding_dim, hidden_size) self.Linear = encoder.linear.Linear(hidden_size, num_classes) self.Crf = decoder.CRF.ConditionalRandomField(num_classes) self.mask = None @@ -103,24 +103,22 @@ class AdvSeqLabel: 更复杂的Sequence Labelling模型。结构为Embedding, LayerNorm, 双向LSTM(两层),FC,LayerNorm,DropOut,FC,CRF。 """ - def __init__(self, vocab_size, embed_dim, hidden_size, num_classes, dropout=0.3, embedding=None, - id2words=None, encoding_type='bmes'): + def __init__(self, init_embed, hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bmes'): """ - :param int vocab_size: 词表的大小 - :param int embed_dim: embedding的维度 + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding :param int hidden_size: LSTM的隐层大小 :param int num_classes: 有多少个类 :param float dropout: LSTM中以及DropOut层的drop概率 - :param numpy.ndarray embedding: 预训练的embedding,需要与指定的词表大小等一致 :param dict id2words: tag id转为其tag word的表。用于在CRF解码时防止解出非法的顺序,比如'BMES'这个标签规范中,'S' 不能出现在'B'之后。这里也支持类似与'B-NN',即'-'前为标签类型的指示,后面为具体的tag的情况。这里不但会保证 'B-NN'后面不为'S-NN'还会保证'B-NN'后面不会出现'M-xx'(任何非'M-NN'和'E-NN'的情况。) :param str encoding_type: 支持"BIO", "BMES", "BEMSO"。 """ - self.Embedding = encoder.embedding.Embedding(vocab_size, embed_dim, init_emb=embedding) - self.norm1 = torch.nn.LayerNorm(embed_dim) - self.Rnn = 
torch.nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, num_layers=2, dropout=dropout, + self.Embedding = encoder.embedding.Embedding(init_embed) + self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim) + self.Rnn = torch.nn.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2, dropout=dropout, bidirectional=True, batch_first=True) self.Linear1 = encoder.Linear(hidden_size * 2, hidden_size * 2 // 3) self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3) diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 7ead5c18..d4bf3d59 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -35,8 +35,7 @@ class ESIM(BaseModel): self.drop = nn.Dropout(self.dropout) self.embedding = Encoder.Embedding( - self.vocab_size, self.embed_dim, dropout=self.dropout, - init_emb=init_embedding, + (self.vocab_size, self.embed_dim), dropout=self.dropout, ) self.embedding_layer = Encoder.Linear(self.embed_dim, self.hidden_size) diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index 4f4ed551..e4fbeb28 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -1,5 +1,8 @@ +"""Star-Transformer 的 一个 Pytorch 实现. +""" from fastNLP.modules.encoder.star_transformer import StarTransformer from fastNLP.core.utils import seq_lens_to_masks +from ..modules.utils import get_embeddings import torch from torch import nn @@ -10,8 +13,9 @@ class StarTransEnc(nn.Module): """ 带word embedding的Star-Transformer Encoder - :param vocab_size: 词嵌入的词典大小 - :param emb_dim: 每个词嵌入的特征维度 + :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, + 此时就以传入的对象作为embedding :param num_cls: 输出类别个数 :param hidden_size: 模型中特征维度. :param num_layers: 模型层数. @@ -22,7 +26,7 @@ class StarTransEnc(nn.Module): :param emb_dropout: 词嵌入的dropout概率. :param dropout: 模型除词嵌入外的dropout概率. """ - def __init__(self, vocab_size, emb_dim, + def __init__(self, init_embed, hidden_size, num_layers, num_head, @@ -31,9 +35,10 @@ class StarTransEnc(nn.Module): emb_dropout, dropout): super(StarTransEnc, self).__init__() + self.embedding = get_embeddings(init_embed) + emb_dim = self.embedding.embedding_dim self.emb_fc = nn.Linear(emb_dim, hidden_size) self.emb_drop = nn.Dropout(emb_dropout) - self.embedding = nn.Embedding(vocab_size, emb_dim) self.encoder = StarTransformer(hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index c93fa1a3..098788a8 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -1,20 +1,30 @@ import torch.nn as nn +from fastNLP.modules.utils import get_embeddings +class Embedding(nn.Embedding): + """Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度""" -class Embedding(nn.Module): - """Embedding组件.""" - - def __init__(self, vocab_size, embed_dim, padding_idx=0, sparse=False, init_emb=None, dropout=0.0): + def __init__(self, init_embed, padding_idx=None, dropout=0.0, sparse=False, max_norm=None, norm_type=2, + scale_grad_by_freq=False): """ - :param int vocab_size: 词表大小. - :param int embed_dim: embedding维度. - :param int padding_idx: 如果碰到padding_idx则自动补0. - :param bool sparse: 如果为`True`则权重矩阵是一个sparse的矩阵. - :param torch.Tensor init_emb: 初始的embedding矩阵. - :param float dropout: dropout概率. 
+ + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding + :param None,int padding_idx: 该index的Embedding将一直为0. + :param float dropout: 对Embedding的输出的dropout。 + :param bool sparse: 如果为True,则对Embedding的梯度将是sparse的,参考Pytorch Embedding获取更多信息。 + :param None,float max_norm: 每个vector最大的norm能为多大 + :param int norm_type: norm的类型 + :param bool scale_grad_by_freq: 如果为True,将会把梯度除以这个词出现的次数. """ - super(Embedding, self).__init__() - self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx, sparse=sparse, _weight=init_emb) + embed = get_embeddings(init_embed) + num_embeddings, embedding_dim = embed.weight.size() + + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, + max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse, _weight=embed.weight.data) + del embed + self.dropout = nn.Dropout(dropout) def forward(self, x): @@ -22,5 +32,5 @@ class Embedding(nn.Module): :param torch.LongTensor x: [batch, seq_len] :return: torch.Tensor : [batch, seq_len, embed_dim] """ - x = self.embed(x) + x = super().forward(x) return self.dropout(x) diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 9ab8e273..cff39c84 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -1,3 +1,6 @@ +"""轻量封装的 Pytorch LSTM 模块. +可在 forward 时传入序列的长度, 自动对padding做合适的处理. +""" import torch import torch.nn as nn import torch.nn.utils.rnn as rnn @@ -35,8 +38,8 @@ class LSTM(nn.Module): :param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全1向量. Default: ``None`` :param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全1向量. Default: ``None`` :return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列 - :和 [batch, hidden_size*num_direction] 最后时刻隐状态. - :若 ``get_hidden=False`` 仅返回输出序列. + 和 [batch, hidden_size*num_direction] 最后时刻隐状态. + 若 ``get_hidden=False`` 仅返回输出序列. 
""" if h0 is not None and c0 is not None: hx = (h0, c0) diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py index 034cfa96..42662804 100644 --- a/fastNLP/modules/encoder/star_transformer.py +++ b/fastNLP/modules/encoder/star_transformer.py @@ -1,3 +1,5 @@ +"""Star-Transformer 的encoder部分的 Pytorch 实现 +""" import torch from torch import nn from torch.nn import functional as F diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index d63aa6e7..89ab44d9 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -1,3 +1,5 @@ +"""Variational RNN 的 Pytorch 实现 +""" import torch import torch.nn as nn from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence @@ -28,11 +30,11 @@ class VarRnnCellWrapper(nn.Module): """ :param PackedSequence input_x: [seq_len, batch_size, input_size] :param hidden: for LSTM, tuple of (h_0, c_0), [batch_size, hidden_size] - :for other RNN, h_0, [batch_size, hidden_size] + for other RNN, h_0, [batch_size, hidden_size] :param mask_x: [batch_size, input_size] dropout mask for input :param mask_h: [batch_size, hidden_size] dropout mask for hidden :return PackedSequence output: [seq_len, bacth_size, hidden_size] - :hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] + hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] for other RNN, h_n, [batch_size, hidden_size] """ def get_hi(hi, h0, size): @@ -95,7 +97,7 @@ class VarRNNBase(nn.Module): :param num_layers: rnn的层数. Default: 1 :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - :(batch, seq, feature). Default: ``False`` + (batch, seq, feature). Default: ``False`` :param input_dropout: 对输入的dropout概率. Default: 0 :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` @@ -138,7 +140,7 @@ class VarRNNBase(nn.Module): :param x: [batch, seq_len, input_size] 输入序列 :param hx: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全1向量. Default: ``None`` :return (output, ht): [batch, seq_len, hidden_size*num_direction] 输出序列 - :和 [batch, hidden_size*num_direction] 最后时刻隐状态 + 和 [batch, hidden_size*num_direction] 最后时刻隐状态 """ is_lstm = self.is_lstm is_packed = isinstance(x, PackedSequence) @@ -193,7 +195,6 @@ class VarRNNBase(nn.Module): return output, hidden - class VarLSTM(VarRNNBase): """Variational Dropout LSTM. @@ -202,7 +203,7 @@ class VarLSTM(VarRNNBase): :param num_layers: rnn的层数. Default: 1 :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - :(batch, seq, feature). Default: ``False`` + (batch, seq, feature). Default: ``False`` :param input_dropout: 对输入的dropout概率. Default: 0 :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False`` @@ -211,6 +212,9 @@ class VarLSTM(VarRNNBase): def __init__(self, *args, **kwargs): super(VarLSTM, self).__init__(mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) + def forward(self, x, hx=None): + return super(VarLSTM, self).forward(x, hx) + class VarRNN(VarRNNBase): """Variational Dropout RNN. @@ -220,7 +224,7 @@ class VarRNN(VarRNNBase): :param num_layers: rnn的层数. Default: 1 :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - :(batch, seq, feature). Default: ``False`` + (batch, seq, feature). 
Default: ``False`` :param input_dropout: 对输入的dropout概率. Default: 0 :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` @@ -229,6 +233,8 @@ class VarRNN(VarRNNBase): def __init__(self, *args, **kwargs): super(VarRNN, self).__init__(mode="RNN", Cell=nn.RNNCell, *args, **kwargs) + def forward(self, x, hx=None): + return super(VarRNN, self).forward(x, hx) class VarGRU(VarRNNBase): """Variational Dropout GRU. @@ -238,7 +244,7 @@ class VarGRU(VarRNNBase): :param num_layers: rnn的层数. Default: 1 :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - :(batch, seq, feature). Default: ``False`` + (batch, seq, feature). Default: ``False`` :param input_dropout: 对输入的dropout概率. Default: 0 :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False`` @@ -247,6 +253,9 @@ class VarGRU(VarRNNBase): def __init__(self, *args, **kwargs): super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs) + def forward(self, x, hx=None): + return super(VarGRU, self).forward(x, hx) + # if __name__ == '__main__': # x = torch.Tensor([[1,2,3], [4,5,0], [6,0,0]])[:,:,None] * 0.1 # mask = (x != 0).float().view(3, -1) diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 4ae15b18..c6d8be9d 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,3 +1,4 @@ +import numpy as np import torch import torch.nn as nn import torch.nn.init as init @@ -88,3 +89,25 @@ def seq_mask(seq_len, max_len): seq_len = seq_len.view(-1, 1).long() # [batch_size, 1] seq_range = torch.arange(start=0, end=max_len, dtype=torch.long, device=seq_len.device).view(1, -1) # [1, max_len] return torch.gt(seq_len, seq_range) # [batch_size, max_len] + + +def get_embeddings(init_embed): + """得到词嵌入 + + :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 
也可以传入 nn.Embedding 对象, + 此时就以传入的对象作为embedding + :return nn.Embedding embeddings: + """ + if isinstance(init_embed, tuple): + res = nn.Embedding(num_embeddings=init_embed[0], embedding_dim=init_embed[1]) + elif isinstance(init_embed, nn.Embedding): + res = init_embed + elif isinstance(init_embed, torch.Tensor): + res = nn.Embedding.from_pretrained(init_embed, freeze=False) + elif isinstance(init_embed, np.ndarray): + init_embed = torch.tensor(init_embed, dtype=torch.float32) + res = nn.Embedding.from_pretrained(init_embed, freeze=False) + else: + raise TypeError('invalid init_embed type: {}'.format((type(init_embed)))) + return res diff --git a/reproduction/LSTM+self_attention_sentiment_analysis/main.py b/reproduction/LSTM+self_attention_sentiment_analysis/main.py index ff2d7a67..4ca5388f 100644 --- a/reproduction/LSTM+self_attention_sentiment_analysis/main.py +++ b/reproduction/LSTM+self_attention_sentiment_analysis/main.py @@ -42,7 +42,7 @@ train_data, dev_data = preprocess.run(train_data, dev_data) class SELF_ATTENTION_YELP_CLASSIFICATION(BaseModel): def __init__(self, args=None): super(SELF_ATTENTION_YELP_CLASSIFICATION,self).__init__() - self.embedding = Embedding(len(word2index) ,embeding_size , init_emb= None ) + self.embedding = Embedding((len(word2index) ,embeding_size)) self.lstm = LSTM(input_size=embeding_size, hidden_size=lstm_hidden_size, bidirectional=True) self.attention = SelfAttention(lstm_hidden_size * 2 ,dim =attention_unit ,num_vec=attention_hops) self.mlp = MLP(size_layer=[lstm_hidden_size * 2*attention_hops ,nfc ,class_num ]) diff --git a/requirements.txt b/requirements.txt index 931ca285..d763ea1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy>=1.14.2 +numpy torch>=0.4.0 tensorboardX -tqdm>=4.28.1 -nltk>=3.4.1 \ No newline at end of file +tqdm +nltk \ No newline at end of file diff --git a/test/api/test_processor.py b/test/api/test_processor.py index d0c27c40..9611e458 100644 --- a/test/api/test_processor.py +++ b/test/api/test_processor.py @@ -59,7 +59,7 @@ class TestProcessor(unittest.TestCase): def test_ModelProcessor(self): from fastNLP.models.cnn_text_classification import CNNText - model = CNNText(100, 100, 5) + model = CNNText((100, 100), 5) ins_list = [] for _ in range(64): seq_len = np.random.randint(5, 30) diff --git a/test/test_tutorials.py b/test/test_tutorials.py index a1d47dde..8c0e37bf 100644 --- a/test/test_tutorials.py +++ b/test/test_tutorials.py @@ -70,7 +70,7 @@ class TestTutorial(unittest.TestCase): break from fastNLP.models import CNNText - model = CNNText(vocab_size=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1) + model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1) from fastNLP import Trainer from copy import deepcopy @@ -145,13 +145,15 @@ class TestTutorial(unittest.TestCase): is_input=True) from fastNLP.models import CNNText - model = CNNText(vocab_size=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1) + model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1) + + from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam - from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=CrossEntropyLoss(), + optimizer= Adam(), metrics=AccuracyMetric(target='label_seq') ) trainer.train() @@ -405,8 +407,7 @@ class TestTutorial(unittest.TestCase): # 另一个例子:加载CNN文本分类模型 from fastNLP.models import CNNText - cnn_text_model = CNNText(vocab_size=len(vocab), embed_dim=50, 
num_classes=5, padding=2, dropout=0.1) - cnn_text_model + cnn_text_model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1) from fastNLP import CrossEntropyLoss from fastNLP import Adam @@ -421,7 +422,6 @@ class TestTutorial(unittest.TestCase): print_every=-1, validate_every=-1, dev_data=dev_data, - use_cuda=False, optimizer=Adam(lr=1e-3, weight_decay=0), check_code_level=-1, metric_key='acc',
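
Usage sketch (editor's addition, not part of the patch): the changes above unify model construction around a single ``init_embed`` argument handled by ``fastNLP.modules.utils.get_embeddings``, and let ``Vocabulary.from_dataset`` accept several field names at once. The snippet below is a minimal sketch of that new interface, not a definitive recipe: it assumes fastNLP at this revision is importable and re-exports ``DataSet``/``Vocabulary`` at the top level as in the tutorials, and the field names, vocabulary/embedding sizes and the 5-class setup are illustrative placeholders rather than values taken from the patch. Example::

    import numpy as np
    import torch.nn as nn

    from fastNLP import DataSet, Vocabulary
    from fastNLP.models import CNNText
    from fastNLP.modules.utils import get_embeddings

    # Build one vocabulary from several fields (field_name may now be a list).
    ds = DataSet({'premise': [['a', 'b'], ['c', 'd']],
                  'hypothesis': [['b', 'e'], ['f']]})
    vocab = Vocabulary().from_dataset(ds, field_name=['premise', 'hypothesis'])

    # 1) A (num_embeddings, embedding_dim) tuple: a fresh nn.Embedding is allocated.
    model = CNNText((len(vocab), 50), num_classes=5)

    # 2) Pre-trained weights: get_embeddings wraps an ndarray (or FloatTensor)
    #    with nn.Embedding.from_pretrained(..., freeze=False).
    pretrained = np.random.rand(len(vocab), 50).astype('float32')
    model_pretrained = CNNText(get_embeddings(pretrained), num_classes=5)

    # 3) An existing nn.Embedding object: its weights are reused directly.
    model_custom = CNNText(nn.Embedding(len(vocab), 50), num_classes=5)

The same ``init_embed`` forms apply to ``SeqLabeling``, ``AdvSeqLabel``, ``BiaffineParser`` and ``StarTransEnc`` after this change.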