diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py
index 72a33e97..0624d07f 100644
--- a/fastNLP/embeddings/char_embedding.py
+++ b/fastNLP/embeddings/char_embedding.py
@@ -44,7 +44,8 @@ class CNNCharEmbedding(TokenEmbedding):
 
     def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
                  dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1),
-                 pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None):
+                 pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None,
+                 requires_grad: bool = True, include_word_start_end: bool = True):
         """
 
         :param vocab: the word vocabulary
@@ -60,6 +61,8 @@ class CNNCharEmbedding(TokenEmbedding):
         :param pre_train_char_embed: a pre-trained character embedding can be loaded in two ways: either pass an embedding folder
             (which should contain exactly one file with a .txt suffix) or a file path; or pass the name of an embedding, in which case
             the cache is checked and the model is downloaded automatically if absent. If None, the embedding is randomly initialized with dimension embedding_dim.
+        :param requires_grad: whether to update the embedding weights during training
+        :param include_word_start_end: whether to add a special marker character before the first and after the last character of each word;
         """
         super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
@@ -86,16 +89,21 @@ class CNNCharEmbedding(TokenEmbedding):
 
         logger.info("Start constructing character vocabulary.")
         # build the character vocabulary
-        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
+        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq,
+                                                           include_word_start_end=include_word_start_end)
         self.char_pad_index = self.char_vocab.padding_idx
         logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.")
         # index every word in the vocab
         max_word_len = max(map(lambda x: len(x[0]), vocab))
+        if include_word_start_end:
+            max_word_len += 2
         self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), max_word_len),
                                                                     fill_value=self.char_pad_index, dtype=torch.long))
         self.register_buffer('word_lengths', torch.zeros(len(vocab)).long())
         for word, index in vocab:
             # if index!=vocab.padding_idx:  # a pad word is already filled with pad_value; pad is no longer special-cased, so every <pad> char shares the same embedding
+            if include_word_start_end:
+                word = ['<bow>'] + list(word) + ['<eow>']
             self.words_to_chars_embedding[index, :len(word)] = \
                 torch.LongTensor([self.char_vocab.to_index(c) for c in word])
             self.word_lengths[index] = len(word)
@@ -110,6 +118,7 @@ class CNNCharEmbedding(TokenEmbedding):
             for i in range(len(kernel_sizes))])
         self._embed_size = embed_size
         self.fc = nn.Linear(sum(filter_nums), embed_size)
+        self.requires_grad = requires_grad
     
     def forward(self, words):
         """
@@ -164,8 +173,8 @@ class LSTMCharEmbedding(TokenEmbedding):
     
     def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
                  dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu',
-                 min_char_freq: int = 2,
-                 bidirectional=True, pre_train_char_embed: str = None):
+                 min_char_freq: int = 2, bidirectional=True, pre_train_char_embed: str = None,
+                 requires_grad: bool = True, include_word_start_end: bool = True):
         """
 
         :param vocab: the word vocabulary
@@ -181,6 +190,8 @@ class LSTMCharEmbedding(TokenEmbedding):
         :param pre_train_char_embed: a pre-trained character embedding can be loaded in two ways: either pass an embedding folder
             (which should contain exactly one file with a .txt suffix) or a file path; or pass the name of an embedding, in which case
             the cache is checked and the model is downloaded automatically if absent. If None, the embedding is randomly initialized with dimension embedding_dim.
+        :param requires_grad: whether to update the embedding weights during training
+        :param include_word_start_end: whether to add a special marker character before the first and after the last character of each word;
         """
         super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
@@ -206,20 +217,24 @@ class LSTMCharEmbedding(TokenEmbedding):
 
         logger.info("Start constructing character vocabulary.")
         # build the character vocabulary
-        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
+        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq,
+                                                           include_word_start_end=include_word_start_end)
         self.char_pad_index = self.char_vocab.padding_idx
         logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.")
         # index every word in the vocab
-        self.max_word_len = max(map(lambda x: len(x[0]), vocab))
-        self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), self.max_word_len),
+        max_word_len = max(map(lambda x: len(x[0]), vocab))
+        if include_word_start_end:
+            max_word_len += 2
+        self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), max_word_len),
                                                                     fill_value=self.char_pad_index, dtype=torch.long))
         self.register_buffer('word_lengths', torch.zeros(len(vocab)).long())
         for word, index in vocab:
             # if index!=vocab.padding_idx:  # a pad word is already filled with pad_value; pad is no longer special-cased
+            if include_word_start_end:
+                word = ['<bow>'] + list(word) + ['<eow>']
             self.words_to_chars_embedding[index, :len(word)] = \
                 torch.LongTensor([self.char_vocab.to_index(c) for c in word])
             self.word_lengths[index] = len(word)
-        # self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)
         if pre_train_char_embed:
             self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed)
         else:
@@ -231,6 +246,7 @@ class LSTMCharEmbedding(TokenEmbedding):
         self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True)
         self._embed_size = embed_size
         self.bidirectional = bidirectional
+        self.requires_grad = requires_grad
     
     def forward(self, words):
         """
diff --git a/fastNLP/embeddings/utils.py b/fastNLP/embeddings/utils.py
index 844a0c93..942f8b02 100644
--- a/fastNLP/embeddings/utils.py
+++ b/fastNLP/embeddings/utils.py
@@ -13,18 +13,21 @@ __all__ = [
 ]
 
 
-def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1):
+def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1, include_word_start_end=True):
     """
     Given a word vocabulary, build the corresponding character vocabulary.
 
     :param vocab: the source word vocabulary
     :param min_freq:
+    :param include_word_start_end: whether to include the special <bow> and <eow> markers
     :return:
     """
     char_vocab = Vocabulary(min_freq=min_freq)
     for word, index in vocab:
         if not vocab._is_word_no_create_entry(word):
             char_vocab.add_word_lst(list(word))
+    if include_word_start_end:
+        char_vocab.add_word_lst(['<bow>', '<eow>'])
     return char_vocab
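Usage note: a minimal sketch of how the two new parameters interact, assuming the `CNNCharEmbedding` constructor exactly as patched above and the `TokenEmbedding.requires_grad` setter behaving as in fastNLP 0.5 (the toy vocabulary and sentence are invented for illustration). With `include_word_start_end=True`, each word is wrapped in `<bow>`/`<eow>` markers before indexing, so the per-word character buffer grows by two; `requires_grad=False` freezes all parameters at construction time.

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import CNNCharEmbedding

# Toy word vocabulary, for illustration only.
vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox".split())

# include_word_start_end=True wraps each word in <bow>/<eow> characters,
# so max_word_len is increased by 2 inside the constructor.
# requires_grad=False freezes every parameter of the embedding.
embed = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=30,
                         requires_grad=False, include_word_start_end=True)

words = torch.LongTensor([[vocab.to_index(w) for w in ["the", "quick", "fox"]]])
out = embed(words)
print(out.shape)  # torch.Size([1, 3, 50]) -- batch x seq_len x embed_size
print(all(not p.requires_grad for p in embed.parameters()))  # True
```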
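To make the utils.py change concrete, here is a simplified standalone re-implementation of the construction logic. This is a sketch, not the library function: the real helper builds a fastNLP `Vocabulary`, skips no-create-entry words, and its `min_freq` interaction goes through that class rather than a plain filter.

```python
from collections import Counter

def build_char_vocab(words, min_freq=1, include_word_start_end=True):
    """Simplified sketch of _construct_char_vocab_from_vocab's logic."""
    # Count every character across the word list.
    counts = Counter(ch for word in words for ch in word)
    chars = [ch for ch, n in counts.items() if n >= min_freq]
    if include_word_start_end:
        # In this sketch the boundary markers are appended after the
        # frequency filter, mirroring the unconditional add_word_lst call.
        chars += ['<bow>', '<eow>']
    return chars

print(build_char_vocab(["cat", "cart"]))
# ['c', 'a', 't', 'r', '<bow>', '<eow>']
```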