
Add word boundary markers to character embedding

tags/v0.4.10
yh 5 years ago
commit e6ce48a6d3
2 changed files with 28 additions and 9 deletions
  1. fastNLP/embeddings/char_embedding.py (+24, -8)
  2. fastNLP/embeddings/utils.py (+4, -1)
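
For context, a minimal usage sketch of the two new constructor arguments, based on the signatures added in the diff below; the toy vocabulary and the import paths are assumptions for illustration, not part of the commit.

from fastNLP import Vocabulary
from fastNLP.embeddings import CNNCharEmbedding

# Toy word vocabulary, illustrative only.
vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox".split())

# include_word_start_end=True wraps each word's characters with <bow>/<eow> markers;
# requires_grad=True keeps the character embedding weights trainable.
embed = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=50, min_char_freq=1,
                         requires_grad=True, include_word_start_end=True)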

fastNLP/embeddings/char_embedding.py (+24, -8)

@@ -44,7 +44,8 @@ class CNNCharEmbedding(TokenEmbedding):
def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1),
- pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None):
+ pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None,
+ requires_grad:bool=True, include_word_start_end:bool=True):
"""
:param vocab: the word vocabulary
@@ -60,6 +61,8 @@ class CNNCharEmbedding(TokenEmbedding):
:param pre_train_char_embed: a pre-trained character embedding can be loaded in two ways: the first is to pass an embedding folder
(the folder should contain exactly one file with a .txt suffix) or a file path; the second is to pass the name of an embedding, in which case
the cache is checked for that model and it is downloaded automatically if not found. If None, an embedding is randomly initialized with dimension embedding_dim.
+ :param requires_grad: whether to update the weights during training
+ :param include_word_start_end: whether to add a special marker symbol before each word's first character and after its last character;
"""
super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
@@ -86,16 +89,21 @@ class CNNCharEmbedding(TokenEmbedding):
logger.info("Start constructing character vocabulary.")
# build the character vocabulary
- self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
+ self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq,
+ include_word_start_end=include_word_start_end)
self.char_pad_index = self.char_vocab.padding_idx
logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.")
# index the words in vocab
max_word_len = max(map(lambda x: len(x[0]), vocab))
+ if include_word_start_end:
+ max_word_len += 2
self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), max_word_len),
fill_value=self.char_pad_index, dtype=torch.long))
self.register_buffer('word_lengths', torch.zeros(len(vocab)).long())
for word, index in vocab:
# if index!=vocab.padding_idx:  # if it is pad, it is already pad_value; changed to not special-case pad, so every <pad> shares the same embed
+ if include_word_start_end:
+ word = ['<bow>'] + list(word) + ['<eow>']
self.words_to_chars_embedding[index, :len(word)] = \
torch.LongTensor([self.char_vocab.to_index(c) for c in word])
self.word_lengths[index] = len(word)
@@ -110,6 +118,7 @@ class CNNCharEmbedding(TokenEmbedding):
for i in range(len(kernel_sizes))])
self._embed_size = embed_size
self.fc = nn.Linear(sum(filter_nums), embed_size)
+ self.requires_grad = requires_grad

def forward(self, words):
"""
@@ -164,8 +173,8 @@ class LSTMCharEmbedding(TokenEmbedding):
def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu',
- min_char_freq: int = 2,
- bidirectional=True, pre_train_char_embed: str = None):
+ min_char_freq: int = 2, bidirectional=True, pre_train_char_embed: str = None,
+ requires_grad:bool=True, include_word_start_end:bool=True):
"""
:param vocab: the word vocabulary
@@ -181,6 +190,8 @@ class LSTMCharEmbedding(TokenEmbedding):
:param pre_train_char_embed: a pre-trained character embedding can be loaded in two ways: the first is to pass an embedding folder
(the folder should contain exactly one file with a .txt suffix) or a file path; the second is to pass the name of an embedding, in which case
the cache is checked for that model and it is downloaded automatically if not found. If None, an embedding is randomly initialized with dimension embedding_dim.
+ :param requires_grad: whether to update the weights during training
+ :param include_word_start_end: whether to add a special marker symbol before each word's first character and after its last character;
"""
super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
@@ -206,20 +217,24 @@ class LSTMCharEmbedding(TokenEmbedding):
logger.info("Start constructing character vocabulary.")
# build the character vocabulary
- self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
+ self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq,
+ include_word_start_end=include_word_start_end)
self.char_pad_index = self.char_vocab.padding_idx
logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.")
# index the words in vocab
- self.max_word_len = max(map(lambda x: len(x[0]), vocab))
- self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), self.max_word_len),
+ max_word_len = max(map(lambda x: len(x[0]), vocab))
+ if include_word_start_end:
+ max_word_len += 2
+ self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), max_word_len),
fill_value=self.char_pad_index, dtype=torch.long))
self.register_buffer('word_lengths', torch.zeros(len(vocab)).long())
for word, index in vocab:
# if index!=vocab.padding_idx:  # if it is pad, it is already pad_value; changed to no longer distinguish pad
+ if include_word_start_end:
+ word = ['<bow>'] + list(word) + ['<eow>']
self.words_to_chars_embedding[index, :len(word)] = \
torch.LongTensor([self.char_vocab.to_index(c) for c in word])
self.word_lengths[index] = len(word)
# self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)
if pre_train_char_embed:
self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed)
else:
@@ -231,6 +246,7 @@ class LSTMCharEmbedding(TokenEmbedding):
self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True)
self._embed_size = embed_size
self.bidirectional = bidirectional
+ self.requires_grad = requires_grad
def forward(self, words):
"""


fastNLP/embeddings/utils.py (+4, -1)

@@ -13,18 +13,21 @@ __all__ = [
]


- def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1):
+ def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1, include_word_start_end=True):
"""
Build a character vocabulary from a given word vocabulary.

:param vocab: the word vocabulary to build the character vocabulary from
:param min_freq: minimum frequency for a character to be kept
+ :param include_word_start_end: whether to include the special <bow> and <eow> tokens
:return:
"""
char_vocab = Vocabulary(min_freq=min_freq)
for word, index in vocab:
if not vocab._is_word_no_create_entry(word):
char_vocab.add_word_lst(list(word))
+ if include_word_start_end:
+ char_vocab.add_word_lst(['<bow>', '<eow>'])
return char_vocab
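
A hedged illustration of the utils.py change: with include_word_start_end=True the returned character vocabulary also contains the <bow>/<eow> markers, so the character embedding gets dedicated rows for word boundaries. The toy word vocabulary below is an assumption for illustration.

from fastNLP import Vocabulary
from fastNLP.embeddings.utils import _construct_char_vocab_from_vocab

word_vocab = Vocabulary()
word_vocab.add_word_lst(["cat", "ox"])

char_vocab = _construct_char_vocab_from_vocab(word_vocab, min_freq=1,
                                              include_word_start_end=True)
# The boundary markers now have their own indices in the character vocabulary.
print(char_vocab.to_index('<bow>'), char_vocab.to_index('<eow>'))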



