In [1]:
import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

# Build a small vocabulary from the tokens of the demo sentence.
demo_tokens = "this is a demo .".split()
vocab = Vocabulary()
vocab.add_word_lst(demo_tokens)

# Create a StaticEmbedding for this vocabulary from the 'en-glove-6b-50d' resource.
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')

# Convert the text to vocabulary indices, wrapped as a batch holding one sentence.
token_ids = [vocab.to_index(token) for token in demo_tokens]
words = torch.LongTensor([token_ids])
# StaticEmbedding is used in much the same way as PyTorch's nn.Embedding.
print(embed(words).shape)

Found 5 out of 7 words in the pre-training embedding.
torch.Size([1, 5, 50])


In [2]:
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

# Same demo sentence, this time without naming any pre-trained resource.
sentence = "this is a demo ."
vocab = Vocabulary()
vocab.add_word_lst(sentence.split())

# model_dir_or_name=None combined with embedding_dim=30 yields 30-dimensional
# vectors (the recorded output has last dimension 30).
embed = StaticEmbedding(vocab, embedding_dim=30, model_dir_or_name=None)

# Map every token to its index and wrap the result as a batch of one sentence.
words = torch.LongTensor([list(map(vocab.to_index, sentence.split()))])
print(embed(words).shape)

torch.Size([1, 5, 30])


In [3]:
from fastNLP.embeddings import ElmoEmbedding
from fastNLP import Vocabulary

# Vocabulary built from the tokens of the demo sentence.
tokens = "this is a demo .".split()
vocab = Vocabulary()
vocab.add_word_lst(tokens)

# ELMo 'en-small' with requires_grad=False (presumably the ELMo weights are not
# updated during training -- confirm in the fastNLP docs).
embed = ElmoEmbedding(vocab, requires_grad=False, model_dir_or_name='en-small')

# Batch of one sentence, expressed as vocabulary indices.
words = torch.LongTensor([[vocab.to_index(token) for token in tokens]])
print(embed(words).shape)

22 out of 22 characters were found in pretrained elmo embedding.
torch.Size([1, 5, 256])


In [4]:
# Reuses `vocab` and `words` defined in an earlier cell.
# layers='1,2' selects two ELMo layers; the recorded last dimension (512) is
# twice the 256 obtained with the default setting, which suggests the selected
# layers are concatenated -- confirm in the fastNLP docs.
embed = ElmoEmbedding(
    vocab,
    model_dir_or_name='en-small',
    layers='1,2',
    requires_grad=False,
)
print(embed(words).shape)

22 out of 22 characters were found in pretrained elmo embedding.
torch.Size([1, 5, 512])


In [5]:
# Reuses `vocab` and `words` defined in an earlier cell.
# layers='mix': the outputs of the three layers are added together
# element-wise according to weights.
embed = ElmoEmbedding(
    vocab,
    model_dir_or_name='en-small',
    layers='mix',
    requires_grad=True,
)
print(embed(words).shape)

22 out of 22 characters were found in pretrained elmo embedding.
torch.Size([1, 5, 256])


In [6]:
from fastNLP.embeddings import BertEmbedding
from fastNLP import Vocabulary

# Vocabulary built from the tokens of the demo sentence.
tokens = "this is a demo .".split()
vocab = Vocabulary()
vocab.add_word_lst(tokens)

# BERT 'en-base-cased' with default settings; the recorded output has last
# dimension 768.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')

# Batch of one sentence, expressed as vocabulary indices.
token_ids = [vocab.to_index(token) for token in tokens]
words = torch.LongTensor([token_ids])
print(embed(words).shape)

loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 7 words out of 7.
torch.Size([1, 5, 768])


In [7]:
# Use the outputs of the last two layers (reuses `vocab` and `words` from an
# earlier cell).
embed = BertEmbedding(
    vocab,
    model_dir_or_name='en-base-cased',
    layers='10,11',
)
# The selected layers are concatenated along the last dimension.
print(embed(words).shape)

loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 7 words out of 7.
torch.Size([1, 5, 1536])


In [8]:
# Reuses `vocab` and `words` defined in an earlier cell.
# include_cls_sep=True: the output carries two extra positions along the
# sequence dimension (7 instead of 5 in the recorded output).
embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)

# Run the forward pass once and reuse the result, so the printed size and the
# CLS slice below come from the same output and BERT is not evaluated twice.
outputs = embed(words)
print(outputs.size())  # the sequence dimension grows by 2
# Extract the sentence's cls representation (position 0 along the sequence).
cls_reps = outputs[:, 0]  # shape: [batch_size, 768]

loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 7 words out of 7.
torch.Size([1, 7, 768])


In [9]:
# Reuses `vocab` and `words` defined in an earlier cell.
# pool_method='max' -- presumably selects how the word-piece vectors of one word
# are pooled into a single word vector; confirm in the fastNLP docs.
embed = BertEmbedding(
    vocab,
    model_dir_or_name='en-base-cased',
    pool_method='max',
    layers='-1',
)
print(embed(words).shape)

loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 7 words out of 7.
torch.Size([1, 5, 768])


In [10]:
# Two sentences joined by a literal [SEP] token and fed as one input sequence.
tokens = "this is a demo . [SEP] another sentence .".split()
vocab = Vocabulary()
vocab.add_word_lst(tokens)

embed = BertEmbedding(
    vocab,
    model_dir_or_name='en-base-cased',
    pool_method='max',
    layers='-1',
)

# Batch of one sequence; the recorded output keeps all 9 tokens, [SEP] included.
words = torch.LongTensor([[vocab.to_index(token) for token in tokens]])
print(embed(words).shape)

loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 10 words out of 10.
torch.Size([1, 9, 768])


In [11]:
from fastNLP.embeddings import CNNCharEmbedding
from fastNLP import Vocabulary

# Vocabulary built from the tokens of the demo sentence.
tokens = "this is a demo .".split()
vocab = Vocabulary()
vocab.add_word_lst(tokens)

# The character embedding dimension is 50; the returned embedding has dimension 64.
embed = CNNCharEmbedding(vocab, char_emb_size=50, embed_size=64)

# Batch of one sentence, expressed as vocabulary indices.
token_ids = [vocab.to_index(token) for token in tokens]
words = torch.LongTensor([token_ids])
print(embed(words).shape)

Start constructing character vocabulary.
In total, there are 8 distinct characters.
torch.Size([1, 5, 64])


In [12]:
from fastNLP.embeddings import LSTMCharEmbedding
from fastNLP import Vocabulary

# Vocabulary built from the tokens of the demo sentence.
tokens = "this is a demo .".split()
vocab = Vocabulary()
vocab.add_word_lst(tokens)

# The character embedding dimension is 50; the returned embedding has dimension 64.
embed = LSTMCharEmbedding(vocab, char_emb_size=50, embed_size=64)

# Batch of one sentence, expressed as vocabulary indices.
token_ids = [vocab.to_index(token) for token in tokens]
words = torch.LongTensor([token_ids])
print(embed(words).shape)

Start constructing character vocabulary.
In total, there are 8 distinct characters.
torch.Size([1, 5, 64])


In [13]:
# Import every class this cell uses by name, so the cell does not rely on a
# wildcard import or on `Vocabulary` having been imported by an earlier cell.
from fastNLP import Vocabulary
from fastNLP.embeddings import (
    BertEmbedding,
    CNNCharEmbedding,
    ElmoEmbedding,
    StackEmbedding,
    StaticEmbedding,
)

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# Every embedding exposes its output size through `embedding_dim`; the trailing
# comments give the values seen in the recorded output.
static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
print(static_embed.embedding_dim)  # 50
char_embed = CNNCharEmbedding(vocab, embed_size=30)
print(char_embed.embedding_dim)    # 30
elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')
print(elmo_embed_1.embedding_dim)  # 256
elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')
print(elmo_embed_2.embedding_dim)  # 512
bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')
print(bert_embed_1.embedding_dim)  # 768
bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')
print(bert_embed_2.embedding_dim)  # 1536
# Stacking the 50-dim static and 30-dim character embeddings reports 80.
stack_embed = StackEmbedding([static_embed, char_embed])
print(stack_embed.embedding_dim)  # 80

Found 5 out of 7 words in the pre-training embedding.
50
Start constructing character vocabulary.
In total, there are 8 distinct characters.
30
22 out of 22 characters were found in pretrained elmo embedding.
256
22 out of 22 characters were found in pretrained elmo embedding.
512
loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 7 words out of 7.
768
loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 7 words out of 7.
1536
80


In [14]:
# Import the names this cell uses explicitly, so the cell does not rely on a
# wildcard import or on `Vocabulary` having been imported by an earlier cell.
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# Set as trainable (weights are updated) at initialization.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True)
# Switch the BertEmbedding weights to not being updated.
embed.requires_grad = False

loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt
Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.
Start to generate word pieces for word.
Found(Or segment into word pieces) 7 words out of 7.


In [15]:
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("The the a A".split())
# The demo below uses a randomly initialized StaticEmbedding, but the behavior
# is the same as when pre-trained word vectors are used.
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
# In the recorded output 'The' and 'the' print different vectors.
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))

tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],
       grad_fn=<EmbeddingBackward>)
tensor([[ 0.0926, -0.4812, -0.7744,  0.4836, -0.5475]],
       grad_fn=<EmbeddingBackward>)


In [16]:
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("The the a A".split())
# The demo below uses a randomly initialized StaticEmbedding, but the behavior
# is the same as when pre-trained vectors are used.
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
# With lower=True, 'The' and 'the' print the same vector in the recorded output.
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))

All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.
tensor([[ 0.4530, -0.1558, -0.1941,  0.3203,  0.0355]],
       grad_fn=<EmbeddingBackward>)
tensor([[ 0.4530, -0.1558, -0.1941,  0.3203,  0.0355]],
       grad_fn=<EmbeddingBackward>)


In [17]:
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a".split())
# The demo below uses a randomly initialized StaticEmbedding, but the behavior
# is the same as when pre-trained vectors are used.
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)
# With min_freq=2, 'a' (seen once) prints the same vector as the unknown index
# in the recorded output, while 'the' (seen three times) keeps its own vector.
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

1 out of 4 words have frequency less than 2.
tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],
       grad_fn=<EmbeddingBackward>)
tensor([[ 0.7638, -0.0552,  0.1625, -0.2210,  0.4993]],
       grad_fn=<EmbeddingBackward>)
tensor([[ 0.7638, -0.0552,  0.1625, -0.2210,  0.4993]],
       grad_fn=<EmbeddingBackward>)


In [18]:
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a A".split())
# The demo below uses a randomly initialized StaticEmbedding, but the behavior
# is the same as when pre-trained vectors are used.
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)
# In the recorded output 'a' and 'A' share one vector that differs from the
# unknown-index vector -- presumably their counts are combined when lower=True,
# so neither falls below min_freq; confirm in the fastNLP docs.
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))
print(embed(torch.LongTensor([vocab.to_index('A')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

0 out of 5 words have frequency less than 2.
All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.
tensor([[ 0.1943,  0.3739,  0.2769, -0.4746, -0.3181]],
       grad_fn=<EmbeddingBackward>)
tensor([[ 0.5892, -0.6916,  0.7319, -0.3803,  0.4979]],
       grad_fn=<EmbeddingBackward>)
tensor([[ 0.5892, -0.6916,  0.7319, -0.3803,  0.4979]],
       grad_fn=<EmbeddingBackward>)
tensor([[-0.1348, -0.2172, -0.0071,  0.5704, -0.2607]],
       grad_fn=<EmbeddingBackward>)
