@@ -428,16 +428,16 @@ def _bioes_tag_to_spans(tags, ignore_labels=None):
     prev_bioes_tag = None
     for idx, tag in enumerate(tags):
         tag = tag.lower()
-        bieso_tag, label = tag[:1], tag[2:]
-        if bieso_tag in ('b', 's'):
+        bioes_tag, label = tag[:1], tag[2:]
+        if bioes_tag in ('b', 's'):
             spans.append((label, [idx, idx]))
-        elif bieso_tag in ('i', 'e') and prev_bioes_tag in ('b', 'i') and label == spans[-1][0]:
+        elif bioes_tag in ('i', 'e') and prev_bioes_tag in ('b', 'i') and label == spans[-1][0]:
             spans[-1][1][1] = idx
-        elif bieso_tag == 'o':
+        elif bioes_tag == 'o':
             pass
         else:
             spans.append((label, [idx, idx]))
-        prev_bioes_tag = bieso_tag
+        prev_bioes_tag = bioes_tag
     return [(span[0], (span[1][0], span[1][1] + 1))
             for span in spans
             if span[0] not in ignore_labels
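For reference, a minimal sketch of how the corrected helper behaves. The import path is an assumption (the function is a private helper of fastNLP's metrics module); the tag sequence is illustrative:

    from fastNLP.core.metrics import _bioes_tag_to_spans  # assumed location

    tags = ['B-PER', 'E-PER', 'O', 'S-LOC']
    print(_bioes_tag_to_spans(tags))
    # expected: [('per', (0, 2)), ('loc', (3, 4))]  -- (label, (start, exclusive_end)) pairs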
@@ -500,8 +500,8 @@ class CNNCharEmbedding(TokenEmbedding):
     """
     Alias :class:`fastNLP.modules.CNNCharEmbedding`  :class:`fastNLP.modules.encoder.embedding.CNNCharEmbedding`
-    Uses a CNN to generate character embeddings. The computation is CNN(x) -> activation(x) -> pool -> fc; the
-    outputs of filters with different kernel sizes are concatenated.
+    Uses a CNN to generate character embeddings. The computation is embed(x) -> Dropout(x) -> CNN(x) -> activation(x)
+    -> pool -> fc; the outputs of filters with different kernel sizes are concatenated.
     Example::
@@ -511,13 +511,14 @@ class CNNCharEmbedding(TokenEmbedding):
     :param vocab: the vocabulary
     :param embed_size: size of the word embedding produced by this module. Default: 50.
     :param char_emb_size: size of the character embedding; the characters are collected from the vocab. Default: 50.
+    :param dropout: dropout probability applied to the character embeddings
     :param filter_nums: number of filters; must have the same length as kernel_sizes. Default: [40, 30, 20].
     :param kernel_sizes: kernel sizes. Default: [5, 3, 1].
     :param pool_method: pooling method used to merge the per-character representations into one vector, 'avg' or 'max'.
     :param activation: activation applied after the CNN, 'relu', 'sigmoid', 'tanh', or a custom callable.
     :param min_char_freq: minimum character frequency. Default: 2.
     """
-    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50,
+    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout: float=0.5,
                  filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), pool_method: str='max',
                  activation='relu', min_char_freq: int=2):
         super(CNNCharEmbedding, self).__init__(vocab)
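A minimal usage sketch of the changed constructor, not part of the patch; it assumes the usual fastNLP Vocabulary helpers (add_word_lst, to_index) and an illustrative sentence:

    from fastNLP import Vocabulary
    from fastNLP.modules.encoder.embedding import CNNCharEmbedding
    import torch

    vocab = Vocabulary().add_word_lst("this is a sample sentence".split())
    char_embed = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=30, dropout=0.5,
                                  filter_nums=[30], kernel_sizes=[3])
    words = torch.LongTensor([[vocab.to_index(w) for w in "this is a sample".split()]])
    out = char_embed(words)  # expected shape: (1, 4, 50), i.e. batch_size x max_len x embed_size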
@@ -526,6 +527,7 @@ class CNNCharEmbedding(TokenEmbedding):
             assert kernel % 2 == 1, "Only odd kernel is allowed."
         assert pool_method in ('max', 'avg')
+        self.dropout = nn.Dropout(dropout, inplace=True)
         self.pool_method = pool_method
         # activation function
         if isinstance(activation, str):
@@ -583,7 +585,7 @@ class CNNCharEmbedding(TokenEmbedding):
         # positions with value 1 are masked out
         chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len, 1 at padding positions
         chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x embed_size
+        chars = self.dropout(chars)
         reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
         reshaped_chars = reshaped_chars.transpose(1, 2)  # B' x E x M
         conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
@@ -635,7 +637,7 @@ class LSTMCharEmbedding(TokenEmbedding):
     """
     Alias :class:`fastNLP.modules.LSTMCharEmbedding`  :class:`fastNLP.modules.encoder.embedding.LSTMCharEmbedding`
-    Encodes characters with an LSTM.
+    Encodes characters with an LSTM. The computation is embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool
     Example::
@@ -644,13 +646,14 @@ class LSTMCharEmbedding(TokenEmbedding):
     :param vocab: the vocabulary
     :param embed_size: size of the embedding. Default: 50.
     :param char_emb_size: size of the character embedding. Default: 50.
+    :param dropout: dropout probability applied to the character embeddings
     :param hidden_size: hidden size of the LSTM; if bidirectional, the hidden size is halved. Default: 50.
     :param pool_method: 'max' or 'avg'
     :param activation: activation function, 'relu', 'sigmoid', 'tanh', or a custom callable.
     :param min_char_freq: minimum character frequency. Default: 2.
     :param bidirectional: whether to use a bidirectional LSTM for encoding. Default: True.
     """
-    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, hidden_size=50,
+    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout: float=0.5, hidden_size=50,
                  pool_method: str='max', activation='relu', min_char_freq: int=2, bidirectional=True):
         super(LSTMCharEmbedding, self).__init__(vocab)
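A parallel sketch for the LSTM variant, again not part of the patch; vocab and words are the same illustrative objects as in the CNNCharEmbedding sketch above:

    from fastNLP.modules.encoder.embedding import LSTMCharEmbedding

    lstm_char_embed = LSTMCharEmbedding(vocab, embed_size=50, char_emb_size=30, dropout=0.5,
                                        hidden_size=50, pool_method='max', bidirectional=True)
    out = lstm_char_embed(words)  # expected shape: (batch_size, max_len, 50)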
@@ -658,7 +661,7 @@ class LSTMCharEmbedding(TokenEmbedding):
         assert pool_method in ('max', 'avg')
         self.pool_method = pool_method
+        self.dropout = nn.Dropout(dropout, inplace=True)
         # activation function
         if isinstance(activation, str):
             if activation.lower() == 'relu':
@@ -715,7 +718,7 @@ class LSTMCharEmbedding(TokenEmbedding):
         # masked positions are 1
         chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len, 1 at padding positions
         chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x embed_size
+        chars = self.dropout(chars)
         reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
         char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
         lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
@@ -40,12 +40,14 @@ class LSTM(nn.Module):
     def init_param(self):
         for name, param in self.named_parameters():
-            if 'bias_i' in name:
-                param.data.fill_(1)
-            elif 'bias_h' in name:
+            if 'bias' in name:
+                # based on https://github.com/pytorch/pytorch/issues/750#issuecomment-280671871
                 param.data.fill_(0)
+                n = param.size(0)
+                start, end = n // 4, n // 2
+                param.data[start:end].fill_(1)
             else:
-                nn.init.xavier_normal_(param)
+                nn.init.xavier_uniform_(param)

     def forward(self, x, seq_len=None, h0=None, c0=None):
         """
@@ -0,0 +1,92 @@
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2


class Conll2003DataLoader(DataSetLoader):
    def __init__(self, task: str = 'ner', encoding_type: str = 'bioes'):
        """
        Loads English CoNLL-2003 data; the dataset is described at https://www.clips.uantwerpen.be/conll2003/ner/.
        When task is 'pos', the target of the returned DataSet comes from column 2; for 'chunk' it comes from
        column 3; for 'ner' it comes from column 4. All "-DOCSTART- -X- O O" lines are skipped, so the number of
        samples is smaller than reported in many papers, but since "-DOCSTART- -X- O O" is only a document
        separator and should not be predicted, we ignore it.
        For the ner and chunk tasks the target is converted to encoding_type; for pos the column is used as-is.

        :param task: the labelling task, one of 'ner', 'pos', 'chunk'
        """
        assert task in ('ner', 'pos', 'chunk')
        index = {'ner': 3, 'pos': 1, 'chunk': 2}[task]
        self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index])
        self._tag_converters = None
        if task in ('ner', 'chunk'):
            self._tag_converters = [iob2]
            if encoding_type == 'bioes':
                self._tag_converters.append(iob2bioes)

    def load(self, path: str):
        dataset = self._loader.load(path)

        def convert_tag_schema(tags):
            for converter in self._tag_converters:
                tags = converter(tags)
            return tags

        if self._tag_converters:
            dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET)
        return dataset

    def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt: VocabularyOption = None, lower: bool = True):
        """
        Reads and processes the data. Lines starting with '-DOCSTART-' are ignored.

        :param paths:
        :param word_vocab_opt: options used to initialise the word vocabulary
        :param lower: whether to lowercase all words
        :return:
        """
        # read the data
        paths = check_dataloader_paths(paths)
        data = DataInfo()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
            if lower:
                dataset.apply_field(lambda words: [word.lower() for word in words], field_name=Const.INPUT,
                                    new_field_name=Const.INPUT)
            data.datasets[name] = dataset

        # construct the word vocabulary
        word_vocab = Vocabulary(min_freq=3) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT)
        word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cased words (kept separately for the character embedding)
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words')
        cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocabulary
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data


if __name__ == '__main__':
    pass
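For reference, each CoNLL-2003 token line carries four columns (word, POS tag, chunk tag, NER tag), which is why the loader picks column index 1, 2 or 3 depending on the task. A hedged usage sketch with placeholder paths:

    loader = Conll2003DataLoader(task='ner', encoding_type='bioes')
    data_info = loader.process({'train': 'path/to/train.txt', 'dev': 'path/to/dev.txt'})
    print(data_info.datasets['train'][:2])  # words, cap_words, target, seq_len fields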
@@ -0,0 +1,130 @@
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import DataSet
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2


class OntoNoteNERDataLoader(DataSetLoader):
    """
    Reads OntoNotes data that has been converted to the CoNLL format. The conversion procedure is described at
    https://github.com/yhcc/OntoNotes-5.0-NER.
    """
    def __init__(self, encoding_type: str = 'bioes'):
        assert encoding_type in ('bioes', 'bio')
        self.encoding_type = encoding_type
        if encoding_type == 'bioes':
            self.encoding_method = iob2bioes
        else:
            self.encoding_method = iob2

    def load(self, path: str) -> DataSet:
        """
        Reads the data at the given path. The returned DataSet contains the following fields:
            raw_words: List[str]
            target: List[str]

        :param path:
        :return:
        """
        dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path)

        def convert_to_bio(tags):
            bio_tags = []
            flag = None
            for tag in tags:
                label = tag.strip("()*")
                if '(' in tag:
                    bio_label = 'B-' + label
                    flag = label
                elif flag:
                    bio_label = 'I-' + flag
                else:
                    bio_label = 'O'
                if ')' in tag:
                    flag = None
                bio_tags.append(bio_label)
            return self.encoding_method(bio_tags)

        dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target')

        return dataset

    def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt: VocabularyOption = None,
                lower: bool = True) -> DataInfo:
        """
        Reads and processes the data. The returned DataInfo contains:
            vocabs:
                word: Vocabulary
                target: Vocabulary
            datasets:
                train: DataSet
                    words: List[int], set as input
                    target: int, the label, set as both input and target
                    seq_len: int, the sentence length, set as both input and target
                    raw_words: List[str]
                xxx (varies with the given paths)

        :param paths:
        :param word_vocab_opt: options used to initialise the word vocabulary
        :param lower: whether to lowercase the words
        :return:
        """
        paths = check_dataloader_paths(paths)
        data = DataInfo()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
            if lower:
                dataset.apply_field(lambda words: [word.lower() for word in words], field_name=Const.INPUT,
                                    new_field_name=Const.INPUT)
            data.datasets[name] = dataset

        # construct the word vocabulary
        word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'], field_name='raw_words')
        word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cased words (kept separately for the character embedding)
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words')
        cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocabulary
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data


if __name__ == '__main__':
    loader = OntoNoteNERDataLoader()
    dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt')
    print(dataset.target.value_count())
    print(dataset[:4])

"""
train 115812 2200752
development 15680 304684
test 12217 230111

train 92403 1901772
valid 13606 279180
test 10258 204135
"""
@@ -0,0 +1,49 @@
from typing import List


def iob2(tags: List[str]) -> List[str]:
    """
    Checks that the tags form a valid IOB sequence; IOB1 tags are converted to IOB2 in place.

    :param tags: the tags to convert
    """
    for i, tag in enumerate(tags):
        if tag == "O":
            continue
        split = tag.split("-")
        if len(split) != 2 or split[0] not in ["I", "B"]:
            raise TypeError("The encoding schema is not a valid IOB type.")
        if split[0] == "B":
            continue
        elif i == 0 or tags[i - 1] == "O":  # conversion from IOB1 to IOB2
            tags[i] = "B" + tag[1:]
        elif tags[i - 1][1:] == tag[1:]:
            continue
        else:  # conversion from IOB1 to IOB2
            tags[i] = "B" + tag[1:]
    return tags


def iob2bioes(tags: List[str]) -> List[str]:
    """
    Converts IOB tags to the BIOES encoding.

    :param tags:
    :return:
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        else:
            split = tag.split('-')[0]
            if split == 'B':
                if i + 1 != len(tags) and tags[i + 1].split('-')[0] == 'I':
                    new_tags.append(tag)
                else:
                    new_tags.append(tag.replace('B-', 'S-'))
            elif split == 'I':
                if i + 1 < len(tags) and tags[i + 1].split('-')[0] == 'I':
                    new_tags.append(tag)
                else:
                    new_tags.append(tag.replace('I-', 'E-'))
            else:
                raise TypeError("Invalid IOB format.")
    return new_tags
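A worked example of the two converters; the values follow directly from the code above:

    iob2(['I-ORG', 'O', 'I-PER', 'I-PER'])       # -> ['B-ORG', 'O', 'B-PER', 'I-PER']
    iob2bioes(['B-PER', 'I-PER', 'O', 'B-LOC'])  # -> ['B-PER', 'E-PER', 'O', 'S-LOC']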
@@ -0,0 +1,62 @@
import torch
from torch import nn
from fastNLP import seq_len_to_mask
from fastNLP.modules import Embedding
from fastNLP.modules import LSTM
from fastNLP.modules import ConditionalRandomField, allowed_transitions, TimestepDropout
import torch.nn.functional as F
from fastNLP import Const


class CNNBiLSTMCRF(nn.Module):
    def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
        super().__init__()

        self.embedding = Embedding(embed, dropout=0.5)
        self.char_embedding = Embedding(char_embed, dropout=0.5)
        self.lstm = LSTM(input_size=self.embedding.embedding_dim + self.char_embedding.embedding_dim,
                         hidden_size=hidden_size // 2, num_layers=num_layers,
                         bidirectional=True, batch_first=True, dropout=dropout)
        self.forward_fc = nn.Linear(hidden_size // 2, len(tag_vocab))
        self.backward_fc = nn.Linear(hidden_size // 2, len(tag_vocab))

        transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=False)
        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=False, allowed_transitions=transitions)

        self.dropout = TimestepDropout(dropout, inplace=True)

        for name, param in self.named_parameters():
            if 'ward_fc' in name:  # matches both forward_fc and backward_fc
                if param.data.dim() > 1:
                    nn.init.xavier_normal_(param)
                else:
                    nn.init.constant_(param, 0)
            if 'crf' in name:
                nn.init.zeros_(param)

    def _forward(self, words, cap_words, seq_len, target=None):
        words = self.embedding(words)
        chars = self.char_embedding(cap_words)
        words = torch.cat([words, chars], dim=-1)

        outputs, _ = self.lstm(words, seq_len)
        self.dropout(outputs)  # in-place TimestepDropout

        forwards, backwards = outputs.chunk(2, dim=-1)

        # forward_logits = F.log_softmax(self.forward_fc(forwards), dim=-1)
        # backward_logits = F.log_softmax(self.backward_fc(backwards), dim=-1)

        logits = self.forward_fc(forwards) + self.backward_fc(backwards)
        self.dropout(logits)  # in-place TimestepDropout

        if target is not None:
            loss = self.crf(logits, target, seq_len_to_mask(seq_len))
            return {Const.LOSS: loss}
        else:
            pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len))
            return {Const.OUTPUT: pred}

    def forward(self, words, cap_words, seq_len, target):
        return self._forward(words, cap_words, seq_len, target)

    def predict(self, words, cap_words, seq_len):
        return self._forward(words, cap_words, seq_len, None)
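Note on the projection head: the BiLSTM output is chunked into its forward and backward halves, each half gets its own linear projection to tag space, and the two logit tensors are summed before being scored by the CRF. A shape sketch with illustrative sizes (hidden_size=200, 10 tags):

    # outputs             : (batch, seq_len, 200)   BiLSTM output
    # forwards, backwards : (batch, seq_len, 100)   outputs.chunk(2, dim=-1)
    # logits              : (batch, seq_len, 10)    forward_fc(forwards) + backward_fc(backwards)
    # crf(logits, target, mask) -> loss; crf.viterbi_decode(logits, mask) -> predicted tag ids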
@@ -0,0 +1,33 @@
from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
from reproduction.seqence_labelling.ner.data.Conll2003Loader import iob2, iob2bioes
import unittest


class TestTagSchemaConverter(unittest.TestCase):
    def test_iob2(self):
        tags = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
        golden = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
        self.assertListEqual(golden, iob2(tags))

        tags = ['I-ORG', 'O']
        golden = ['B-ORG', 'O']
        self.assertListEqual(golden, iob2(tags))

        tags = ['I-MISC', 'I-MISC', 'O', 'I-PER', 'I-PER', 'O']
        golden = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
        self.assertListEqual(golden, iob2(tags))

    def test_iob2bemso(self):
        tags = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
        golden = ['B-MISC', 'E-MISC', 'O', 'B-PER', 'E-PER', 'O']
        self.assertListEqual(golden, iob2bioes(tags))


def test_conll2003_loader():
    path = '/hdd/fudanNLP/fastNLP/others/data/conll2003/train.txt'
    loader = Conll2003DataLoader().load(path)
    print(loader[:3])


if __name__ == '__main__':
    test_conll2003_loader()
@@ -0,0 +1,42 @@
from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding
from fastNLP.core.vocabulary import VocabularyOption

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
from torch.optim import SGD, Adam
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback
import fitlog
fitlog.debug()

from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader

encoding_type = 'bioes'

data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003',
                                                                word_vocab_opt=VocabularyOption(min_freq=3))
print(data)

char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                              kernel_sizes=[3])
word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
                             requires_grad=True)
# rescale the pretrained vectors so that their overall standard deviation is 1
word_embed.embedding.weight.data = word_embed.embedding.weight.data / word_embed.embedding.weight.data.std()

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type)

optimizer = Adam(model.parameters(), lr=0.001)

callbacks = [GradientClipCallback(clip_type='value'), FitlogCallback({'test': data.datasets['test']}, verbose=1)]

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
                  device=0, dev_data=data.datasets['dev'], batch_size=32,
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                  callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()
@@ -0,0 +1,39 @@
from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
from torch.optim import SGD, Adam
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback
import fitlog
fitlog.debug()

from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader

encoding_type = 'bioes'

data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english')
print(data)

char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                              kernel_sizes=[3])
word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
                             requires_grad=True)

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type)

optimizer = Adam(model.parameters(), lr=0.001)

callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)]

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
                  device=1, dev_data=data.datasets['dev'], batch_size=32,
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                  callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()