@@ -613,6 +613,7 @@ class DataSet(object):
                     raise e
             else:
                 raise KeyError("{} is not a valid field name.".format(name))
+        return self
    def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True):
        """

@@ -636,6 +637,7 @@ class DataSet(object):
                     raise e
             else:
                 raise KeyError("{} is not a valid field name.".format(name))
+        return self
    def set_ignore_type(self, *field_names, flag=True):
        """

@@ -652,6 +654,7 @@ class DataSet(object):
                 self.field_arrays[name].ignore_type = flag
             else:
                 raise KeyError("{} is not a valid field name.".format(name))
+        return self
    def set_padder(self, field_name, padder):
        """

@@ -667,6 +670,7 @@ class DataSet(object):
         if field_name not in self.field_arrays:
             raise KeyError("There is no field named {}.".format(field_name))
         self.field_arrays[field_name].set_padder(padder)
+        return self
    def set_pad_val(self, field_name, pad_val):
        """

@@ -678,6 +682,7 @@ class DataSet(object):
         if field_name not in self.field_arrays:
             raise KeyError("There is no field named {}.".format(field_name))
         self.field_arrays[field_name].set_pad_val(pad_val)
+        return self

    def get_input_name(self):
        """
@@ -868,48 +873,6 @@ class DataSet(object):
        return train_set, dev_set

-    @classmethod
-    def read_csv(cls, csv_path, headers=None, sep=",", dropna=True):
-        r"""
-        .. warning::
-            This method will be removed in the next release; use :class:`fastNLP.io.CSVLoader` instead.
-
-        Read data in csv format from csv_path.
-
-        :param str csv_path: path of the csv file to read
-        :param list[str] headers: if None, the first line of the csv file is used as the header; if a list of str is
-            passed in, its length must equal the number of elements in each row of the csv file.
-        :param str sep: the separator
-        :param bool dropna: whether to skip rows whose number of fields differs from the header.
-        :return: the loaded :class:`~fastNLP.DataSet`.
-        """
-        warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead',
-                      category=DeprecationWarning)
-        with open(csv_path, "r", encoding='utf-8') as f:
-            start_idx = 0
-            if headers is None:
-                headers = f.readline().rstrip('\r\n')
-                headers = headers.split(sep)
-                start_idx += 1
-            else:
-                assert isinstance(headers, (list, tuple)), "headers should be list or tuple, not {}.".format(
-                    type(headers))
-            _dict = {}
-            for col in headers:
-                _dict[col] = []
-            for line_idx, line in enumerate(f, start_idx):
-                contents = line.rstrip('\r\n').split(sep)
-                if len(contents) != len(headers):
-                    if dropna:
-                        continue
-                    else:
-                        # TODO change error type
-                        raise ValueError("Line {} has {} parts, while header has {} parts." \
-                                         .format(line_idx, len(contents), len(headers)))
-                for header, content in zip(headers, contents):
-                    _dict[header].append(content)
-            return cls(_dict)

    def save(self, path):
        """
        Save the DataSet.
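For the migration, the same file can be read through the loader API instead. A minimal sketch, assuming a comma-separated file with a header row (the path is hypothetical, and a single file is assumed to load under the 'train' key):

```python
from fastNLP.io import CSVLoader

# CSVLoader replaces the deprecated DataSet.read_csv classmethod.
loader = CSVLoader(headers=None, sep=",", dropna=True)
data_bundle = loader.load("data.csv")       # a single file loads as 'train'
dataset = data_bundle.get_dataset("train")  # a fastNLP DataSet
```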
@@ -61,6 +61,9 @@ class BertEmbedding(ContextualEmbedding):
        # check model_dir_or_name against the known models and download if needed
        if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
+            if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'):
+                warnings.warn("For Chinese bert, pool_method should choose from 'first', 'last' in order to achieve"
+                              " faster speed.")
            model_url = _get_embedding_url('bert', model_dir_or_name.lower())
            model_dir = cached_path(model_url, name='embedding')
        # check existence
@@ -91,19 +94,33 @@ class BertEmbedding(ContextualEmbedding):
        :param torch.LongTensor words: [batch_size, max_len]
        :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
        """
+        if self._word_sep_index:  # [SEP] must not be dropped
+            sep_mask = words.eq(self._word_sep_index)
        words = self.drop_word(words)
+        if self._word_sep_index:
+            words.masked_fill_(sep_mask, self._word_sep_index)
        outputs = self._get_sent_reprs(words)
        if outputs is not None:
-            return self.dropout(words)
+            return self.dropout(outputs)
        outputs = self.model(words)
        outputs = torch.cat([*outputs], dim=-1)

        return self.dropout(outputs)
+    def drop_word(self, words):
+        """
+        Randomly replace words with unknown_index at the configured rate.
+
+        :param torch.LongTensor words: batch_size x max_len
+        :return:
+        """
+        if self.word_dropout > 0 and self.training:
+            with torch.no_grad():
+                if self._word_sep_index:  # [SEP] must not be dropped
+                    sep_mask = words.eq(self._word_sep_index)
+                mask = torch.ones_like(words).float() * self.word_dropout
+                mask = torch.bernoulli(mask).byte()  # the larger word_dropout, the more positions become 1
+                words = words.masked_fill(mask, self._word_unk_index)
+                if self._word_sep_index:
+                    words.masked_fill_(sep_mask, self._word_sep_index)
+        return words

    @property
    def requires_grad(self):
        """
@@ -134,10 +151,12 @@ class BertWordPieceEncoder(nn.Module):
    :param str layers: which layers go into the final representation; layer numbers are separated by ',' and negative
        numbers index from the last layer.
    :param bool pooled_cls: whether to map the leading [CLS] through the pretrained BertPool; only effective with
        include_cls_sep. If the downstream task predicts from [CLS] alone, this should usually be True.
+    :param float word_dropout: probability of replacing a word with unk; this both trains unk and acts as a
+        regularizer.
+    :param float dropout: dropout probability on the embedding output; 0.1 means 10% of the values are randomly
+        zeroed.
    :param bool requires_grad: whether gradients are required.
    """

-    def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1',
-                 pooled_cls: bool = False, requires_grad: bool=False):
+    def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False,
+                 word_dropout=0, dropout=0, requires_grad: bool=False):
        super().__init__()

        if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
@@ -150,8 +169,12 @@ class BertWordPieceEncoder(nn.Module):
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")

        self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls)
+        self._sep_index = self.model._sep_index
+        self._wordpiece_unk_index = self.model._wordpiece_unknown_index
        self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
        self.requires_grad = requires_grad
+        self.word_dropout = word_dropout
+        self.dropout_layer = nn.Dropout(dropout)

    @property
    def requires_grad(self):
@@ -199,13 +222,41 @@ class BertWordPieceEncoder(nn.Module):
        Compute the bert embedding of words. The words passed in are expected to already contain the [CLS] and [SEP]
        tags.

        :param words: batch_size x max_len
-        :param token_type_ids: batch_size x max_len, distinguishes the first sentence from the second
+        :param token_type_ids: batch_size x max_len, distinguishes the first sentence from the second. If not given,
+            it is generated automatically (in most cases no input is needed): positions up to and including the first
+            [SEP] are 0, positions from there up to and including the second [SEP] are 1, then 0 again up to the
+            third [SEP], and so on.
        :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
        """
+        with torch.no_grad():
+            sep_mask = word_pieces.eq(self._sep_index)  # batch_size x max_len
+            if token_type_ids is None:
+                sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
+                token_type_ids = sep_mask_cumsum.fmod(2)
+                if token_type_ids[0, 0].item():  # if it starts odd, flip the result so the sequence starts at 0
+                    token_type_ids = token_type_ids.eq(0).long()
+
+        word_pieces = self.drop_word(word_pieces)
        outputs = self.model(word_pieces, token_type_ids)
        outputs = torch.cat([*outputs], dim=-1)

-        return outputs
+        return self.dropout_layer(outputs)
+
+    def drop_word(self, words):
+        """
+        Randomly replace words with unknown_index at the configured rate.
+
+        :param torch.LongTensor words: batch_size x max_len
+        :return:
+        """
+        if self.word_dropout > 0 and self.training:
+            with torch.no_grad():
+                if self._sep_index:  # [SEP] must not be dropped
+                    sep_mask = words.eq(self._sep_index)
+                mask = torch.ones_like(words).float() * self.word_dropout
+                mask = torch.bernoulli(mask).byte()  # the larger word_dropout, the more positions become 1
+                words = words.masked_fill(mask, self._wordpiece_unk_index)
+                if self._sep_index:
+                    words.masked_fill_(sep_mask, self._sep_index)
+        return words

class _WordBertModel(nn.Module):
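The flipped-cumsum trick used to derive `token_type_ids` can be checked by hand. A small sketch of the same computation outside the class (102 stands in for the [SEP] id):

```python
import torch

# word-piece ids for "[CLS] a b [SEP] c d [SEP]"
word_pieces = torch.tensor([[101, 7, 8, 102, 9, 10, 102]])
sep_mask = word_pieces.eq(102)

# reversed cumulative sum counts, per position, how many [SEP]s lie at or after it
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)   # tensor([[0, 0, 0, 0, 1, 1, 1]])
if token_type_ids[0, 0].item():            # normalize so segment ids start at 0
    token_type_ids = token_type_ids.eq(0).long()
```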
@@ -288,11 +339,11 @@ class _WordBertModel(nn.Module):
                word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
                word_to_wordpieces.append(word_pieces)
                word_pieces_lengths.append(len(word_pieces))
-        print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab)))
        self._cls_index = self.tokenzier.vocab['[CLS]']
        self._sep_index = self.tokenzier.vocab['[SEP]']
        self._word_pad_index = vocab.padding_idx
        self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]']  # needed when generating word_pieces
+        print("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab)))
        self.word_to_wordpieces = np.array(word_to_wordpieces)
        self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
        print("Successfully generate word pieces.")
@@ -339,7 +390,7 @@ class _WordBertModel(nn.Module):
                sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
                token_type_ids = sep_mask_cumsum.fmod(2)
                if token_type_ids[0, 0].item():  # if it starts odd, flip the result so the sequence starts at 0
-                    token_type_ids = token_type_ids.eq(0).float()
+                    token_type_ids = token_type_ids.eq(0).long()
            else:
                token_type_ids = torch.zeros_like(word_pieces)
        # 2. run the encoder and pool the hidden states according to word_pieces
@@ -45,7 +45,7 @@ class StaticEmbedding(TokenEmbedding):
    :param model_dir_or_name: a pretrained static embedding can be loaded in two ways: either pass an embedding
        folder (which should contain exactly one file with the .txt suffix) or a file path; or pass the name of an
        embedding, in which case the cache is checked for the model and it is downloaded automatically if absent.
        If None, an embedding of dimension embedding_dim is randomly initialized.
-    :param int embedding_dim: dimension of the randomly initialized embedding, only effective when model_dir_or_name is None.
+    :param int embedding_dim: dimension of the randomly initialized embedding; when this value is greater than 0,
+        model_dir_or_name is ignored.
    :param bool requires_grad: whether gradients are required. Defaults to True.
    :param callable init_method: how to initialize values that were not found. Any of the methods in torch.nn.init.*
        can be used; the method is called with a tensor object.
    :param bool lower: whether to lowercase the words in vocab before matching against the pretrained vocabulary. If
        your vocabulary contains uppercase words, or uppercase words need to be handled separately,

@@ -55,9 +55,11 @@ class StaticEmbedding(TokenEmbedding):
    :param bool normalize: whether to normalize each vector so that its norm is 1.
    :param int min_freq: words whose frequency in the Vocabulary is below this value are mapped to unk.
    """

-    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=100, requires_grad: bool=True,
+    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True,
                 init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs):
        super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
+        if embedding_dim > 0:
+            model_dir_or_name = None

        # resolve cache_path
        if model_dir_or_name is None:
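A short sketch of the new precedence rule (a toy vocabulary; nothing is downloaded because `embedding_dim > 0` overrides the model name):

```python
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding

vocab = Vocabulary().add_word_lst("the quick brown fox".split())
# model_dir_or_name is ignored because embedding_dim > 0, so a random
# 50-dimensional embedding is created instead of downloading 'en'.
embed = StaticEmbedding(vocab, model_dir_or_name='en', embedding_dim=50)
```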
@@ -30,6 +30,9 @@ __all__ = [
    'Conll2003NERLoader',
    'OntoNotesNERLoader',
    'CTBLoader',
+    "MsraNERLoader",
+    "WeiboNERLoader",
+    "PeopleDailyNERLoader",

    'CSVLoader',
    'JsonLoader',

@@ -50,6 +53,9 @@ __all__ = [
    "Conll2003NERPipe",
    "OntoNotesNERPipe",
+    "MsraNERPipe",
+    "PeopleDailyPipe",
+    "WeiboNERPipe",

    "MatchingBertPipe",
    "RTEBertPipe",
@@ -133,19 +133,21 @@ class DataBundle:
        :param ~fastNLP.Vocabulary vocab: the vocabulary
        :param str field_name: name of the field this vocab corresponds to
-        :return:
+        :return: self
        """
        assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary supports."
        self.vocabs[field_name] = vocab
+        return self

    def set_dataset(self, dataset, name):
        """

        :param ~fastNLP.DataSet dataset: the DataSet passed to the DataBundle
        :param str name: name of the dataset
-        :return:
+        :return: self
        """
        self.datasets[name] = dataset
+        return self

    def get_dataset(self, name:str)->DataSet:
        """
@@ -165,7 +167,7 @@ class DataBundle:
        """
        return self.vocabs[field_name]

-    def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True):
+    def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True):
        """
        Set the fields in field_names as input, applied to every dataset in the data_bundle::

@@ -176,18 +178,21 @@ class DataBundle:
        :param bool flag: set the input status of field_name to flag
        :param bool use_1st_ins_infer_dim_type: if True, do not check that every entry of the column has the same
            dimension and type; the type and dimension of the column are inferred directly from the first entry.
-        :param bool ignore_miss_field: if a field name does not exist in some dataset, ignore it when True; raise an error when False
+        :param bool ignore_miss_dataset: if a field name does not exist in some dataset, skip that DataSet when True;
+            raise an error when False
+        :return self
        """
        for field_name in field_names:
            for name, dataset in self.datasets.items():
-                if not ignore_miss_field and not dataset.has_field(field_name):
+                if not ignore_miss_dataset and not dataset.has_field(field_name):
                    raise KeyError(f"Field:{field_name} was not found in DataSet:{name}")
                if not dataset.has_field(field_name):
                    continue
                else:
                    dataset.set_input(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type)
+        return self

-    def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True):
+    def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True):
        """
        Set the fields in field_names as target, applied to every dataset in the data_bundle::
@@ -198,16 +203,34 @@ class DataBundle:
        :param bool flag: set the target status of field_name to flag
        :param bool use_1st_ins_infer_dim_type: if True, do not check that every entry of the column has the same
            dimension and type; the type and dimension of the column are inferred directly from the first entry.
-        :param bool ignore_miss_field: if a field name does not exist in some dataset, ignore it when True; raise an error when False
+        :param bool ignore_miss_dataset: if a field name does not exist in some dataset, ignore it when True; raise
+            an error when False
+        :return self
        """
        for field_name in field_names:
            for name, dataset in self.datasets.items():
-                if not ignore_miss_field and not dataset.has_field(field_name):
+                if not ignore_miss_dataset and not dataset.has_field(field_name):
                    raise KeyError(f"Field:{field_name} was not found in DataSet:{name}")
                if not dataset.has_field(field_name):
                    continue
                else:
                    dataset.set_target(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type)
+        return self
+
+    def copy_field(self, field_name, new_field_name, ignore_miss_dataset=True):
+        """
+        Copy field_name in every DataSet of this DataBundle to a new field called new_field_name.
+
+        :param str field_name:
+        :param str new_field_name:
+        :param bool ignore_miss_dataset: if a DataSet in the DataBundle lacks field_name, skip it when True; raise a
+            KeyError when False
+        :return: self
+        """
+        for name, dataset in self.datasets.items():
+            if dataset.has_field(field_name=field_name):
+                dataset.copy_field(field_name=field_name, new_field_name=new_field_name)
+            elif not ignore_miss_dataset:
+                raise KeyError(f"{field_name} not found DataSet:{name}.")
+        return self

    def __repr__(self):
        _str = 'In total {} datasets:\n'.format(len(self.datasets))
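With `copy_field` added and every setter returning `self`, DataBundle configuration chains the same way DataSet does. A minimal sketch with a toy single-dataset bundle:

```python
from fastNLP import DataSet
from fastNLP.io import DataBundle

bundle = DataBundle(datasets={"train": DataSet({"chars": [[1, 2, 3]],
                                                "target": [[0, 0, 1]]})})
# copy_field / set_input / set_target all return the bundle itself.
bundle.copy_field("chars", "chars_copy").set_input("chars", "chars_copy").set_target("target")
```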
@@ -27,6 +27,7 @@ PRETRAINED_BERT_MODEL_DIR = {
        'cn': 'bert-chinese-wwm.zip',
        'cn-base': 'bert-base-chinese.zip',
        'cn-wwm': 'bert-chinese-wwm.zip',
+        'cn-wwm-ext': "bert-chinese-wwm-ext.zip"
    }

PRETRAINED_ELMO_MODEL_DIR = {

@@ -56,7 +57,7 @@ PRETRAIN_STATIC_FILES = {
    'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip",
    'en-fasttext-crawl': "crawl-300d-2M.vec.zip",
-    'cn': "tencent_cn.txt.zip",
+    'cn': "tencent_cn.zip",
    'cn-tencent': "tencent_cn.txt.zip",
    'cn-fasttext': "cc.zh.300.vec.gz",
    'cn-sgns-literature-word': 'sgns.literature.word.txt.zip',
@@ -71,7 +72,10 @@ DATASET_DIR = {
    "qnli": "QNLI.zip",
    "sst-2": "SST-2.zip",
    "sst": "SST.zip",
-    "rte": "RTE.zip"
+    "rte": "RTE.zip",
+    "msra-ner": "MSRA_NER.zip",
+    "peopledaily": "peopledaily.zip",
+    "weibo-ner": "weibo_NER.zip"
}

PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR,
@@ -320,42 +324,44 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path:
        # GET file object
        req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"})
        if req.status_code == 200:
-            content_length = req.headers.get("Content-Length")
-            total = int(content_length) if content_length is not None else None
-            progress = tqdm(unit="B", total=total, unit_scale=1)
-            fd, temp_filename = tempfile.mkstemp()
-            print("%s not found in cache, downloading to %s" % (url, temp_filename))
-
-            with open(temp_filename, "wb") as temp_file:
-                for chunk in req.iter_content(chunk_size=1024 * 16):
-                    if chunk:  # filter out keep-alive new chunks
-                        progress.update(len(chunk))
-                        temp_file.write(chunk)
-            progress.close()
-            print(f"Finish download from {url}.")
-
-            # start uncompressing
-            delete_temp_dir = None
-            if suffix in ('.zip', '.tar.gz'):
-                uncompress_temp_dir = tempfile.mkdtemp()
-                delete_temp_dir = uncompress_temp_dir
-                print(f"Start to uncompress file to {uncompress_temp_dir}")
-                if suffix == '.zip':
-                    unzip_file(Path(temp_filename), Path(uncompress_temp_dir))
-                else:
-                    untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir))
-                filenames = os.listdir(uncompress_temp_dir)
-                if len(filenames) == 1:
-                    if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])):
-                        uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0])
-
-                cache_path.mkdir(parents=True, exist_ok=True)
-                print("Finish un-compressing file.")
-            else:
-                uncompress_temp_dir = temp_filename
-                cache_path = str(cache_path) + suffix
            success = False
+            fd, temp_filename = tempfile.mkstemp()
+            uncompress_temp_dir = None
            try:
+                content_length = req.headers.get("Content-Length")
+                total = int(content_length) if content_length is not None else None
+                progress = tqdm(unit="B", total=total, unit_scale=1)
+                print("%s not found in cache, downloading to %s" % (url, temp_filename))
+
+                with open(temp_filename, "wb") as temp_file:
+                    for chunk in req.iter_content(chunk_size=1024 * 16):
+                        if chunk:  # filter out keep-alive new chunks
+                            progress.update(len(chunk))
+                            temp_file.write(chunk)
+                progress.close()
+                print(f"Finish download from {url}")
+
+                # start uncompressing
+                if suffix in ('.zip', '.tar.gz', '.gz'):
+                    uncompress_temp_dir = tempfile.mkdtemp()
+                    print(f"Start to uncompress file to {uncompress_temp_dir}")
+                    if suffix == '.zip':
+                        unzip_file(Path(temp_filename), Path(uncompress_temp_dir))
+                    elif suffix == '.gz':
+                        ungzip_file(temp_filename, uncompress_temp_dir, dir_name)
+                    else:
+                        untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir))
+                    filenames = os.listdir(uncompress_temp_dir)
+                    if len(filenames) == 1:
+                        if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])):
+                            uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0])
+
+                    cache_path.mkdir(parents=True, exist_ok=True)
+                    print("Finish un-compressing file.")
+                else:
+                    uncompress_temp_dir = temp_filename
+                    cache_path = str(cache_path) + suffix

                # copy to the target location
                print(f"Copy file to {cache_path}")
                if os.path.isdir(uncompress_temp_dir):

@@ -377,10 +383,12 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path:
                            os.remove(cache_path)
                        else:
                            shutil.rmtree(cache_path)
-                if delete_temp_dir:
-                    shutil.rmtree(delete_temp_dir)
                os.close(fd)
                os.remove(temp_filename)
+                if os.path.isdir(uncompress_temp_dir):
+                    shutil.rmtree(uncompress_temp_dir)
+                elif os.path.isfile(uncompress_temp_dir):
+                    os.remove(uncompress_temp_dir)
            return get_filepath(cache_path)
        else:
            raise HTTPError(f"Status code:{req.status_code}. Fail to download from {url}.")
@@ -402,6 +410,15 @@ def untar_gz_file(file: Path, to: Path):
        tar.extractall(to)

+def ungzip_file(file: str, to: str, filename: str):
+    import gzip
+
+    g_file = gzip.GzipFile(file)
+    with open(os.path.join(to, filename), 'wb+') as f:
+        f.write(g_file.read())
+    g_file.close()

def match_file(dir_name: str, cache_dir: Path) -> str:
    """
    The matching rule is: a file under cache_dir matches dir_name exactly, or matches dir_name exactly except for its suffix.
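A quick usage sketch of this helper, as the new `.gz` branch in `get_from_cache` calls it (the file names here are hypothetical):

```python
import os
import tempfile

# Unpack a single-file gzip archive, e.g. cc.zh.300.vec.gz, into a directory
# under the name the cache expects.
tmp_dir = tempfile.mkdtemp()
ungzip_file("cc.zh.300.vec.gz", tmp_dir, "cc.zh.300.vec")
print(os.listdir(tmp_dir))  # ['cc.zh.300.vec']
```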
@@ -58,6 +58,9 @@ __all__ = [
    'Conll2003NERLoader',
    'OntoNotesNERLoader',
    'CTBLoader',
+    "MsraNERLoader",
+    "PeopleDailyNERLoader",
+    "WeiboNERLoader",

    # 'CSVLoader',
    # 'JsonLoader',

@@ -77,3 +80,4 @@ from .cws import CWSLoader
from .json import JsonLoader
from .loader import Loader
from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader
+from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
@@ -6,6 +6,8 @@ import os
import random
import shutil
import numpy as np
+import glob
+import time

class YelpLoader(Loader):

@@ -57,7 +59,7 @@ class YelpLoader(Loader):

class YelpFullLoader(YelpLoader):
-    def download(self, dev_ratio: float = 0.1, seed: int = 0):
+    def download(self, dev_ratio: float = 0.1, re_download: bool = False):
        """
        Automatically download the dataset. If you use this dataset, please cite the following paper
@@ -68,35 +70,23 @@ class YelpFullLoader(YelpLoader):
        dev.csv.

        :param float dev_ratio: if there is no dev set under the path, how much of train to split off as dev. If 0, no dev split is made.
-        :param int seed: random seed used when splitting dev
+        :param bool re_download: whether to re-download the data and redo the split.
        :return: str, the directory of the dataset
        """
        dataset_name = 'yelp-review-full'
        data_dir = self._get_dataset_path(dataset_name=dataset_name)
-        if os.path.exists(os.path.join(data_dir, 'dev.csv')):  # if dev exists, check whether a re-download is needed
-            re_download = True
-            if dev_ratio > 0:
-                dev_line_count = 0
-                tr_line_count = 0
-                with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \
-                        open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2:
-                    for line in f1:
-                        tr_line_count += 1
-                    for line in f2:
-                        dev_line_count += 1
-                if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005):
-                    re_download = True
-                else:
-                    re_download = False
-            if re_download:
-                shutil.rmtree(data_dir)
-                data_dir = self._get_dataset_path(dataset_name=dataset_name)
+        modify_time = 0
+        for filepath in glob.glob(os.path.join(data_dir, '*')):
+            modify_time = os.stat(filepath).st_mtime
+            break
+        if time.time() - modify_time > 1 and re_download:  # a rather ugly way to decide whether the files were just downloaded
+            shutil.rmtree(data_dir)
+            data_dir = self._get_dataset_path(dataset_name=dataset_name)

        if not os.path.exists(os.path.join(data_dir, 'dev.csv')):
            if dev_ratio > 0:
                assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)."
-                random.seed(int(seed))
                try:
                    with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \
                            open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \
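The `re_download` flag makes re-splitting explicit instead of inferring it from line counts. A minimal sketch of the intended call pattern (assuming network access and the default fastNLP cache):

```python
from fastNLP.io.loader import YelpFullLoader

# First call downloads the data and splits 10% of train into dev.csv.
data_dir = YelpFullLoader().download(dev_ratio=0.1)
# To redo the split with a different ratio, force a fresh download.
data_dir = YelpFullLoader().download(dev_ratio=0.2, re_download=True)
```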
@@ -116,44 +106,32 @@ class YelpFullLoader(YelpLoader):

class YelpPolarityLoader(YelpLoader):
-    def download(self, dev_ratio: float = 0.1, seed: int = 0):
+    def download(self, dev_ratio: float = 0.1, re_download=False):
        """
        Automatically download the dataset. If you use this dataset, please cite the following paper

        Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances
        in Neural Information Processing Systems 28 (NIPS 2015)

-        Randomly takes part of train as dev data according to dev_ratio. After download, 0.1 of train is split off as dev
+        Randomly takes part of train as dev data according to dev_ratio. After download, dev_ratio of train is split off as dev

-        :param float dev_ratio: if dev.csv does not exist under the path, how much of train to split off as dev. If 0, no dev split is made
-        :param int seed: random seed used when splitting dev
+        :param float dev_ratio: if dev.csv does not exist under the path, how much of train to split off as dev. If 0, no dev split is made.
+        :param bool re_download: whether to re-download the data and redo the split.
        :return: str, the directory of the dataset
        """
        dataset_name = 'yelp-review-polarity'
        data_dir = self._get_dataset_path(dataset_name=dataset_name)
-        if os.path.exists(os.path.join(data_dir, 'dev.csv')):  # if dev exists, check whether the ratio matches
-            re_download = True
-            if dev_ratio > 0:
-                dev_line_count = 0
-                tr_line_count = 0
-                with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \
-                        open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2:
-                    for line in f1:
-                        tr_line_count += 1
-                    for line in f2:
-                        dev_line_count += 1
-                if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005):
-                    re_download = True
-                else:
-                    re_download = False
-            if re_download:
-                shutil.rmtree(data_dir)
-                data_dir = self._get_dataset_path(dataset_name=dataset_name)
+        modify_time = 0
+        for filepath in glob.glob(os.path.join(data_dir, '*')):
+            modify_time = os.stat(filepath).st_mtime
+            break
+        if time.time() - modify_time > 1 and re_download:  # a rather ugly way to decide whether the files were just downloaded
+            shutil.rmtree(data_dir)
+            data_dir = self._get_dataset_path(dataset_name=dataset_name)

        if not os.path.exists(os.path.join(data_dir, 'dev.csv')):
            if dev_ratio > 0:
                assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)."
-                random.seed(int(seed))
                try:
                    with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \
                            open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \
@@ -209,7 +187,7 @@ class IMDBLoader(Loader):

        return dataset

-    def download(self, dev_ratio: float = 0.1, seed: int = 0):
+    def download(self, dev_ratio: float = 0.1, re_download=False):
        """
        Automatically download the dataset. If you use this dataset, please cite the following paper

@@ -218,34 +196,22 @@ class IMDBLoader(Loader):
        Randomly takes part of train as dev data according to dev_ratio. After download, 0.1 of train is split off as dev

        :param float dev_ratio: if there is no dev.txt under the path, how much of train to split off as dev. If 0, no dev split is made
-        :param int seed: random seed used when splitting dev
+        :param bool re_download: whether to re-download the data and redo the split.
        :return: str, the directory of the dataset
        """
        dataset_name = 'aclImdb'
        data_dir = self._get_dataset_path(dataset_name=dataset_name)
-        if os.path.exists(os.path.join(data_dir, 'dev.txt')):  # if dev exists, check whether the ratio matches
-            re_download = True
-            if dev_ratio > 0:
-                dev_line_count = 0
-                tr_line_count = 0
-                with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f1, \
-                        open(os.path.join(data_dir, 'dev.txt'), 'r', encoding='utf-8') as f2:
-                    for line in f1:
-                        tr_line_count += 1
-                    for line in f2:
-                        dev_line_count += 1
-                if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005):
-                    re_download = True
-                else:
-                    re_download = False
-            if re_download:
-                shutil.rmtree(data_dir)
-                data_dir = self._get_dataset_path(dataset_name=dataset_name)
+        modify_time = 0
+        for filepath in glob.glob(os.path.join(data_dir, '*')):
+            modify_time = os.stat(filepath).st_mtime
+            break
+        if time.time() - modify_time > 1 and re_download:  # a rather ugly way to decide whether the files were just downloaded
+            shutil.rmtree(data_dir)
+            data_dir = self._get_dataset_path(dataset_name=dataset_name)

        if not os.path.exists(os.path.join(data_dir, 'dev.csv')):
            if dev_ratio > 0:
                assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)."
-                random.seed(int(seed))
                try:
                    with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f, \
                            open(os.path.join(data_dir, 'middle_file.txt'), 'w', encoding='utf-8') as f1, \
@@ -4,10 +4,12 @@ from .loader import Loader
from ...core.dataset import DataSet
from ..file_reader import _read_conll
from ...core.instance import Instance
-from .. import DataBundle
-from ..utils import check_loader_paths
from ...core.const import Const
+import glob
+import os
+import shutil
+import time
+import random

class ConllLoader(Loader):
    """
@@ -262,3 +264,173 @@ class CTBLoader(Loader):
    def _load(self, path:str):
        pass

+class CNNERLoader(Loader):
+    def _load(self, path: str):
+        """
+        Supports loading content in the following format: two space-separated columns per line, one character per
+        line, samples separated by blank lines
+
+        Example::
+
+            我 O
+            们 O
+            变 O
+            而 O
+            以 O
+            书 O
+            会 O
+            ...
+
+        :param str path: file path
+        :return: a DataSet containing the raw_chars column and the target column
+        """
+        ds = DataSet()
+        with open(path, 'r', encoding='utf-8') as f:
+            raw_chars = []
+            target = []
+            for line in f:
+                line = line.strip()
+                if line:
+                    parts = line.split()
+                    if len(parts) == 1:  # some lines of the data downloaded from the web lack a tag; default to O
+                        parts.append('O')
+                    raw_chars.append(parts[0])
+                    target.append(parts[1])
+                else:
+                    if raw_chars:
+                        ds.append(Instance(raw_chars=raw_chars, target=target))
+                    raw_chars = []
+                    target = []
+        return ds
+
+class MsraNERLoader(CNNERLoader):
+    """
+    Reads the MSRA-NER data, whose format should look like the following
+
+    Example::
+
+        我 O
+        们 O
+        变 O
+        而 O
+        以 O
+        书 O
+        会 O
+        ...
+
+    The loaded DataSet contains the following fields
+
+    .. csv-table:: the target column uses BIO encoding
+        :header: "raw_chars", "target"
+
+        "[我, 们, 变...]", "[O, O, ...]"
+        "[中, 共, 中, ...]", "[B-ORG, I-ORG, I-ORG, ...]"
+        "[...]", "[...]"
+
+    """
+    def __init__(self):
+        super().__init__()
+
+    def download(self, dev_ratio: float = 0.1, re_download: bool = False) -> str:
+        """
+        Automatically download the MSRA-NER data. If you use this data, please cite Gina-Anne Levow, 2006, The Third
+        International Chinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition.
+
+        Randomly takes part of train as dev data according to dev_ratio. After download, output_dir contains the
+        three files train.conll, test.conll and dev.conll.
+
+        :param float dev_ratio: if there is no dev set under the path, how much of train to split off as dev. If 0, no dev split is made.
+        :param bool re_download: whether to re-download the data and redo the split.
+        :return: str, the directory of the dataset
+        """
+        dataset_name = 'msra-ner'
+        data_dir = self._get_dataset_path(dataset_name=dataset_name)
+        modify_time = 0
+        for filepath in glob.glob(os.path.join(data_dir, '*')):
+            modify_time = os.stat(filepath).st_mtime
+            break
+        if time.time() - modify_time > 1 and re_download:  # a rather ugly way to decide whether the files were just downloaded
+            shutil.rmtree(data_dir)
+            data_dir = self._get_dataset_path(dataset_name=dataset_name)
+
+        if not os.path.exists(os.path.join(data_dir, 'dev.conll')):
+            if dev_ratio > 0:
+                assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)."
+                try:
+                    with open(os.path.join(data_dir, 'train.conll'), 'r', encoding='utf-8') as f, \
+                            open(os.path.join(data_dir, 'middle_file.conll'), 'w', encoding='utf-8') as f1, \
+                            open(os.path.join(data_dir, 'dev.conll'), 'w', encoding='utf-8') as f2:
+                        lines = []  # one sample spans many lines
+                        for line in f:
+                            line = line.strip()
+                            if line:
+                                lines.append(line)
+                            else:
+                                if random.random() < dev_ratio:
+                                    f2.write('\n'.join(lines) + '\n\n')
+                                else:
+                                    f1.write('\n'.join(lines) + '\n\n')
+                                lines.clear()
+                    os.remove(os.path.join(data_dir, 'train.conll'))
+                    os.renames(os.path.join(data_dir, 'middle_file.conll'), os.path.join(data_dir, 'train.conll'))
+                finally:
+                    if os.path.exists(os.path.join(data_dir, 'middle_file.conll')):
+                        os.remove(os.path.join(data_dir, 'middle_file.conll'))
+
+        return data_dir
+
+class WeiboNERLoader(CNNERLoader):
+    def __init__(self):
+        super().__init__()
+
+    def download(self) -> str:
+        """
+        Automatically download the Weibo-NER data. If you use this data, please cite Nanyun Peng and Mark Dredze,
+        2015, Named Entity Recognition for Chinese Social Media with Jointly Trained Embeddings.
+
+        :return: str
+        """
+        dataset_name = 'weibo-ner'
+        data_dir = self._get_dataset_path(dataset_name=dataset_name)
+
+        return data_dir
+
+class PeopleDailyNERLoader(CNNERLoader):
+    """
+    Supports loading data in the following format
+
+    Example::
+
+        当 O
+        希 O
+        望 O
+        工 O
+        程 O
+        救 O
+        助 O
+        的 O
+        百 O
+
+    The loaded DataSet contains the following fields
+
+    .. csv-table:: the target column uses BIO encoding
+        :header: "raw_chars", "target"
+
+        "[我, 们, 变...]", "[O, O, ...]"
+        "[中, 共, 中, ...]", "[B-ORG, I-ORG, I-ORG, ...]"
+        "[...]", "[...]"
+
+    """
+    def __init__(self):
+        super().__init__()
+
+    def download(self) -> str:
+        dataset_name = 'peopledaily'
+        data_dir = self._get_dataset_path(dataset_name=dataset_name)
+
+        return data_dir
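A short sketch of the new Chinese NER loaders in use (assuming the data can be fetched into the fastNLP cache, and that the cache directory holds train.conll/dev.conll/test.conll after `download`):

```python
from fastNLP.io.loader import MsraNERLoader

loader = MsraNERLoader()
data_dir = loader.download(dev_ratio=0.1)   # fetches MSRA_NER.zip, splits off dev.conll
data_bundle = loader.load(data_dir)         # DataBundle with train/dev/test DataSets
print(data_bundle.get_dataset("train")[0])  # fields: raw_chars, target
```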
@@ -8,6 +8,8 @@ A Pipe processes the data read by a Loader; every Pipe has a ``proce
"""
__all__ = [
+    "Pipe",
+
    "YelpFullPipe",
    "YelpPolarityPipe",
    "SSTPipe",

@@ -16,6 +18,9 @@ __all__ = [
    "Conll2003NERPipe",
    "OntoNotesNERPipe",
+    "MsraNERPipe",
+    "WeiboNERPipe",
+    "PeopleDailyPipe",

    "MatchingBertPipe",
    "RTEBertPipe",

@@ -32,6 +37,7 @@ __all__ = [
]

from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe
-from .conll import Conll2003NERPipe, OntoNotesNERPipe
+from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \
    MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
+from .pipe import Pipe
@@ -4,6 +4,8 @@ from .utils import iob2, iob2bioes
from ...core.const import Const
from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader
from .utils import _indexize, _add_words_field
+from .utils import _add_chars_field
+from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader

class _NERPipe(Pipe):

@@ -17,7 +19,7 @@ class _NERPipe(Pipe):
    :param: str encoding_type: the encoding scheme used for the target column; bioes and bio are supported.
    :param bool lower: whether to lowercase words before building the vocabulary; in the vast majority of cases this should stay False.
-    :param int target_pad_val: padding value for target; padded positions in the target column take this value. Defaults to -100.
+    :param int target_pad_val: padding value for target; padded positions in the target column take this value. Defaults to 0.
    """

    def __init__(self, encoding_type: str = 'bio', lower: bool = False, target_pad_val=0):
""" | """ | ||||
支持的DataSet的field为 | 支持的DataSet的field为 | ||||
.. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader | |||||
.. csv-table:: | |||||
:header: "raw_words", "target" | :header: "raw_words", "target" | ||||
"[Nadim, Ladki]", "[B-PER, I-PER]" | "[Nadim, Ladki]", "[B-PER, I-PER]" | ||||
"[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" | "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" | ||||
"[...]", "[...]" | "[...]", "[...]" | ||||
:param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 | :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 | ||||
在传入DataBundle基础上原位修改。 | 在传入DataBundle基础上原位修改。 | ||||
:return: DataBundle | :return: DataBundle | ||||
Example:: | |||||
data_bundle = Conll2003Loader().load('/path/to/conll2003/') | |||||
data_bundle = Conll2003NERPipe().process(data_bundle) | |||||
# 获取train | |||||
tr_data = data_bundle.get_dataset('train') | |||||
# 获取target这个field的词表 | |||||
target_vocab = data_bundle.get_vocab('target') | |||||
# 获取words这个field的词表 | |||||
word_vocab = data_bundle.get_vocab('words') | |||||
""" | """ | ||||
# 转换tag | # 转换tag | ||||
for name, dataset in data_bundle.datasets.items(): | for name, dataset in data_bundle.datasets.items(): | ||||
@@ -79,18 +66,6 @@ class _NERPipe(Pipe): | |||||
return data_bundle | return data_bundle | ||||
def process_from_file(self, paths) -> DataBundle: | |||||
""" | |||||
:param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。 | |||||
:return: DataBundle | |||||
""" | |||||
# 读取数据 | |||||
data_bundle = Conll2003NERLoader().load(paths) | |||||
data_bundle = self.process(data_bundle) | |||||
return data_bundle | |||||
class Conll2003NERPipe(_NERPipe):
    """

@@ -102,8 +77,8 @@ class Conll2003NERPipe(_NERPipe):
    .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
       :header: "raw_words", "words", "target", "seq_len"

-       "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2
-       "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 10
+       "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2
+       "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6
       "[...]", "[...]", "[...]", .

    raw_words is List[str], the untransformed raw data; words is List[int], the input converted to indices; target is List[int], the

@@ -134,10 +109,13 @@ class OntoNotesNERPipe(_NERPipe):
    .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
       :header: "raw_words", "words", "target", "seq_len"

-       "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2
-       "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 6
+       "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2
+       "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6
       "[...]", "[...]", "[...]", .

+    raw_words is List[str], the untransformed raw data; words is List[int], the input converted to indices; target is
+    List[int], the target converted to indices. In the returned DataSet, words, target and seq_len are set as input,
+    and target is set as target.

    :param: str encoding_type: the encoding scheme used for the target column; bioes and bio are supported.
    :param bool lower: whether to lowercase words before building the vocabulary; in the vast majority of cases this should stay False.
    :param int target_pad_val: padding value for target; padded positions in the target column take this value. Defaults to 0.
@@ -146,3 +124,124 @@ class OntoNotesNERPipe(_NERPipe):
    def process_from_file(self, paths):
        data_bundle = OntoNotesNERLoader().load(paths)
        return self.process(data_bundle)

+class _CNNERPipe(Pipe):
+    """
+    Processing Pipe for Chinese NER tasks. This Pipe (1) copies the raw_chars column and names it chars; (2) builds
+    vocabularies over the chars and target columns (creating :class:`fastNLP.Vocabulary` objects, so the returned
+    DataBundle carries two Vocabularies); (3) converts the chars and target columns to indices using the
+    corresponding Vocabulary.
+
+    raw_chars is List[str], the untransformed raw data; chars is List[int], the input converted to indices; target is
+    List[int], the target converted to indices. In the returned DataSet, chars, target and seq_len are set as input,
+    and target and seq_len are set as target.
+
+    :param: str encoding_type: the encoding scheme used for the target column; bioes and bio are supported.
+    :param int target_pad_val: padding value for target; padded positions in the target column take this value.
+        Defaults to 0.
+    """
+
+    def __init__(self, encoding_type: str = 'bio', target_pad_val=0):
+        if encoding_type == 'bio':
+            self.convert_tag = iob2
+        else:
+            self.convert_tag = lambda words: iob2bioes(iob2(words))
+        self.target_pad_val = int(target_pad_val)
+
+    def process(self, data_bundle: DataBundle) -> DataBundle:
+        """
+        Supported DataSet fields are
+
+        .. csv-table::
+           :header: "raw_chars", "target"
+
+           "[相, 比, 之, 下,...]", "[O, O, O, O, ...]"
+           "[青, 岛, 海, 牛, 队, 和, ...]", "[B-ORG, I-ORG, I-ORG, ...]"
+           "[...]", "[...]"
+
+        raw_chars is List[str], the untransformed raw data; chars is List[int], the input converted to indices;
+        target is List[int], the target converted to indices. In the returned DataSet, chars, target and seq_len are
+        set as input, and target is set as target.
+
+        :param DataBundle data_bundle: every DataSet in the DataBundle must contain the raw_words and ner fields,
+            both of type List[str]. The DataBundle is modified in place.
+        :return: DataBundle
+        """
+        # convert tags
+        for name, dataset in data_bundle.datasets.items():
+            dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET)
+
+        _add_chars_field(data_bundle, lower=False)
+
+        # index
+        _indexize(data_bundle, input_field_name=Const.CHAR_INPUT, target_field_name=Const.TARGET)
+
+        input_fields = [Const.TARGET, Const.CHAR_INPUT, Const.INPUT_LEN]
+        target_fields = [Const.TARGET, Const.INPUT_LEN]
+
+        for name, dataset in data_bundle.datasets.items():
+            dataset.set_pad_val(Const.TARGET, self.target_pad_val)
+            dataset.add_seq_len(Const.CHAR_INPUT)
+
+        data_bundle.set_input(*input_fields)
+        data_bundle.set_target(*target_fields)
+
+        return data_bundle
+
+class MsraNERPipe(_CNNERPipe):
+    """
+    Processes the MSRA-NER data. The DataSet fields after processing are
+
+    .. csv-table::
+       :header: "raw_chars", "chars", "target", "seq_len"
+
+       "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11
+       "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21
+       "[...]", "[...]", "[...]", .
+
+    raw_chars is List[str], the untransformed raw data; chars is List[int], the input converted to indices; target is
+    List[int], the target converted to indices. In the returned DataSet, chars, target and seq_len are set as input,
+    and target is set as target.
+    """
+    def process_from_file(self, paths=None) -> DataBundle:
+        data_bundle = MsraNERLoader().load(paths)
+        return self.process(data_bundle)
+
+class PeopleDailyPipe(_CNNERPipe):
+    """
+    Processes the People's Daily NER data. The DataSet fields after processing are
+
+    .. csv-table::
+       :header: "raw_chars", "chars", "target", "seq_len"
+
+       "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11
+       "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21
+       "[...]", "[...]", "[...]", .
+
+    raw_chars is List[str], the untransformed raw data; chars is List[int], the input converted to indices; target is
+    List[int], the target converted to indices. In the returned DataSet, chars, target and seq_len are set as input,
+    and target is set as target.
+    """
+    def process_from_file(self, paths=None) -> DataBundle:
+        data_bundle = PeopleDailyNERLoader().load(paths)
+        return self.process(data_bundle)
+
+class WeiboNERPipe(_CNNERPipe):
+    """
+    Processes the Weibo NER data. The DataSet fields after processing are
+
+    .. csv-table::
+       :header: "raw_chars", "chars", "target", "seq_len"
+
+       "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11
+       "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21
+       "[...]", "[...]", "[...]", .
+
+    raw_chars is List[str], the untransformed raw data; chars is List[int], the input converted to indices; target is
+    List[int], the target converted to indices. In the returned DataSet, chars, target and seq_len are set as input,
+    and target is set as target.
+
+    :param: str encoding_type: the encoding scheme used for the target column; bioes and bio are supported.
+    :param int target_pad_val: padding value for target; padded positions in the target column take this value.
+        Defaults to 0.
+    """
+    def process_from_file(self, paths=None) -> DataBundle:
+        data_bundle = WeiboNERLoader().load(paths)
+        return self.process(data_bundle)
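A minimal sketch of the intended end-to-end use of these pipes, mirroring the loader example above (calling `process_from_file()` with no paths downloads the data via the corresponding loader):

```python
from fastNLP.io.pipe import WeiboNERPipe

data_bundle = WeiboNERPipe().process_from_file()  # paths=None -> WeiboNERLoader downloads
train_data = data_bundle.get_dataset('train')     # chars/target are now index fields
char_vocab = data_bundle.get_vocab('chars')       # built by _indexize over Const.CHAR_INPUT
target_vocab = data_bundle.get_vocab('target')    # no unknown/padding entries
```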
@@ -50,8 +50,8 @@ class MatchingBertPipe(Pipe):
            dataset.drop(lambda x: x[Const.TARGET] == '-')

        for name, dataset in data_bundle.datasets.items():
-            dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0))
-            dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1))
+            dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0), )
+            dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1), )

        if self.lower:
            for name, dataset in data_bundle.datasets.items():
@@ -76,25 +76,27 @@ def _raw_split(sent):
     return sent.split()

-def _indexize(data_bundle):
+def _indexize(data_bundle, input_field_name=Const.INPUT, target_field_name=Const.TARGET):
     """
-    Build vocabularies over the "words" and "target" fields of the datasets and add them to the data_bundle.
+    Build vocabularies over the input_field_name and target_field_name fields of the datasets and add them to
+    the data_bundle.

     :param data_bundle:
+    :param str input_field_name:
+    :param str target_field_name: the vocabulary built for this field has no unknown or padding entry
     :return:
     """
     src_vocab = Vocabulary()
-    src_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT,
+    src_vocab.from_dataset(data_bundle.datasets['train'], field_name=input_field_name,
                            no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if
                                                     name != 'train'])
-    src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT)
+    src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name)

     tgt_vocab = Vocabulary(unknown=None, padding=None)
-    tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET)
-    tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.TARGET)
+    tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=target_field_name)
+    tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name)

-    data_bundle.set_vocab(src_vocab, Const.INPUT)
-    data_bundle.set_vocab(tgt_vocab, Const.TARGET)
+    data_bundle.set_vocab(src_vocab, input_field_name)
+    data_bundle.set_vocab(tgt_vocab, target_field_name)

     return data_bundle
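
With the new parameters, char-level pipes can reuse _indexize instead of duplicating the vocabulary-building code. A minimal sketch of the intended call, assuming a data_bundle that already holds a chars field (the constants and import path are the ones used elsewhere in this diff):

    from fastNLP.core.const import Const
    from fastNLP.io.pipe.utils import _indexize

    # Build vocabularies over the char-level input field instead of the default
    # word-level one; the target vocabulary is created without unk/pad entries.
    data_bundle = _indexize(data_bundle, input_field_name=Const.CHAR_INPUT,
                            target_field_name=Const.TARGET)
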
@@ -107,14 +109,30 @@ def _add_words_field(data_bundle, lower=False):
     :param bool lower: whether to lowercase the field
     :return: the DataBundle that was passed in
     """
-    for name, dataset in data_bundle.datasets.items():
-        dataset.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT)
+    data_bundle.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT, ignore_miss_dataset=True)

     if lower:
         for name, dataset in data_bundle.datasets.items():
             dataset[Const.INPUT].lower()
     return data_bundle

+def _add_chars_field(data_bundle, lower=False):
+    """
+    Copy a chars field into the datasets of the data_bundle, lowercasing it when lower is True.
+
+    :param data_bundle:
+    :param bool lower: whether to lowercase the field
+    :return: the DataBundle that was passed in
+    """
+    data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True)
+
+    if lower:
+        for name, dataset in data_bundle.datasets.items():
+            dataset[Const.CHAR_INPUT].lower()
+    return data_bundle
+
 def _drop_empty_instance(data_bundle, field_name):
     """
     Drop instances from the DataSets in data_bundle whose field_name field is empty
@@ -868,6 +868,7 @@ class _WordPieceBertModel(nn.Module):
         self._cls_index = self.tokenzier.vocab['[CLS]']
         self._sep_index = self.tokenzier.vocab['[SEP]']
+        self._wordpiece_unknown_index = self.tokenzier.vocab['[UNK]']
         self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]']  # needed when generating word_pieces
         self.pooled_cls = pooled_cls
@@ -919,7 +920,7 @@ class _WordPieceBertModel(nn.Module):
         outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1)))
         for l_index, l in enumerate(self.layers):
             bert_output = bert_outputs[l]
-            if l==len(bert_outputs) and self.pooled_cls:
+            if l in (len(bert_outputs)-1, -1) and self.pooled_cls:
                 bert_output[:, 0] = pooled_cls
             outputs[l_index] = bert_output
         return outputs
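
This one-line change is a real bug fix: self.layers holds indices into bert_outputs, and a valid index can never equal len(bert_outputs), so the pooled [CLS] vector was never written back. The new condition matches the last layer whether it is addressed as len-1 or as -1. A standalone check of the predicate, with a hypothetical three-element layer list:

    layers = ['l0', 'l1', 'l2']            # stand-in for bert_outputs
    for l in (len(layers) - 1, -1):        # the two spellings of "last layer"
        assert l != len(layers)            # old condition: never true
        assert l in (len(layers) - 1, -1)  # new condition: true for both
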
@@ -1,115 +0,0 @@
-from fastNLP.io.data_bundle import DataSetLoader, DataBundle
-from fastNLP.io import ConllLoader
-from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2
-from fastNLP import Const
-from reproduction.utils import check_dataloader_paths
-from fastNLP import Vocabulary
-
-class ChineseNERLoader(DataSetLoader):
-    """
-    Reads Chinese NER datasets, including PeopleDaily, MSRA-NER and Weibo. The data can be found at
-    https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER
-    Make sure the input data has the following format: two columns, the first holding the character and the second
-    the tag, with sentences separated by blank lines.
-
-        我 O
-        们 O
-        变 O
-        而 O
-        以 O
-        书 O
-        会 O
-        ...
-
-    """
-    def __init__(self, encoding_type: str = 'bioes'):
-        """
-        :param str encoding_type: supports the bio and bioes formats
-        """
-        super().__init__()
-        self._loader = ConllLoader(headers=['raw_chars', 'target'], indexes=[0, 1])
-        assert encoding_type in ('bio', 'bioes')
-        self._tag_converters = [iob2]
-        if encoding_type == 'bioes':
-            self._tag_converters.append(iob2bioes)
-
-    def load(self, path: str):
-        dataset = self._loader.load(path)
-
-        def convert_tag_schema(tags):
-            for converter in self._tag_converters:
-                tags = converter(tags)
-            return tags
-
-        if self._tag_converters:
-            dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET)
-        return dataset
-
-    def process(self, paths, bigrams=False, trigrams=False):
-        """
-        :param paths:
-        :param bool bigrams: whether to generate a bigram feature, [a, b, c, d] -> [ab, bc, cd, d<eos>]
-        :param bool trigrams: whether to generate a trigram feature, [a, b, c, d] -> [abc, bcd, cd<eos>, d<eos><eos>]
-        :return: ~fastNLP.io.DataBundle
-            containing the following fields:
-                raw_chars: List[str]
-                chars: List[int]
-                seq_len: int, the number of characters
-                bigrams: List[int], optional
-                trigrams: List[int], optional
-                target: List[int]
-        """
-        paths = check_dataloader_paths(paths)
-        data = DataBundle()
-        input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]
-        target_fields = [Const.TARGET, Const.INPUT_LEN]
-
-        for name, path in paths.items():
-            dataset = self.load(path)
-            if bigrams:
-                dataset.apply_field(lambda raw_chars: [c1+c2 for c1, c2 in zip(raw_chars, raw_chars[1:]+['<eos>'])],
-                                    field_name='raw_chars', new_field_name='bigrams')
-            if trigrams:
-                dataset.apply_field(lambda raw_chars: [c1+c2+c3 for c1, c2, c3 in zip(raw_chars,
-                                                                                      raw_chars[1:]+['<eos>'],
-                                                                                      raw_chars[2:]+['<eos>']*2)],
-                                    field_name='raw_chars', new_field_name='trigrams')
-            data.datasets[name] = dataset
-
-        char_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='raw_chars',
-                                               no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name != 'train'])
-        char_vocab.index_dataset(*data.datasets.values(), field_name='raw_chars', new_field_name=Const.CHAR_INPUT)
-        data.vocabs[Const.CHAR_INPUT] = char_vocab
-
-        target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(data.datasets['train'], field_name=Const.TARGET)
-        target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
-        data.vocabs[Const.TARGET] = target_vocab
-
-        if bigrams:
-            bigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='bigrams',
-                                                     no_create_entry_dataset=[dataset for name, dataset in
-                                                                              data.datasets.items() if name != 'train'])
-            bigram_vocab.index_dataset(*data.datasets.values(), field_name='bigrams', new_field_name='bigrams')
-            data.vocabs['bigrams'] = bigram_vocab
-            input_fields.append('bigrams')
-
-        if trigrams:
-            trigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='trigrams',
-                                                      no_create_entry_dataset=[dataset for name, dataset in
-                                                                               data.datasets.items() if name != 'train'])
-            trigram_vocab.index_dataset(*data.datasets.values(), field_name='trigrams', new_field_name='trigrams')
-            data.vocabs['trigrams'] = trigram_vocab
-            input_fields.append('trigrams')
-
-        for name, dataset in data.datasets.items():
-            dataset.add_seq_len(Const.CHAR_INPUT)
-            dataset.set_input(*input_fields)
-            dataset.set_target(*target_fields)
-
-        return data
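
The bigram/trigram features of the removed loader survive in the new ChineseNERPipe further down; the construction pads with '<eos>' so the feature sequence keeps the original length. A self-contained sketch of the mapping documented in the docstring above:

    raw_chars = ['a', 'b', 'c', 'd']
    bigrams = [c1 + c2 for c1, c2 in zip(raw_chars, raw_chars[1:] + ['<eos>'])]
    trigrams = [c1 + c2 + c3 for c1, c2, c3 in zip(raw_chars,
                                                   raw_chars[1:] + ['<eos>'],
                                                   raw_chars[2:] + ['<eos>'] * 2)]
    assert bigrams == ['ab', 'bc', 'cd', 'd<eos>']
    assert trigrams == ['abc', 'bcd', 'cd<eos>', 'd<eos><eos>']
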
@@ -12,22 +12,23 @@ sys.path.append('../../../')

 from torch import nn
 from fastNLP.embeddings import BertEmbedding, Embedding
-from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader
 from fastNLP import Trainer, Const
 from fastNLP import BucketSampler, SpanFPreRecMetric, GradientClipCallback
 from fastNLP.modules import MLP
 from fastNLP.core.callback import WarmupCallback
 from fastNLP import CrossEntropyLoss
 from fastNLP.core.optimizer import AdamW
+import os
+from fastNLP.io import MsraNERPipe, MsraNERLoader, WeiboNERPipe
 from fastNLP import cache_results

 encoding_type = 'bio'

-@cache_results('caches/msra.pkl')
+@cache_results('caches/weibo.pkl', _refresh=False)
 def get_data():
-    data = ChineseNERLoader(encoding_type=encoding_type).process("MSRA/")
+    # data_dir = MsraNERLoader().download(dev_ratio=0)
+    # data = MsraNERPipe(encoding_type=encoding_type, target_pad_val=-100).process_from_file(data_dir)
+    data = WeiboNERPipe(encoding_type=encoding_type).process_from_file()
     return data
 data = get_data()
 print(data)
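
cache_results, used by all of these training scripts, pickles a function's return value so that expensive preprocessing runs only once; _refresh=True forces a rebuild. A minimal sketch with a hypothetical cache path:

    from fastNLP import cache_results

    @cache_results('caches/demo.pkl', _refresh=False)
    def prepare():
        # stand-in for expensive preprocessing; must return picklable objects
        return {'train': list(range(1000))}

    # the first call computes and writes caches/demo.pkl; later calls load the
    # cache, unless _refresh=True or the file is deleted
    data = prepare()
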
@@ -35,10 +36,10 @@ print(data)
 class BertCNNER(nn.Module):
     def __init__(self, embed, tag_size):
         super().__init__()
-        self.embedding = Embedding(embed, dropout=0.1)
+        self.embedding = embed
         self.tag_size = tag_size
         self.mlp = MLP(size_layer=[self.embedding.embedding_dim, tag_size])

     def forward(self, chars):
         # batch_size, max_len = words.size()
         chars = self.embedding(chars)

@@ -46,11 +47,15 @@ class BertCNNER(nn.Module):
         return {Const.OUTPUT: outputs}

-embed = BertEmbedding(data.vocabs[Const.CHAR_INPUT], model_dir_or_name='en-base',
-                      pool_method='max', requires_grad=True, layers='11')
+    def predict(self, chars):
+        # batch_size, max_len = words.size()
+        chars = self.embedding(chars)
+        outputs = self.mlp(chars)
+        return {Const.OUTPUT: outputs}
+
+for name, dataset in data.datasets.items():
+    dataset.set_pad_val(Const.TARGET, -100)

+embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext',
+                      pool_method='first', requires_grad=True, layers='11', include_cls_sep=False, dropout=0.5)
 callbacks = [
     GradientClipCallback(clip_type='norm', clip_value=1),

@@ -58,7 +63,7 @@ callbacks = [
 ]

 model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
-optimizer = AdamW(model.parameters(), lr=1e-4)
+optimizer = AdamW(model.parameters(), lr=3e-5)

 for name, dataset in data.datasets.items():
     original_len = len(dataset)

@@ -66,13 +71,11 @@ for name, dataset in data.datasets.items():
     clipped_len = len(dataset)
     print("Delete {} instances in {}.".format(original_len-clipped_len, name))

-os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=[0, 1], dev_data=data.datasets['test'], batch_size=20,
+                  device=0, dev_data=data.datasets['test'], batch_size=6,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   loss=CrossEntropyLoss(reduction='sum'),
                   callbacks=callbacks, num_workers=2, n_epochs=5,
-                  check_code_level=-1, update_every=3)
+                  check_code_level=0, update_every=3)

 trainer.train()
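
The -100 padding value set on the target field above lines up with the default ignore_index of PyTorch's cross entropy (fastNLP's CrossEntropyLoss appears to keep the same default via its padding_idx argument), so padded positions contribute nothing to the loss. A plain-PyTorch sketch of the effect:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 4, 5).view(-1, 5)       # (batch*seq, num_tags)
    target = torch.tensor([[1, 2, -100, -100],
                           [3, -100, -100, -100]]).view(-1)
    # positions whose target equals ignore_index (-100 by default) are skipped
    loss = F.cross_entropy(logits, target)
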
@@ -1,7 +1,6 @@
 import sys
 sys.path.append('../../..')
-from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader
 from fastNLP.embeddings import StaticEmbedding
 from torch import nn
@@ -14,7 +13,51 @@ import torch.nn.functional as F
 from fastNLP import seq_len_to_mask
 from fastNLP.core.const import Const as C
 from fastNLP import SpanFPreRecMetric, Trainer
-from fastNLP import cache_results
+from fastNLP import cache_results, Vocabulary
+from fastNLP.io.pipe.utils import _add_chars_field, _indexize
+from fastNLP.io.pipe import Pipe
+from fastNLP.core.utils import iob2bioes, iob2
+from fastNLP.io import MsraNERLoader, WeiboNERLoader
+
+class ChineseNERPipe(Pipe):
+    def __init__(self, encoding_type: str = 'bio', target_pad_val=0, bigram=False):
+        if encoding_type == 'bio':
+            self.convert_tag = iob2
+        else:
+            self.convert_tag = lambda words: iob2bioes(iob2(words))
+        self.target_pad_val = int(target_pad_val)
+        self.bigram = bigram
+
+    def process(self, data_bundle):
+        data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT)
+        input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN]
+        target_fields = [C.TARGET, C.INPUT_LEN]
+        if self.bigram:
+            for dataset in data_bundle.datasets.values():
+                dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
+                                    field_name=C.CHAR_INPUT, new_field_name='bigrams')
+            bigram_vocab = Vocabulary()
+            bigram_vocab.from_dataset(data_bundle.get_dataset('train'), field_name='bigrams',
+                                      no_create_entry_dataset=[ds for name, ds in data_bundle.datasets.items()
+                                                               if name != 'train'])
+            bigram_vocab.index_dataset(*data_bundle.datasets.values(), field_name='bigrams')
+            data_bundle.set_vocab(bigram_vocab, field_name='bigrams')
+            input_fields.append('bigrams')
+
+        _add_chars_field(data_bundle, lower=False)
+        # index chars and target
+        _indexize(data_bundle, input_field_name=C.CHAR_INPUT, target_field_name=C.TARGET)
+
+        for name, dataset in data_bundle.datasets.items():
+            dataset.set_pad_val(C.TARGET, self.target_pad_val)
+            dataset.add_seq_len(C.CHAR_INPUT)
+
+        data_bundle.set_input(*input_fields)
+        data_bundle.set_target(*target_fields)
+
+        return data_bundle
 class CNBiLSTMCRFNER(nn.Module):
     def __init__(self, char_embed, num_classes, bigram_embed=None, trigram_embed=None, num_layers=1, hidden_size=100,

@@ -73,22 +116,21 @@ class CNBiLSTMCRFNER(nn.Module):
         return self._forward(chars, bigrams, trigrams, seq_len)

 # data_bundle = pickle.load(open('caches/msra.pkl', 'rb'))
-@cache_results('caches/msra.pkl', _refresh=True)
+@cache_results('caches/weibo-lstm.pkl', _refresh=False)
 def get_data():
-    data_bundle = ChineseNERLoader().process('MSRA-NER/', bigrams=True)
-    char_embed = StaticEmbedding(data_bundle.vocabs['chars'],
-                                 model_dir_or_name='cn-char')
-    bigram_embed = StaticEmbedding(data_bundle.vocabs['bigrams'],
-                                   model_dir_or_name='cn-bigram')
+    data_bundle = WeiboNERLoader().load()
+    data_bundle = ChineseNERPipe(encoding_type='bioes', bigram=True).process(data_bundle)
+    char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT), model_dir_or_name='cn-fasttext')
+    bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), embedding_dim=100, min_freq=3)
     return data_bundle, char_embed, bigram_embed
 data_bundle, char_embed, bigram_embed = get_data()
-# data_bundle = get_data()
 print(data_bundle)
 # exit(0)
+data_bundle.datasets['train'].set_input('target')
+data_bundle.datasets['dev'].set_input('target')
 model = CNBiLSTMCRFNER(char_embed, num_classes=len(data_bundle.vocabs['target']), bigram_embed=bigram_embed)

-Trainer(data_bundle.datasets['train'], model, batch_size=640,
+Trainer(data_bundle.datasets['train'], model, batch_size=20,
         metrics=SpanFPreRecMetric(data_bundle.vocabs['target'], encoding_type='bioes'),
-        num_workers=2, dev_data=data_bundle.datasets['dev'], device=3).train()
+        num_workers=2, dev_data=data_bundle.datasets['dev'], device=0).train()
@@ -2,7 +2,6 @@
 import torch
 from torch import nn
 from fastNLP import seq_len_to_mask
-from fastNLP.modules import Embedding
 from fastNLP.modules import LSTM
 from fastNLP.modules import ConditionalRandomField, allowed_transitions
 import torch.nn.functional as F
@@ -1,8 +1,7 @@
 import sys
 sys.path.append('../../..')

-from fastNLP.embeddings.embedding import CNNCharEmbedding, StaticEmbedding
-from fastNLP.core.vocabulary import VocabularyOption
+from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding

 from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
 from fastNLP import Trainer
@@ -11,68 +10,44 @@ from fastNLP import BucketSampler
 from fastNLP import Const
 from torch.optim import SGD
 from fastNLP import GradientClipCallback
-from fastNLP.core.callback import FitlogCallback, LRScheduler
+from fastNLP.core.callback import EvaluateCallback, LRScheduler
 from torch.optim.lr_scheduler import LambdaLR
-# from reproduction.seqence_labelling.ner.model.swats import SWATS
 from fastNLP import cache_results
-import fitlog
-fitlog.debug()

-from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
+from fastNLP.io.pipe.conll import Conll2003NERPipe

 encoding_type = 'bioes'

-@cache_results('caches/upper_conll2003.pkl')
+@cache_results('caches/conll2003_new.pkl', _refresh=True)
 def load_data():
-    data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
-                                                                    word_vocab_opt=VocabularyOption(min_freq=1),
-                                                                    lower=False)
+    # replace these paths with your local CoNLL-2003 files
+    paths = {'test': "NER/corpus/CoNLL-2003/eng.testb",
+             'train': "NER/corpus/CoNLL-2003/eng.train",
+             'dev': "NER/corpus/CoNLL-2003/eng.testa"}
+    data = Conll2003NERPipe(encoding_type=encoding_type, target_pad_val=0).process_from_file(paths)
     return data
 data = load_data()
 print(data)

-char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30],
-                              kernel_sizes=[3], word_dropout=0.01, dropout=0.5)
-# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30)
-word_embed = StaticEmbedding(vocab=data.vocabs['words'],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, filter_nums=[30],
+                              kernel_sizes=[3], word_dropout=0, dropout=0.5)
+word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
+                             model_dir_or_name='en-glove-6b-100d',
                              requires_grad=True, lower=True, word_dropout=0.01, dropout=0.5)
 word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()

-# import joblib
-# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib')
-# def convert_to_ids(raw_words):
-#     ids = []
-#     for word in raw_words:
-#         id = raw_data['word_to_id'][word]
-#         id = raw_data['id_to_emb_map'][id]
-#         ids.append(id)
-#     return ids
-# word_embed = raw_data['emb_matrix']
-# for name, dataset in data.datasets.items():
-#     dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

-# elmo_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
-#                            model_dir_or_name='.',
-#                            requires_grad=True, layers='mix')
-# char_embed = StackEmbedding([elmo_embed, char_embed])

 model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)

 callbacks = [
     GradientClipCallback(clip_type='value', clip_value=5),
-    FitlogCallback({'test': data.datasets['test']}, verbose=1),
-    # SaveModelCallback('save_models/', top=3, only_param=False, save_on_exception=True)
+    EvaluateCallback(data=data.get_dataset('test'))  # additionally evaluate on the test set
 ]
-# optimizer = Adam(model.parameters(), lr=0.001)
-# optimizer = SWATS(model.parameters(), verbose=True)
-optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
 scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
 callbacks.append(scheduler)

-trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(batch_size=20),
-                  device=1, dev_data=data.datasets['dev'], batch_size=20,
+trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(),
+                  device=0, dev_data=data.get_dataset('dev'), batch_size=20,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
-                  callbacks=callbacks, num_workers=2, n_epochs=100)
+                  callbacks=callbacks, num_workers=2, n_epochs=100, dev_batch_size=512)
 trainer.train()
@@ -11,52 +11,37 @@ from fastNLP import Const
 from torch.optim import SGD
 from torch.optim.lr_scheduler import LambdaLR
 from fastNLP import GradientClipCallback
-from fastNLP.core.vocabulary import VocabularyOption
-from fastNLP.core.callback import FitlogCallback, LRScheduler
-from functools import partial
-from torch import nn
+from fastNLP import BucketSampler
+from fastNLP.core.callback import EvaluateCallback, LRScheduler
 from fastNLP import cache_results
+from fastNLP.io.pipe.conll import OntoNotesNERPipe

-import fitlog
-fitlog.debug()
-fitlog.set_log_dir('logs/')
-fitlog.add_hyper_in_file(__file__)

 #######hyper
 normalize = False
-divide_std = True
 lower = False
-lr = 0.015
+lr = 0.01
 dropout = 0.5
-batch_size = 20
-init_method = 'default'
+batch_size = 32
 job_embed = False
 data_name = 'ontonote'
 #######hyper

-init_method = {'default': None,
-               'xavier': partial(nn.init.xavier_normal_, gain=0.02),
-               'normal': partial(nn.init.normal_, std=0.02)
-               }[init_method]

-from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
 encoding_type = 'bioes'

-@cache_results('caches/ontonotes.pkl')
+@cache_results('caches/ontonotes.pkl', _refresh=True)
 def cache():
-    data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('../../../../others/data/v4/english',
-                                                                      lower=lower,
-                                                                      word_vocab_opt=VocabularyOption(min_freq=1))
-    char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
-                                  kernel_sizes=[3])
+    data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file('../../../../others/data/v4/english')
+    char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30],
+                                  kernel_sizes=[3], dropout=dropout)
     word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                                 model_dir_or_name='/remote-home/hyan01/fastnlp_caches/glove.6B.100d/glove.6B.100d.txt',
+                                 model_dir_or_name='en-glove-100d',
                                  requires_grad=True,
                                  normalize=normalize,
-                                 init_method=init_method)
+                                 word_dropout=0.01,
+                                 dropout=dropout,
+                                 lower=True,
+                                 min_freq=2)
     return data, char_embed, word_embed
 data, char_embed, word_embed = cache()
@@ -67,7 +52,7 @@ model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag
 callbacks = [
     GradientClipCallback(clip_value=5, clip_type='value'),
-    FitlogCallback(data.datasets['test'], verbose=1)
+    EvaluateCallback(data.datasets['test'])
 ]

 optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)

@@ -75,8 +60,8 @@ scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.0
 callbacks.append(scheduler)

-trainer = Trainer(train_data=data.datasets['dev'][:100], model=model, optimizer=optimizer, sampler=None,
-                  device=0, dev_data=data.datasets['dev'][:100], batch_size=batch_size,
+trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100),
+                  device=0, dev_data=data.get_dataset('dev'), batch_size=batch_size,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
-                  callbacks=callbacks, num_workers=1, n_epochs=100)
+                  callbacks=callbacks, num_workers=1, n_epochs=100, dev_batch_size=256)
 trainer.train()
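
The move from sampler=None to BucketSampler here (and its use in the other training scripts) groups instances of similar length so batches carry less padding. A sketch of the construction, assuming the BucketSampler signature with num_buckets and seq_len_field_name:

    from fastNLP import BucketSampler

    # instances are binned by their 'seq_len' field and batches are drawn
    # within a bucket, so each batch mixes sequences of similar length
    sampler = BucketSampler(num_buckets=100, seq_len_field_name='seq_len')
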
@@ -0,0 +1,14 @@
+import unittest
+from fastNLP import Vocabulary
+from fastNLP.embeddings import BertEmbedding
+import torch
+import os
+
+@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
+class TestDownload(unittest.TestCase):
+    def test_download(self):
+        # import os
+        vocab = Vocabulary().add_word_lst("This is a test .".split())
+        embed = BertEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/embedding/bert-base-cased')
+        words = torch.LongTensor([[0, 1, 2]])
+        print(embed(words).size())
@@ -0,0 +1,21 @@
+import unittest
+import os
+
+from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
+
+class MSRANERTest(unittest.TestCase):
+    @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
+    def test_download(self):
+        MsraNERLoader().download(re_download=False)
+        data_bundle = MsraNERLoader().load()
+        print(data_bundle)
+
+class PeopleDailyTest(unittest.TestCase):
+    @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
+    def test_download(self):
+        PeopleDailyNERLoader().download()
+
+class WeiboNERTest(unittest.TestCase):
+    @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
+    def test_download(self):
+        WeiboNERLoader().download()
@@ -0,0 +1,12 @@
+import unittest
+import os
+
+from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe
+
+@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
+class TestPipe(unittest.TestCase):
+    def test_process_from_file(self):
+        for pipe in [MsraNERPipe, PeopleDailyPipe, WeiboNERPipe]:
+            with self.subTest(pipe=pipe):
+                print(pipe)
+                data_bundle = pipe().process_from_file()
+                print(data_bundle)