Merge branch 'dev0.5.0' of https://github.com/fastnlp/fastNLP into dev0.5.0

tags/v0.4.10
yh, 6 years ago
commit 1cc115e977
56 changed files with 4298 additions and 586 deletions
  1. fastNLP/core/vocabulary.py (+5, -0)
  2. fastNLP/io/data_loader/sst.py (+24, -19)
  3. fastNLP/io/utils.py (+69, -0)
  4. fastNLP/models/star_transformer.py (+3, -3)
  5. fastNLP/modules/aggregator/attention.py (+4, -5)
  6. fastNLP/modules/encoder/_elmo.py (+288, -170)
  7. fastNLP/modules/encoder/embedding.py (+93, -26)
  8. fastNLP/modules/encoder/star_transformer.py (+8, -5)
  9. reproduction/Star_transformer/README.md (+1, -1)
  10. reproduction/Star_transformer/datasets.py (+8, -3)
  11. reproduction/Star_transformer/run.sh (+2, -2)
  12. reproduction/Star_transformer/train.py (+38, -21)
  13. reproduction/coreference_resolution/__init__.py (+0, -0)
  14. reproduction/coreference_resolution/data_load/__init__.py (+0, -0)
  15. reproduction/coreference_resolution/data_load/cr_loader.py (+68, -0)
  16. reproduction/coreference_resolution/model/__init__.py (+0, -0)
  17. reproduction/coreference_resolution/model/config.py (+54, -0)
  18. reproduction/coreference_resolution/model/metric.py (+163, -0)
  19. reproduction/coreference_resolution/model/model_re.py (+576, -0)
  20. reproduction/coreference_resolution/model/preprocess.py (+225, -0)
  21. reproduction/coreference_resolution/model/softmax_loss.py (+32, -0)
  22. reproduction/coreference_resolution/model/util.py (+101, -0)
  23. reproduction/coreference_resolution/readme.md (+49, -0)
  24. reproduction/coreference_resolution/test/__init__.py (+0, -0)
  25. reproduction/coreference_resolution/test/test_dataloader.py (+14, -0)
  26. reproduction/coreference_resolution/train.py (+69, -0)
  27. reproduction/coreference_resolution/valid.py (+24, -0)
  28. reproduction/matching/matching_cntn.py (+105, -0)
  29. reproduction/matching/model/cntn.py (+120, -0)
  30. reproduction/seqence_labelling/ner/data/Conll2003Loader.py (+0, -93)
  31. reproduction/seqence_labelling/ner/data/OntoNoteLoader.py (+0, -152)
  32. reproduction/seqence_labelling/ner/data/utils.py (+0, -49)
  33. reproduction/seqence_labelling/ner/model/dilated_cnn.py (+142, -0)
  34. reproduction/seqence_labelling/ner/train_idcnn.py (+99, -0)
  35. reproduction/text_classification/README.md (+26, -0)
  36. reproduction/text_classification/data/IMDBLoader.py (+110, -0)
  37. reproduction/text_classification/data/MTL16Loader.py (+5, -1)
  38. reproduction/text_classification/data/SSTLoader.py (+187, -0)
  39. reproduction/text_classification/data/sstLoader.py (+187, -0)
  40. reproduction/text_classification/data/yelpLoader.py (+160, -31)
  41. reproduction/text_classification/model/HAN.py (+109, -0)
  42. reproduction/text_classification/model/awd_lstm.py (+31, -0)
  43. reproduction/text_classification/model/awdlstm_module.py (+86, -0)
  44. reproduction/text_classification/model/char_cnn.py (+90, -1)
  45. reproduction/text_classification/model/dpcnn.py (+97, -1)
  46. reproduction/text_classification/model/lstm.py (+30, -0)
  47. reproduction/text_classification/model/lstm_self_attention.py (+35, -0)
  48. reproduction/text_classification/model/weight_drop.py (+99, -0)
  49. reproduction/text_classification/train_HAN.py (+109, -0)
  50. reproduction/text_classification/train_awdlstm.py (+69, -0)
  51. reproduction/text_classification/train_char_cnn.py (+205, -0)
  52. reproduction/text_classification/train_dpcnn.py (+120, -0)
  53. reproduction/text_classification/train_lstm.py (+66, -0)
  54. reproduction/text_classification/train_lstm_att.py (+68, -0)
  55. reproduction/text_classification/utils/util_init.py (+11, -0)
  56. reproduction/utils.py (+14, -3)

fastNLP/core/vocabulary.py (+5, -0)

@@ -117,6 +117,8 @@ class Vocabulary(object):

         :param str word: the new word
         """
+        if word in self._no_create_word:
+            self._no_create_word.pop(word)
         self.add(word)

     @_check_build_status
@@ -126,6 +128,9 @@ class Vocabulary(object):

         :param list[str] word_lst: a sequence of words
         """
+        for word in word_lst:
+            if word in self._no_create_word:
+                self._no_create_word.pop(word)
         self.update(word_lst)

     def build_vocab(self):
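The two hunks above make an explicit `add_word`/`add_word_lst` call promote a word out of the internal `_no_create_word` bookkeeping. A minimal, hedged sketch of the public `Vocabulary` calls involved (the words are made up for illustration):

```python
from fastNLP import Vocabulary

vocab = Vocabulary()                                   # defaults: <pad> as padding, <unk> as unknown
vocab.add_word_lst(["the", "movie", "was", "great"])   # words from the training data
vocab.add_word("terrific")                             # explicitly added words get real entries
vocab.build_vocab()

print(vocab.to_index("movie"))    # index of a known word
print(vocab.to_index("unseen"))   # unknown words map to the <unk> index
```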


fastNLP/io/data_loader/sst.py (+24, -19)

@@ -1,10 +1,11 @@
 from typing import Iterable
 from nltk import Tree
+import spacy
 from ..base_loader import DataInfo, DataSetLoader
 from ...core.vocabulary import VocabularyOption, Vocabulary
 from ...core.dataset import DataSet
 from ...core.instance import Instance
-from ..embed_loader import EmbeddingOption, EmbedLoader
+from ..utils import check_dataloader_paths, get_tokenizer


 class SSTLoader(DataSetLoader):
@@ -34,6 +35,7 @@ class SSTLoader(DataSetLoader):
         tag_v['0'] = tag_v['1']
         tag_v['4'] = tag_v['3']
         self.tag_v = tag_v
+        self.tokenizer = get_tokenizer()

     def _load(self, path):
         """
@@ -52,29 +54,37 @@ class SSTLoader(DataSetLoader):
             ds.append(Instance(words=words, target=tag))
         return ds

-    @staticmethod
-    def _get_one(data, subtree):
+    def _get_one(self, data, subtree):
         tree = Tree.fromstring(data)
         if subtree:
-            return [(t.leaves(), t.label()) for t in tree.subtrees()]
-        return [(tree.leaves(), tree.label())]
+            return [([x.text for x in self.tokenizer(' '.join(t.leaves()))], t.label()) for t in tree.subtrees()]
+        return [([x.text for x in self.tokenizer(' '.join(tree.leaves()))], tree.label())]

     def process(self,
-                paths,
-                train_ds: Iterable[str] = None,
+                paths, train_subtree=True,
                 src_vocab_op: VocabularyOption = None,
-                tgt_vocab_op: VocabularyOption = None,
-                src_embed_op: EmbeddingOption = None):
+                tgt_vocab_op: VocabularyOption = None,):
+        paths = check_dataloader_paths(paths)
         input_name, target_name = 'words', 'target'
         src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
         tgt_vocab = Vocabulary(unknown=None, padding=None) \
             if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

-        info = DataInfo(datasets=self.load(paths))
-        _train_ds = [info.datasets[name]
-                     for name in train_ds] if train_ds else info.datasets.values()
-        src_vocab.from_dataset(*_train_ds, field_name=input_name)
-        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
+        info = DataInfo()
+        origin_subtree = self.subtree
+        self.subtree = train_subtree
+        info.datasets['train'] = self._load(paths['train'])
+        self.subtree = origin_subtree
+        for n, p in paths.items():
+            if n != 'train':
+                info.datasets[n] = self._load(p)
+
+        src_vocab.from_dataset(
+            info.datasets['train'],
+            field_name=input_name,
+            no_create_entry_dataset=[ds for n, ds in info.datasets.items() if n != 'train'])
+        tgt_vocab.from_dataset(info.datasets['train'], field_name=target_name)
+
         src_vocab.index_dataset(
             *info.datasets.values(),
             field_name=input_name, new_field_name=input_name)
@@ -86,10 +96,5 @@ class SSTLoader(DataSetLoader):
             target_name: tgt_vocab
         }

-        if src_embed_op is not None:
-            src_embed_op.vocab = src_vocab
-            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
-            info.embeddings[input_name] = init_emb
-
         return info
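With the reworked `process`, the source vocabulary is built on the train split only, and the other splits are passed as `no_create_entry_dataset`s. A hedged sketch of how the new signature might be used (the file locations are placeholders):

```python
from fastNLP.io.data_loader.sst import SSTLoader

# Hypothetical paths; anything accepted by check_dataloader_paths works,
# as long as a 'train' entry can be resolved.
paths = {
    'train': '/path/to/sst/train.txt',
    'dev': '/path/to/sst/dev.txt',
    'test': '/path/to/sst/test.txt',
}

loader = SSTLoader()
info = loader.process(paths, train_subtree=True)

train_ds = loader and info.datasets['train']   # indexed DataSet
src_vocab = info.vocabs['words']               # built on the train split only
tgt_vocab = info.vocabs['target']
```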



fastNLP/io/utils.py (+69, -0)

@@ -0,0 +1,69 @@
import os

from typing import Union, Dict


def check_dataloader_paths(paths: Union[str, Dict[str, str]]) -> Dict[str, str]:
    """
    Check that the paths passed to a dataloader are valid. If they are valid, return a dict that contains
    at least the key 'train', similar to the following:
    {
        'train': '/some/path/to/',  # always present; the vocabulary should be built on this split, the other
                                    # files only need to be processed and indexed.
        'test': 'xxx'               # may or may not be present
        ...
    }
    If paths is invalid, the corresponding error is raised directly.

    :param paths: the path(s). It can be a single file path (that file is then taken to be the train file); a
        directory, in which case a train file (a file whose name contains 'train'), test.txt and dev.txt are
        looked up inside it; or a dict whose keys are user-defined names and whose values are the file paths.
    :return:
    """
    if isinstance(paths, str):
        if os.path.isfile(paths):
            return {'train': paths}
        elif os.path.isdir(paths):
            filenames = os.listdir(paths)
            files = {}
            for filename in filenames:
                path_pair = None
                if 'train' in filename:
                    path_pair = ('train', filename)
                if 'dev' in filename:
                    if path_pair:
                        raise Exception("File:{} in {} contains both `{}` and `dev`.".format(filename, paths, path_pair[0]))
                    path_pair = ('dev', filename)
                if 'test' in filename:
                    if path_pair:
                        raise Exception("File:{} in {} contains both `{}` and `test`.".format(filename, paths, path_pair[0]))
                    path_pair = ('test', filename)
                if path_pair:
                    files[path_pair[0]] = os.path.join(paths, path_pair[1])
            return files
        else:
            raise FileNotFoundError(f"{paths} is not a valid file path.")

    elif isinstance(paths, dict):
        if paths:
            if 'train' not in paths:
                raise KeyError("You have to include `train` in your dict.")
            for key, value in paths.items():
                if isinstance(key, str) and isinstance(value, str):
                    if not os.path.isfile(value):
                        raise TypeError(f"{value} is not a valid file.")
                else:
                    raise TypeError("All keys and values in paths should be str.")
            return paths
        else:
            raise ValueError("Empty paths is not allowed.")
    else:
        raise TypeError(f"paths only supports str and dict. not {type(paths)}.")


def get_tokenizer():
    try:
        import spacy
        spacy.prefer_gpu()
        en = spacy.load('en')
        print('use spacy tokenizer')
        return lambda x: [w.text for w in en.tokenizer(x)]
    except Exception as e:
        print('use raw tokenizer')
        return lambda x: x.split()
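A short, hedged sketch of how these two helpers behave (the directory layout shown is hypothetical):

```python
from fastNLP.io.utils import check_dataloader_paths, get_tokenizer

# A single file is treated as the train split.
print(check_dataloader_paths('data/sst/train.txt'))
# {'train': 'data/sst/train.txt'}

# A directory is scanned for file names containing 'train', 'dev' or 'test'.
print(check_dataloader_paths('data/sst/'))
# e.g. {'train': 'data/sst/train.txt', 'dev': 'data/sst/dev.txt', 'test': 'data/sst/test.txt'}

# A dict is validated: it must contain 'train' and every value must be an existing file.
print(check_dataloader_paths({'train': 'data/sst/train.txt', 'dev': 'data/sst/dev.txt'}))

tokenizer = get_tokenizer()   # spacy if it is installed, otherwise str.split
print(tokenizer("A simple sentence."))
```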

fastNLP/models/star_transformer.py (+3, -3)

@@ -46,7 +46,7 @@ class StarTransEnc(nn.Module):
         super(StarTransEnc, self).__init__()
         self.embedding = get_embeddings(init_embed)
         emb_dim = self.embedding.embedding_dim
-        self.emb_fc = nn.Linear(emb_dim, hidden_size)
+        #self.emb_fc = nn.Linear(emb_dim, hidden_size)
         self.emb_drop = nn.Dropout(emb_dropout)
         self.encoder = StarTransformer(hidden_size=hidden_size,
                                        num_layers=num_layers,
@@ -65,7 +65,7 @@ class StarTransEnc(nn.Module):
             [batch, hidden] the global relay node; see the paper for details
         """
         x = self.embedding(x)
-        x = self.emb_fc(self.emb_drop(x))
+        #x = self.emb_fc(self.emb_drop(x))
         nodes, relay = self.encoder(x, mask)
         return nodes, relay

@@ -205,7 +205,7 @@ class STSeqCls(nn.Module):
                                   max_len=max_len,
                                   emb_dropout=emb_dropout,
                                   dropout=dropout)
-        self.cls = _Cls(hidden_size, num_cls, cls_hidden_size)
+        self.cls = _Cls(hidden_size, num_cls, cls_hidden_size, dropout=dropout)

     def forward(self, words, seq_len):
         """


fastNLP/modules/aggregator/attention.py (+4, -5)

@@ -19,7 +19,7 @@ class DotAttention(nn.Module):
    TODO: documentation still to be added
    """

-    def __init__(self, key_size, value_size, dropout=0):
+    def __init__(self, key_size, value_size, dropout=0.0):
        super(DotAttention, self).__init__()
        self.key_size = key_size
        self.value_size = value_size
@@ -37,7 +37,7 @@ class DotAttention(nn.Module):
        """
        output = torch.matmul(Q, K.transpose(1, 2)) / self.scale
        if mask_out is not None:
-            output.masked_fill_(mask_out, -1e8)
+            output.masked_fill_(mask_out, -1e18)
        output = self.softmax(output)
        output = self.drop(output)
        return torch.matmul(output, V)
@@ -67,9 +67,8 @@ class MultiHeadAttention(nn.Module):
        self.k_in = nn.Linear(input_size, in_size)
        self.v_in = nn.Linear(input_size, in_size)
        # follow the paper, do not apply dropout within dot-product
-        self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=0)
+        self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=dropout)
        self.out = nn.Linear(value_size * num_head, input_size)
-        self.drop = TimestepDropout(dropout)
        self.reset_parameters()

    def reset_parameters(self):
@@ -105,7 +104,7 @@ class MultiHeadAttention(nn.Module):
        # concat all heads, do output linear
        atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1)
-        output = self.drop(self.out(atte))
+        output = self.out(atte)
        return output
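For reference, a standalone sketch of the scaled dot-product step that `DotAttention.forward` performs after this change (masking with -1e18 before the softmax, dropout applied to the attention weights); the shapes used below are assumptions for the example only:

```python
import math
import torch
import torch.nn.functional as F

def dot_attention(Q, K, V, mask_out=None, dropout=0.0):
    # Q: [batch, len_q, key_size], K: [batch, len_k, key_size], V: [batch, len_k, value_size]
    scores = torch.matmul(Q, K.transpose(1, 2)) / math.sqrt(Q.size(-1))
    if mask_out is not None:
        # True positions are suppressed before the softmax
        scores = scores.masked_fill(mask_out, -1e18)
    weights = F.dropout(F.softmax(scores, dim=-1), p=dropout)
    return torch.matmul(weights, V)        # [batch, len_q, value_size]

Q, K, V = torch.rand(2, 5, 16), torch.rand(2, 7, 16), torch.rand(2, 7, 32)
mask = torch.zeros(2, 5, 7, dtype=torch.bool)
print(dot_attention(Q, K, V, mask_out=mask, dropout=0.1).shape)   # torch.Size([2, 5, 32])
```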






fastNLP/modules/encoder/_elmo.py (+288, -170)

@@ -1,12 +1,13 @@
 """
-The code in this file borrows heavily from https://github.com/HIT-SCIR/ELMoForManyLangs/tree/master/elmoformanylangs
+The code in this file borrows heavily from allenNLP
 """

 from typing import Optional, Tuple, List, Callable

 import os
-
+import h5py
+import numpy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -16,7 +17,6 @@ import json

 from ..utils import get_dropout_mask
 import codecs
-from torch import autograd


 class LstmCellWithProjection(torch.nn.Module):
     """
@@ -58,6 +58,7 @@ class LstmCellWithProjection(torch.nn.Module):
         respectively. The first dimension is 1 in order to match the Pytorch
         API for returning stacked LSTM states.
         """
+
     def __init__(self,
                  input_size: int,
                  hidden_size: int,
@@ -129,13 +130,13 @@ class LstmCellWithProjection(torch.nn.Module):
         # We have to use this '.data.new().fill_' pattern to create tensors with the correct
         # type - forward has no knowledge of whether these are torch.Tensors or torch.cuda.Tensors.
         output_accumulator = inputs.data.new(batch_size,
-                                             total_timesteps,
-                                             self.hidden_size).fill_(0)
+                                                     total_timesteps,
+                                                     self.hidden_size).fill_(0)
         if initial_state is None:
             full_batch_previous_memory = inputs.data.new(batch_size,
-                                                         self.cell_size).fill_(0)
+                                                                 self.cell_size).fill_(0)
             full_batch_previous_state = inputs.data.new(batch_size,
-                                                        self.hidden_size).fill_(0)
+                                                                self.hidden_size).fill_(0)
         else:
             full_batch_previous_state = initial_state[0].squeeze(0)
             full_batch_previous_memory = initial_state[1].squeeze(0)
@@ -169,7 +170,7 @@ class LstmCellWithProjection(torch.nn.Module):
             # Second conditional: Does the next shortest sequence beyond the current batch
             # index require computation use this timestep?
             while current_length_index < (len(batch_lengths) - 1) and \
-                    batch_lengths[current_length_index + 1] > index:
+                            batch_lengths[current_length_index + 1] > index:
                 current_length_index += 1

             # Actually get the slices of the batch which we
# Actually get the slices of the batch which we # Actually get the slices of the batch which we
@@ -256,7 +257,7 @@ class LstmbiLm(nn.Module):
         inputs = inputs[sort_idx]
         inputs = nn.utils.rnn.pack_padded_sequence(inputs, sort_lens, batch_first=self.batch_first)
         output, hx = self.encoder(inputs, None)  # -> [N,L,C]
-        output, _ = nn.util.rnn.pad_packed_sequence(output, batch_first=self.batch_first)
+        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=self.batch_first)
         _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
         output = output[unsort_idx]
         forward, backward = output.split(self.config['encoder']['dim'], 2)
@@ -316,13 +317,13 @@ class ElmobiLm(torch.nn.Module):
         :param seq_len: batch_size
         :return: torch.FloatTensor. num_layers x batch_size x max_len x hidden_size
         """
-        max_len = inputs.size(1)
         sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
         inputs = inputs[sort_idx]
         inputs = nn.utils.rnn.pack_padded_sequence(inputs, sort_lens, batch_first=True)
         output, _ = self._lstm_forward(inputs, None)
         _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
         output = output[:, unsort_idx]
+
         return output

     def _lstm_forward(self,
@@ -399,7 +400,7 @@ class ElmobiLm(torch.nn.Module):
                                      torch.cat([forward_state[1], backward_state[1]], -1)))

         stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)
-        # Stack the hidden state and memory for each layer into 2 tensors of shape
+        # Stack the hidden state and memory for each layer into 2 tensors of shape
         # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size)
         # respectively.
         final_hidden_states, final_memory_states = zip(*final_states)
@@ -408,6 +409,66 @@ class ElmobiLm(torch.nn.Module):
                              torch.cat(final_memory_states, 0))
         return stacked_sequence_outputs, final_state_tuple


def load_weights(self, weight_file: str) -> None:
"""
Load the pre-trained weights from the file.
"""
requires_grad = False

with h5py.File(weight_file, 'r') as fin:
for i_layer, lstms in enumerate(
zip(self.forward_layers, self.backward_layers)
):
for j_direction, lstm in enumerate(lstms):
# lstm is an instance of LSTMCellWithProjection
cell_size = lstm.cell_size

dataset = fin['RNN_%s' % j_direction]['RNN']['MultiRNNCell']['Cell%s' % i_layer
]['LSTMCell']

# tensorflow packs together both W and U matrices into one matrix,
# but pytorch maintains individual matrices. In addition, tensorflow
# packs the gates as input, memory, forget, output but pytorch
# uses input, forget, memory, output. So we need to modify the weights.
tf_weights = numpy.transpose(dataset['W_0'][...])
torch_weights = tf_weights.copy()

# split the W from U matrices
input_size = lstm.input_size
input_weights = torch_weights[:, :input_size]
recurrent_weights = torch_weights[:, input_size:]
tf_input_weights = tf_weights[:, :input_size]
tf_recurrent_weights = tf_weights[:, input_size:]

# handle the different gate order convention
for torch_w, tf_w in [[input_weights, tf_input_weights],
[recurrent_weights, tf_recurrent_weights]]:
torch_w[(1 * cell_size):(2 * cell_size), :] = tf_w[(2 * cell_size):(3 * cell_size), :]
torch_w[(2 * cell_size):(3 * cell_size), :] = tf_w[(1 * cell_size):(2 * cell_size), :]

lstm.input_linearity.weight.data.copy_(torch.FloatTensor(input_weights))
lstm.state_linearity.weight.data.copy_(torch.FloatTensor(recurrent_weights))
lstm.input_linearity.weight.requires_grad = requires_grad
lstm.state_linearity.weight.requires_grad = requires_grad

# the bias weights
tf_bias = dataset['B'][...]
# tensorflow adds 1.0 to forget gate bias instead of modifying the
# parameters...
tf_bias[(2 * cell_size):(3 * cell_size)] += 1
torch_bias = tf_bias.copy()
torch_bias[(1 * cell_size):(2 * cell_size)
] = tf_bias[(2 * cell_size):(3 * cell_size)]
torch_bias[(2 * cell_size):(3 * cell_size)
] = tf_bias[(1 * cell_size):(2 * cell_size)]
lstm.state_linearity.bias.data.copy_(torch.FloatTensor(torch_bias))
lstm.state_linearity.bias.requires_grad = requires_grad

# the projection weights
proj_weights = numpy.transpose(dataset['W_P_0'][...])
lstm.state_projection.weight.data.copy_(torch.FloatTensor(proj_weights))
lstm.state_projection.weight.requires_grad = requires_grad



 class LstmTokenEmbedder(nn.Module):
     def __init__(self, config, word_emb_layer, char_emb_layer):
@@ -441,7 +502,7 @@ class LstmTokenEmbedder(nn.Module):
             chars_emb = self.char_emb_layer(chars)
             # TODO seq_len should be taken into account here
             _, (chars_outputs, __) = self.char_lstm(chars_emb)
-            chars_outputs = chars_outputs.contiguous().view(-1, self.config['token_embedder']['char_dim'] * 2)
+            chars_outputs = chars_outputs.contiguous().view(-1, self.config['token_embedder']['embedding']['dim'] * 2)
             embs.append(chars_outputs)

         token_embedding = torch.cat(embs, dim=2)
@@ -450,79 +511,143 @@ class LstmTokenEmbedder(nn.Module):


 class ConvTokenEmbedder(nn.Module):
-    def __init__(self, config, word_emb_layer, char_emb_layer):
+    def __init__(self, config, weight_file, word_emb_layer, char_emb_layer, char_vocab):
         super(ConvTokenEmbedder, self).__init__()
-        self.config = config
+        self.weight_file = weight_file
         self.word_emb_layer = word_emb_layer
         self.char_emb_layer = char_emb_layer

         self.output_dim = config['encoder']['projection_dim']
-        self.emb_dim = 0
-        if word_emb_layer is not None:
-            self.emb_dim += word_emb_layer.weight.size(1)
-
-        if char_emb_layer is not None:
-            self.convolutions = []
-            cnn_config = config['token_embedder']
-            filters = cnn_config['filters']
-            char_embed_dim = cnn_config['char_dim']
-
-            for i, (width, num) in enumerate(filters):
-                conv = torch.nn.Conv1d(
-                    in_channels=char_embed_dim,
-                    out_channels=num,
-                    kernel_size=width,
-                    bias=True
-                )
-                self.convolutions.append(conv)
-
-            self.convolutions = nn.ModuleList(self.convolutions)
-
-            self.n_filters = sum(f[1] for f in filters)
-            self.n_highway = cnn_config['n_highway']
-
-            self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu)
-            self.emb_dim += self.n_filters
-
-        self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True)
+        self._options = config
+        self.requires_grad = False
+        self._load_weights()
+        self._char_embedding_weights = char_emb_layer.weight.data
+
+    def _load_weights(self):
+        self._load_cnn_weights()
+        self._load_highway()
+        self._load_projection()
+
+    def _load_cnn_weights(self):
+        cnn_options = self._options['token_embedder']
+        filters = cnn_options['filters']
+        char_embed_dim = cnn_options['embedding']['dim']
+
+        convolutions = []
+        for i, (width, num) in enumerate(filters):
+            conv = torch.nn.Conv1d(
+                in_channels=char_embed_dim,
+                out_channels=num,
+                kernel_size=width,
+                bias=True
+            )
+            # load the weights
+            with h5py.File(self.weight_file, 'r') as fin:
+                weight = fin['CNN']['W_cnn_{}'.format(i)][...]
+                bias = fin['CNN']['b_cnn_{}'.format(i)][...]
+
+                w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
+                if w_reshaped.shape != tuple(conv.weight.data.shape):
+                    raise ValueError("Invalid weight file")
+                conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
+                conv.bias.data.copy_(torch.FloatTensor(bias))
+
+            conv.weight.requires_grad = self.requires_grad
+            conv.bias.requires_grad = self.requires_grad
+
+            convolutions.append(conv)
+            self.add_module('char_conv_{}'.format(i), conv)
+
+        self._convolutions = convolutions
+
+    def _load_highway(self):
+        # the highway layers have same dimensionality as the number of cnn filters
+        cnn_options = self._options['token_embedder']
+        filters = cnn_options['filters']
+        n_filters = sum(f[1] for f in filters)
+        n_highway = cnn_options['n_highway']
+
+        # create the layers, and load the weights
+        self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
+        for k in range(n_highway):
+            # The AllenNLP highway is one matrix multplication with concatenation of
+            # transform and carry weights.
+            with h5py.File(self.weight_file, 'r') as fin:
+                # The weights are transposed due to multiplication order assumptions in tf
+                # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
+                w_transform = numpy.transpose(fin['CNN_high_{}'.format(k)]['W_transform'][...])
+                # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
+                w_carry = -1.0 * numpy.transpose(fin['CNN_high_{}'.format(k)]['W_carry'][...])
+                weight = numpy.concatenate([w_transform, w_carry], axis=0)
+                self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
+                self._highways._layers[k].weight.requires_grad = self.requires_grad
+
+                b_transform = fin['CNN_high_{}'.format(k)]['b_transform'][...]
+                b_carry = -1.0 * fin['CNN_high_{}'.format(k)]['b_carry'][...]
+                bias = numpy.concatenate([b_transform, b_carry], axis=0)
+                self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
+                self._highways._layers[k].bias.requires_grad = self.requires_grad
+
+    def _load_projection(self):
+        cnn_options = self._options['token_embedder']
+        filters = cnn_options['filters']
+        n_filters = sum(f[1] for f in filters)
+
+        self._projection = torch.nn.Linear(n_filters, self.output_dim, bias=True)
+        with h5py.File(self.weight_file, 'r') as fin:
+            weight = fin['CNN_proj']['W_proj'][...]
+            bias = fin['CNN_proj']['b_proj'][...]
+            self._projection.weight.data.copy_(torch.FloatTensor(numpy.transpose(weight)))
+            self._projection.bias.data.copy_(torch.FloatTensor(bias))
+
+            self._projection.weight.requires_grad = self.requires_grad
+            self._projection.bias.requires_grad = self.requires_grad

     def forward(self, words, chars):
-        embs = []
-        if self.word_emb_layer is not None:
-            if hasattr(self, 'words_to_words'):
-                words = self.words_to_words[words]
-            word_emb = self.word_emb_layer(words)
-            embs.append(word_emb)
+        """
+        :param words:
+        :param chars: Tensor  Shape ``(batch_size, sequence_length, 50)``:
+        :return Tensor Shape ``(batch_size, sequence_length + 2, embedding_dim)`` :
+        """
+        # the character id embedding
+        # (batch_size * sequence_length, max_chars_per_token, embed_dim)
+        # character_embedding = torch.nn.functional.embedding(
+        #     chars.view(-1, max_chars_per_token),
+        #     self._char_embedding_weights
+        # )
+        batch_size, sequence_length, max_char_len = chars.size()
+        character_embedding = self.char_emb_layer(chars).reshape(batch_size*sequence_length, max_char_len, -1)
+        # run convolutions
+        cnn_options = self._options['token_embedder']
+        if cnn_options['activation'] == 'tanh':
+            activation = torch.tanh
+        elif cnn_options['activation'] == 'relu':
+            activation = torch.nn.functional.relu
+        else:
+            raise Exception("Unknown activation")

-        if self.char_emb_layer is not None:
-            batch_size, seq_len, _ = chars.size()
-            chars = chars.view(batch_size * seq_len, -1)
-            character_embedding = self.char_emb_layer(chars)
-            character_embedding = torch.transpose(character_embedding, 1, 2)
-
-            cnn_config = self.config['token_embedder']
-            if cnn_config['activation'] == 'tanh':
-                activation = torch.nn.functional.tanh
-            elif cnn_config['activation'] == 'relu':
-                activation = torch.nn.functional.relu
-            else:
-                raise Exception("Unknown activation")
+        # (batch_size * sequence_length, embed_dim, max_chars_per_token)
+        character_embedding = torch.transpose(character_embedding, 1, 2)
+        convs = []
+        for i in range(len(self._convolutions)):
+            conv = getattr(self, 'char_conv_{}'.format(i))
+            convolved = conv(character_embedding)
+            # (batch_size * sequence_length, n_filters for this width)
+            convolved, _ = torch.max(convolved, dim=-1)
+            convolved = activation(convolved)
+            convs.append(convolved)

-            convs = []
-            for i in range(len(self.convolutions)):
-                convolved = self.convolutions[i](character_embedding)
-                # (batch_size * sequence_length, n_filters for this width)
-                convolved, _ = torch.max(convolved, dim=-1)
-                convolved = activation(convolved)
-                convs.append(convolved)
-            char_emb = torch.cat(convs, dim=-1)
-            char_emb = self.highways(char_emb)
+        # (batch_size * sequence_length, n_filters)
+        token_embedding = torch.cat(convs, dim=-1)

-            embs.append(char_emb.view(batch_size, -1, self.n_filters))
+        # apply the highway layers (batch_size * sequence_length, n_filters)
+        token_embedding = self._highways(token_embedding)

-        token_embedding = torch.cat(embs, dim=2)
+        # final projection  (batch_size * sequence_length, embedding_dim)
+        token_embedding = self._projection(token_embedding)

-        return self.projection(token_embedding)
+        # reshape to (batch_size, sequence_length+2, embedding_dim)
+        return token_embedding.view(batch_size, sequence_length, -1)




class Highway(torch.nn.Module): class Highway(torch.nn.Module):
@@ -543,6 +668,7 @@ class Highway(torch.nn.Module):
     activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``)
         The non-linearity to use in the highway layers.
     """
+
     def __init__(self,
                  input_dim: int,
                  num_layers: int = 1,
@@ -573,6 +699,7 @@ class Highway(torch.nn.Module):
         current_input = gate * linear_part + (1 - gate) * nonlinear_part
         return current_input

+
 class _ElmoModel(nn.Module):
     """
     This Module is where all the heavy lifting for ElmoEmbedding happens. Its work includes:
@@ -582,11 +709,32 @@ class _ElmoModel(nn.Module):
     (4) holding an embedding of tokens so that word representations can be cached.

     """
-    def __init__(self, model_dir:str, vocab:Vocabulary=None, cache_word_reprs:bool=False):
+
+    def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False):
         super(_ElmoModel, self).__init__()
-        config = json.load(open(os.path.join(model_dir, 'structure_config.json'), 'r'))
+
+        dir = os.walk(model_dir)
+        config_file = None
+        weight_file = None
+        config_count = 0
+        weight_count = 0
+        for path, dir_list, file_list in dir:
+            for file_name in file_list:
+                if file_name.__contains__(".json"):
+                    config_file = file_name
+                    config_count += 1
+                elif file_name.__contains__(".hdf5"):
+                    weight_file = file_name
+                    weight_count += 1
+        if config_count > 1 or weight_count > 1:
+            raise Exception(f"Multiple config files(*.json) or weight files(*.hdf5) detected in {model_dir}.")
+        elif config_count == 0 or weight_count == 0:
+            raise Exception(f"No config file or weight file found in {model_dir}")
+
+        config = json.load(open(os.path.join(model_dir, config_file), 'r'))
+        self.weight_file = os.path.join(model_dir, weight_file)
         self.config = config
+        self.requires_grad = False

         OOV_TAG = '<oov>'
         PAD_TAG = '<pad>'
@@ -595,48 +743,8 @@ class _ElmoModel(nn.Module):
         BOW_TAG = '<bow>'
         EOW_TAG = '<eow>'

-        # load the embedding here
-        token_embedder_states = torch.load(os.path.join(model_dir, 'token_embedder.pkl'), map_location='cpu')
-
-        # For the model trained with word form word encoder.
-        if config['token_embedder']['word_dim'] > 0:
-            word_lexicon = {}
-            with codecs.open(os.path.join(model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi:
-                for line in fpi:
-                    tokens = line.strip().split('\t')
-                    if len(tokens) == 1:
-                        tokens.insert(0, '\u3000')
-                    token, i = tokens
-                    word_lexicon[token] = int(i)
-            # run some sanity checks
-            for special_word in [PAD_TAG, OOV_TAG, BOS_TAG, EOS_TAG]:
-                assert special_word in word_lexicon, f"{special_word} not found in word.dic."
-            # adjust the word embedding according to vocab
-            pre_word_embedding = token_embedder_states.pop('word_emb_layer.embedding.weight')
-            word_emb_layer = nn.Embedding(len(vocab)+2, config['token_embedder']['word_dim'])  # two extra slots for <bos> and <eos>
-            found_word_count = 0
-            for word, index in vocab:
-                if index == vocab.unknown_idx:  # fastNLP's unknown is <unk> while here it is <oov>, so force an ugly adaptation
-                    index_in_pre = word_lexicon[OOV_TAG]
-                    found_word_count += 1
-                elif index == vocab.padding_idx:  # pad needs to be aligned
-                    index_in_pre = word_lexicon[PAD_TAG]
-                    found_word_count += 1
-                elif word in word_lexicon:
-                    index_in_pre = word_lexicon[word]
-                    found_word_count += 1
-                else:
-                    index_in_pre = word_lexicon[OOV_TAG]
-                word_emb_layer.weight.data[index] = pre_word_embedding[index_in_pre]
-            print(f"{found_word_count} out of {len(vocab)} words were found in pretrained elmo embedding.")
-            word_emb_layer.weight.data[-1] = pre_word_embedding[word_lexicon[EOS_TAG]]
-            word_emb_layer.weight.data[-2] = pre_word_embedding[word_lexicon[BOS_TAG]]
-            self.word_vocab = vocab
-        else:
-            word_emb_layer = None
-
         # For the model trained with character-based word encoder.
-        if config['token_embedder']['char_dim'] > 0:
+        if config['token_embedder']['embedding']['dim'] > 0:
             char_lexicon = {}
             with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
                 for line in fpi:
@@ -645,22 +753,26 @@ class _ElmoModel(nn.Module):
                         tokens.insert(0, '\u3000')
                     token, i = tokens
                     char_lexicon[token] = int(i)
+
             # run some sanity checks
             for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]:
                 assert special_word in char_lexicon, f"{special_word} not found in char.dic."
+
             # build the char_vocab from vocab
             char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG)
             # make sure <bow> and <eow> are included
-            char_vocab.add_word(BOW_TAG)
-            char_vocab.add_word(EOW_TAG)
+            char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG])
             for word, index in vocab:
                 char_vocab.add_word_lst(list(word))
-            # make sure <eos> and <bos> are also included
-            char_vocab.add_word_lst(list(BOS_TAG))
-            char_vocab.add_word_lst(list(EOS_TAG))
-            # adjust according to char_lexicon
-            char_emb_layer = nn.Embedding(len(char_vocab), int(config['token_embedder']['char_dim']))
-            pre_char_embedding = token_embedder_states.pop('char_emb_layer.embedding.weight')
+
+            self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab)+1, vocab.padding_idx
+            # adjust according to char_lexicon; one extra slot is reserved for word padding (its char representation is all zeros)
+            char_emb_layer = nn.Embedding(len(char_vocab)+1, int(config['token_embedder']['embedding']['dim']),
+                                          padding_idx=len(char_vocab))
+            with h5py.File(self.weight_file, 'r') as fin:
+                char_embed_weights = fin['char_embed'][...]
+            char_embed_weights = torch.from_numpy(char_embed_weights)
             found_char_count = 0
             for char, index in char_vocab:  # adjust the character embedding
                 if char in char_lexicon:
@@ -668,79 +780,84 @@ class _ElmoModel(nn.Module):
                     found_char_count += 1
                 else:
                     index_in_pre = char_lexicon[OOV_TAG]
-                char_emb_layer.weight.data[index] = pre_char_embedding[index_in_pre]
+                char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre]

            print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.")
            # build the mapping from words to chars
            if config['token_embedder']['name'].lower() == 'cnn':
                max_chars = config['token_embedder']['max_characters_per_token']
            elif config['token_embedder']['name'].lower() == 'lstm':
-                max_chars = max(map(lambda x: len(x[0]), vocab)) + 2 # two more are needed for <bow> and <eow>
+                max_chars = max(map(lambda x: len(x[0]), vocab)) + 2  # two more are needed for <bow> and <eow>
            else:
                raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
+            # plus 2 because <bos> and <eos> are added
            self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab)+2, max_chars),
-                                                                    fill_value=char_vocab.to_index(PAD_TAG), dtype=torch.long),
+                                                                    fill_value=len(char_vocab),
+                                                                    dtype=torch.long),
                                                         requires_grad=False)
-            for word, index in vocab:
-                if len(word)+2>max_chars:
-                    word = word[:max_chars-2]
-                if index==vocab.padding_idx:  # if it is pad, align it with the given padding
-                    word = PAD_TAG
-                elif index==vocab.unknown_idx:
-                    word = OOV_TAG
-                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [char_vocab.to_index(EOW_TAG)]
-                char_ids += [char_vocab.to_index(PAD_TAG)]*(max_chars-len(char_ids))
+            for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab)+1)]:
+                if len(word) + 2 > max_chars:
+                    word = word[:max_chars - 2]
+                if index == self._pad_index:
+                    continue
+                elif word == BOS_TAG or word == EOS_TAG:
+                    char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(word)] + [
+                        char_vocab.to_index(EOW_TAG)]
+                    char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
+                else:
+                    char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [
+                        char_vocab.to_index(EOW_TAG)]
+                    char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
                self.words_to_chars_embedding[index] = torch.LongTensor(char_ids)
-            for index, word in enumerate([BOS_TAG, EOS_TAG]):  # add <eos> and <bos>
-                if len(word)+2>max_chars:
-                    word = word[:max_chars-2]
-                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [char_vocab.to_index(EOW_TAG)]
-                char_ids += [char_vocab.to_index(PAD_TAG)]*(max_chars-len(char_ids))
-                self.words_to_chars_embedding[index+len(vocab)] = torch.LongTensor(char_ids)

            self.char_vocab = char_vocab
        else:
            char_emb_layer = None

        if config['token_embedder']['name'].lower() == 'cnn':
            self.token_embedder = ConvTokenEmbedder(
-                config, word_emb_layer, char_emb_layer)
+                config, self.weight_file, None, char_emb_layer, self.char_vocab)
        elif config['token_embedder']['name'].lower() == 'lstm':
            self.token_embedder = LstmTokenEmbedder(
-                config, word_emb_layer, char_emb_layer)
-        self.token_embedder.load_state_dict(token_embedder_states, strict=False)
-        if config['token_embedder']['word_dim'] > 0 and vocab._no_create_word_length > 0:  # a mapping is needed so that idx from dev/test point to unk
-            words_to_words = nn.Parameter(torch.arange(len(vocab)+2).long(), requires_grad=False)
+                config, None, char_emb_layer)
+
+        if config['token_embedder']['word_dim'] > 0 \
+                and vocab._no_create_word_length > 0:  # a mapping is needed so that idx from dev/test point to unk
+            words_to_words = nn.Parameter(torch.arange(len(vocab) + 2).long(), requires_grad=False)
            for word, idx in vocab:
                if vocab._is_word_no_create_entry(word):
                    words_to_words[idx] = vocab.unknown_idx
            setattr(self.token_embedder, 'words_to_words', words_to_words)
        self.output_dim = config['encoder']['projection_dim']

+        # for now, only the elmo encoder is considered
        if config['encoder']['name'].lower() == 'elmo':
            self.encoder = ElmobiLm(config)
        elif config['encoder']['name'].lower() == 'lstm':
            self.encoder = LstmbiLm(config)
-        self.encoder.load_state_dict(torch.load(os.path.join(model_dir, 'encoder.pkl'),
-                                                map_location='cpu'))
-
-        self.bos_index = len(vocab)
-        self.eos_index = len(vocab) + 1
-        self._pad_index = vocab.padding_idx
+
+        self.encoder.load_weights(self.weight_file)

        if cache_word_reprs:
-            if config['token_embedder']['char_dim']>0:  # only useful when chars are used
+            if config['token_embedder']['embedding']['dim'] > 0:  # only useful when chars are used
                print("Start to generate cache word representations.")
                batch_size = 320
-                num_batches = self.words_to_chars_embedding.size(0)//batch_size + \
-                    int(self.words_to_chars_embedding.size(0)%batch_size!=0)
-                self.cached_word_embedding = nn.Embedding(self.words_to_chars_embedding.size(0),
+                # bos eos
+                word_size = self.words_to_chars_embedding.size(0)
+                num_batches = word_size // batch_size + \
+                              int(word_size % batch_size != 0)
+
+                self.cached_word_embedding = nn.Embedding(word_size,
                                                          config['encoder']['projection_dim'])
                with torch.no_grad():
                    for i in range(num_batches):
-                        words = torch.arange(i*batch_size, min((i+1)*batch_size, self.words_to_chars_embedding.size(0))).long()
+                        words = torch.arange(i * batch_size,
+                                             min((i + 1) * batch_size, word_size)).long()
                        chars = self.words_to_chars_embedding[words].unsqueeze(1)  # batch_size x 1 x max_chars
-                        word_reprs = self.token_embedder(words.unsqueeze(1), chars).detach()  # batch_size x 1 x config['encoder']['projection_dim']
+                        word_reprs = self.token_embedder(words.unsqueeze(1),
+                                                         chars).detach()  # batch_size x 1 x config['encoder']['projection_dim']
                        self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1)

                print("Finish generating cached word representations. Going to delete the character encoder.")
                del self.token_embedder, self.words_to_chars_embedding
            else:
@@ -758,7 +875,7 @@ class _ElmoModel(nn.Module):
         seq_len = words.ne(self._pad_index).sum(dim=-1)
         expanded_words[:, 1:-1] = words
         expanded_words[:, 0].fill_(self.bos_index)
-        expanded_words[torch.arange(batch_size).to(words), seq_len+1] = self.eos_index
+        expanded_words[torch.arange(batch_size).to(words), seq_len + 1] = self.eos_index
         seq_len = seq_len + 2
         if hasattr(self, 'cached_word_embedding'):
             token_embedding = self.cached_word_embedding(expanded_words)
@@ -767,16 +884,18 @@ class _ElmoModel(nn.Module):
             chars = self.words_to_chars_embedding[expanded_words]
         else:
             chars = None
-        token_embedding = self.token_embedder(expanded_words, chars)
+        token_embedding = self.token_embedder(expanded_words, chars)  # batch_size x max_len x embed_dim

         if self.config['encoder']['name'] == 'elmo':
             encoder_output = self.encoder(token_embedding, seq_len)
-            if encoder_output.size(2) < max_len+2:
-                dummy_tensor = encoder_output.new_zeros(encoder_output.size(0), batch_size,
-                                                        max_len + 2 - encoder_output.size(2), encoder_output.size(-1))
-                encoder_output = torch.cat([encoder_output, dummy_tensor], 2)
-            sz = encoder_output.size()  # 2, batch_size, max_len, hidden_size
-            token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3])
-            encoder_output = torch.cat([token_embedding, encoder_output], dim=0)
+            if encoder_output.size(2) < max_len + 2:
+                num_layers, _, output_len, hidden_size = encoder_output.size()
+                dummy_tensor = encoder_output.new_zeros(num_layers, batch_size,
+                                                        max_len + 2 - output_len, hidden_size)
+                encoder_output = torch.cat((encoder_output, dummy_tensor), 2)
+            sz = encoder_output.size()  # 2, batch_size, max_len, hidden_size
+            token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3])
+            encoder_output = torch.cat((token_embedding, encoder_output), dim=0)
         elif self.config['encoder']['name'] == 'lstm':
             encoder_output = self.encoder(token_embedding, seq_len)
         else:
@@ -784,5 +903,4 @@ class _ElmoModel(nn.Module):

         # remove <eos> and <bos>. The removal here is not exact, but it should not affect the final result.
         encoder_output = encoder_output[:, :, 1:-1]
-
         return encoder_output

fastNLP/modules/encoder/embedding.py (+93, -26)

@@ -179,16 +179,16 @@ class StaticEmbedding(TokenEmbedding):
     :param model_dir_or_name: a pretrained static embedding can be used in two ways: either pass the file name of the
         embedding, or pass the name of the embedding. Currently supported embeddings include {`en` or `en-glove-840b-300` :
         glove.840B.300d, `en-glove-6b-50` : glove.6B.50d, `en-word2vec-300` : GoogleNews-vectors-negative300}. In the second
         case the cache is checked for the model first and it is downloaded automatically if it is not there.
-    :param requires_grad: whether a gradient is needed. Defaults to True.
-    :param init_method: how to initialize values that were not found. Any method in torch.nn.init.* can be used; the method is called with a tensor object.
-    :param normalize: whether to normalize each vector so that its norm is 1.
+    :param bool requires_grad: whether a gradient is needed. Defaults to True.
+    :param callable init_method: how to initialize values that were not found. Any method in torch.nn.init.* can be used; the method is called with a tensor object.
+    :param bool normalize: whether to normalize each vector so that its norm is 1.
+    :param bool lower: whether to lowercase the words in vocab before matching them against the pretrained vocabulary. If your
+        vocabulary contains upper-case words, or you need separate vector representations for upper-case words, set lower to False.
     """
     def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None,
-                 normalize=False):
+                 normalize=False, lower=False):
         super(StaticEmbedding, self).__init__(vocab)

-        # First decide which static embeddings need to be downloadable. A dedicated server will probably be needed for this.
         # get the cache_path
         if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
             PRETRAIN_URL = _get_base_url('static')
@@ -202,8 +202,40 @@ class StaticEmbedding(TokenEmbedding):
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")

         # load the embedding
-        embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method,
-                                          normalize=normalize)
+        if lower:
+            lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown)
+            for word, index in vocab:
+                if not vocab._is_word_no_create_entry(word):
+                    lowered_vocab.add_word(word.lower())  # first add the words that need an entry of their own
+            for word in vocab._no_create_word.keys():  # words that do not need their own entry
+                if word in vocab:
+                    lowered_word = word.lower()
+                    if lowered_word not in lowered_vocab.word_count:
+                        lowered_vocab.add_word(lowered_word)
+                        lowered_vocab._no_create_word[lowered_word] += 1
+            print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered "
+                  f"words.")
+            embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method,
+                                              normalize=normalize)
+            # an adaptation is needed here
+            if not hasattr(self, 'words_to_words'):
+                self.words_to_words = torch.arange(len(lowered_vocab, )).long()
+            if lowered_vocab.unknown:
+                unknown_idx = lowered_vocab.unknown_idx
+            else:
+                unknown_idx = embedding.size(0) - 1  # otherwise the last entry is unknown
+            words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(),
+                                          requires_grad=False)
+            for word, index in vocab:
+                if word not in lowered_vocab:
+                    word = word.lower()
+                    if lowered_vocab._is_word_no_create_entry(word):  # if it does not need an entry, it already defaults to unknown
+                        continue
+                words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)]
+            self.words_to_words = words_to_words
+        else:
+            embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method,
+                                              normalize=normalize)
         self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
                                       padding_idx=vocab.padding_idx,
                                       max_norm=None, norm_type=2, scale_grad_by_freq=False,
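A hedged sketch of the new `lower` option: the vocabulary is lowercased before it is matched against the pretrained table, and the original indices are re-mapped onto the lowered entries, so differently-cased forms share one vector. Construction might look like this (downloading `en-glove-6b-50` is assumed):

```python
import torch
from fastNLP import Vocabulary
from fastNLP.modules.encoder.embedding import StaticEmbedding

vocab = Vocabulary()
vocab.add_word_lst(["The", "the", "Movie", "movie"])

# With lower=True, "The" and "the" should end up pointing at the same pretrained vector.
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', lower=True)

words = torch.LongTensor([[vocab.to_index("The"), vocab.to_index("the")]])
print(embed(words).shape)   # (1, 2, 50) for a 50-dimensional GloVe table
```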
@@ -301,7 +333,7 @@ class StaticEmbedding(TokenEmbedding):
         if vocab._no_create_word_length > 0:
             if vocab.unknown is None:  # create a dedicated unknown
                 unknown_idx = len(matrix)
-                vectors = torch.cat([vectors, torch.zeros(1, dim)], dim=0).contiguous()
+                vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous()
             else:
                 unknown_idx = vocab.unknown_idx
             words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(),
@@ -438,19 +470,15 @@ class ElmoEmbedding(ContextualEmbedding):
     :param model_dir_or_name: a pretrained ELMo embedding can be used in two ways: either pass the file name of the ELMo
         weights, or pass the name of an ELMo version. Currently supported ELMo versions include {`en` : English ELMo,
         `cn` : Chinese ELMo}. In the second case the cache is checked for the model first and it is downloaded automatically if absent.
     :param layers: str, the layers whose results are returned, separated by ','. Use '2' for the second layer only, '1,2' for
-        the last two layers. The results of the different layers are concatenated in this order. Defaults to '2'.
-    :param requires_grad: bool, whether this layer needs a gradient. Defaults to False.
+        the last two layers. The results of the different layers are concatenated in this order. Defaults to '2'. 'mix' combines
+        the representations of the different layers with learnable weights (whether the weights are trainable follows requires_grad;
+        the initial weights mean-pool the three layers; ElmoEmbedding.set_mix_weights_requires_grad() can be used to make only the
+        mix weights trainable.)
+    :param requires_grad: bool, whether this layer needs a gradient. Defaults to False.
     :param cache_word_reprs: the word representations can optionally be cached; if set to True, an embedding is generated for
         every word at initialization and the character encoder is deleted, after which the cached embeddings are used directly.
         Defaults to False.
     """
     def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en',
                  layers: str='2', requires_grad: bool=False, cache_word_reprs: bool=False):
         super(ElmoEmbedding, self).__init__(vocab)
-        layers = list(map(int, layers.split(',')))
-        assert len(layers) > 0, "Must choose one output"
-        for layer in layers:
-            assert 0 <= layer <= 2, "Layer index should be in range [0, 2]."
-        self.layers = layers

         # check whether model_dir_or_name exists and download it if needed
         if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
@@ -464,8 +492,49 @@ class ElmoEmbedding(ContextualEmbedding):
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
         self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs)

+        if layers == 'mix':
+            self.layer_weights = nn.Parameter(torch.zeros(self.model.config['encoder']['n_layers'] + 1),
+                                              requires_grad=requires_grad)
+            self.gamma = nn.Parameter(torch.ones(1), requires_grad=requires_grad)
+            self._get_outputs = self._get_mixed_outputs
+            self._embed_size = self.model.config['encoder']['projection_dim'] * 2
+        else:
+            layers = list(map(int, layers.split(',')))
+            assert len(layers) > 0, "Must choose one output"
+            for layer in layers:
+                assert 0 <= layer <= 2, "Layer index should be in range [0, 2]."
+            self.layers = layers
+            self._get_outputs = self._get_layer_outputs
+            self._embed_size = len(self.layers) * self.model.config['encoder']['projection_dim'] * 2
+
         self.requires_grad = requires_grad
-        self._embed_size = len(self.layers) * self.model.config['encoder']['projection_dim'] * 2
+
+    def _get_mixed_outputs(self, outputs):
+        # outputs: num_layers x batch_size x max_len x hidden_size
+        # return: batch_size x max_len x hidden_size
+        weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs)
+        outputs = torch.einsum('l,lbij->bij', weights, outputs)
+        return self.gamma.to(outputs) * outputs
+
+    def set_mix_weights_requires_grad(self, flag=True):
+        """
+        When layers was set to 'mix' at initialization, this method can be used to set whether the mix weights are
+        trainable. Calling it has no effect if layers is not 'mix'.

+        :param bool flag: whether the mixed representation of the different layers can be trained.
+        :return:
+        """
+        if hasattr(self, 'layer_weights'):
+            self.layer_weights.requires_grad = flag
+            self.gamma.requires_grad = flag
+
+    def _get_layer_outputs(self, outputs):
+        if len(self.layers) == 1:
+            outputs = outputs[self.layers[0]]
+        else:
+            outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1)

+        return outputs
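A hedged sketch of the new `'mix'` mode, which blends all ELMo layers with softmax-normalized weights and a `gamma` scale instead of concatenating selected layers (downloading the `'en'` model is assumed):

```python
import torch
from fastNLP import Vocabulary
from fastNLP.modules.encoder.embedding import ElmoEmbedding

vocab = Vocabulary()
vocab.add_word_lst("the movie was great".split())

elmo = ElmoEmbedding(vocab, model_dir_or_name='en', layers='mix', requires_grad=False)
# Optionally train only the mixing weights (and gamma), keeping ELMo itself frozen:
elmo.set_mix_weights_requires_grad(True)

words = torch.LongTensor([[vocab.to_index(w) for w in "the movie was great".split()]])
print(elmo(words).shape)   # (1, 4, 2 * projection_dim)
```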


     def forward(self, words: torch.LongTensor):
         """
@@ -480,15 +549,12 @@ class ElmoEmbedding(ContextualEmbedding):
         if outputs is not None:
             return outputs
         outputs = self.model(words)
-        if len(self.layers) == 1:
-            outputs = outputs[self.layers[0]]
-        else:
-            outputs = torch.cat([*outputs[self.layers]], dim=-1)
-
-        return outputs
+        return self._get_outputs(outputs)

     def _delete_model_weights(self):
-        del self.layers, self.model
+        for name in ['layers', 'model', 'layer_weights', 'gamma']:
+            if hasattr(self, name):
+                delattr(self, name)

     @property
     def requires_grad(self):
@@ -892,10 +958,11 @@ class StackEmbedding(TokenEmbedding):
     def __init__(self, embeds: List[TokenEmbedding]):
         vocabs = []
         for embed in embeds:
-            vocabs.append(embed.get_word_vocab())
+            if hasattr(embed, 'get_word_vocab'):
+                vocabs.append(embed.get_word_vocab())
         _vocab = vocabs[0]
         for vocab in vocabs[1:]:
-            assert vocab == _vocab, "All embeddings should use the same word vocabulary."
+            assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary."

         super(StackEmbedding, self).__init__(_vocab)
         assert isinstance(embeds, list)


fastNLP/modules/encoder/star_transformer.py (+8, -5)

@@ -35,11 +35,13 @@ class StarTransformer(nn.Module):
        self.iters = num_layers
        self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)])
        self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
        self.emb_drop = nn.Dropout(dropout)
        self.ring_att = nn.ModuleList(
            [_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
            [_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0)
             for _ in range(self.iters)])
        self.star_att = nn.ModuleList(
            [_MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
            [_MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0)
             for _ in range(self.iters)])
        if max_len is not None:
@@ -66,18 +68,19 @@ class StarTransformer(nn.Module):
        smask = torch.cat([torch.zeros(B, 1, ).byte().to(mask), mask], 1)
        embs = data.permute(0, 2, 1)[:, :, :, None] # B H L 1
        if self.pos_emb:
        if self.pos_emb and False:
            P = self.pos_emb(torch.arange(L, dtype=torch.long, device=embs.device) \
                .view(1, L)).permute(0, 2, 1).contiguous()[:, :, :, None] # 1 H L 1
            embs = embs + P
        embs = norm_func(self.emb_drop, embs)
        nodes = embs
        relay = embs.mean(2, keepdim=True)
        ex_mask = mask[:, None, :, None].expand(B, H, L, 1)
        r_embs = embs.view(B, H, 1, L)
        for i in range(self.iters):
            ax = torch.cat([r_embs, relay.expand(B, H, 1, L)], 2)
            nodes = nodes + F.leaky_relu(self.ring_att[i](norm_func(self.norm[i], nodes), ax=ax))
            nodes = F.leaky_relu(self.ring_att[i](norm_func(self.norm[i], nodes), ax=ax))
            #nodes = F.leaky_relu(self.ring_att[i](nodes, ax=ax))
            relay = F.leaky_relu(self.star_att[i](relay, torch.cat([relay, nodes], 2), smask))
            nodes = nodes.masked_fill_(ex_mask, 0)
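
As a shape reference for `forward` above, the following standalone sketch (toy sizes, bool masks instead of byte masks, no attention modules) shows how the B×L×H input is rearranged to B×H×L×1 and how `smask`/`ex_mask` are built:

```python
import torch

B, L, H = 2, 4, 6                                        # batch, length, hidden
data = torch.randn(B, L, H)
mask = torch.tensor([[0, 0, 1, 1],
                     [0, 0, 0, 1]], dtype=torch.bool)    # True marks padding

embs = data.permute(0, 2, 1)[:, :, :, None]              # B x H x L x 1
smask = torch.cat([torch.zeros(B, 1, dtype=torch.bool), mask], dim=1)  # relay node + tokens
ex_mask = mask[:, None, :, None].expand(B, H, L, 1)

nodes = embs.masked_fill(ex_mask, 0)                     # zero out padded positions
print(embs.shape, smask.shape, ex_mask.shape)            # (2, 6, 4, 1) (2, 5) (2, 6, 4, 1)
```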


+ 1
- 1
reproduction/Star_transformer/README.md View File

@@ -6,7 +6,7 @@ paper: [Star-Transformer](https://arxiv.org/abs/1902.09113)
|Pos Tagging|CTB 9.0|-|ACC 92.31|
|Pos Tagging|CONLL 2012|-|ACC 96.51|
|Named Entity Recognition|CONLL 2012|-|F1 85.66|
|Text Classification|SST|-|49.18|
|Text Classification|SST|-|51.2|
|Natural Language Inference|SNLI|-|83.76|


## Usage


+ 8
- 3
reproduction/Star_transformer/datasets.py View File

@@ -51,13 +51,15 @@ def load_sst(path, files):
               for sub in [True, False, False]]
    ds_list = [loader.load(os.path.join(path, fn))
               for fn, loader in zip(files, loaders)]
    word_v = Vocabulary(min_freq=2)
    word_v = Vocabulary(min_freq=0)
    tag_v = Vocabulary(unknown=None, padding=None)
    for ds in ds_list:
        ds.apply(lambda x: [w.lower()
                            for w in x['words']], new_field_name='words')
    ds_list[0].drop(lambda x: len(x['words']) < 3)
    #ds_list[0].drop(lambda x: len(x['words']) < 3)
    update_v(word_v, ds_list[0], 'words')
    update_v(word_v, ds_list[1], 'words')
    update_v(word_v, ds_list[2], 'words')
    ds_list[0].apply(lambda x: tag_v.add_word(
        x['target']), new_field_name=None)


@@ -152,7 +154,10 @@ class EmbedLoader:
        # some words from vocab are missing in pre-trained embedding
        # we normally sample each dimension
        vocab_embed = embedding_matrix[np.where(hit_flags)]
        sampled_vectors = np.random.normal(vocab_embed.mean(axis=0), vocab_embed.std(axis=0),
        #sampled_vectors = np.random.normal(vocab_embed.mean(axis=0), vocab_embed.std(axis=0),
        #                                   size=(len(vocab) - np.sum(hit_flags), emb_dim))
        sampled_vectors = np.random.uniform(-0.01, 0.01,
                                            size=(len(vocab) - np.sum(hit_flags), emb_dim))

        embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors
        return embedding_matrix
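
The change above initializes embedding rows for out-of-vocabulary words from uniform(-0.01, 0.01) instead of a per-dimension Gaussian fit. A small numpy sketch of the new behavior, with made-up sizes and hit flags:

```python
import numpy as np

vocab_size, emb_dim = 10, 4
embedding_matrix = np.zeros((vocab_size, emb_dim))
hit_flags = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1], dtype=bool)   # words found in the file

# rows found in the pre-trained file (pretend values)
embedding_matrix[np.where(hit_flags)] = np.random.randn(hit_flags.sum(), emb_dim)

# missing rows: small uniform noise instead of sampling from the fitted normal
sampled_vectors = np.random.uniform(-0.01, 0.01,
                                    size=(vocab_size - np.sum(hit_flags), emb_dim))
embedding_matrix[np.where(~hit_flags)] = sampled_vectors
print(embedding_matrix[2])   # a freshly sampled OOV row
```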

+ 2
- 2
reproduction/Star_transformer/run.sh View File

@@ -1,5 +1,5 @@
#python -u train.py --task pos --ds conll --mode train --gpu 1 --lr 3e-4 --w_decay 2e-5 --lr_decay .95 --drop 0.3 --ep 25 --bsz 64 > conll_pos102.log 2>&1 &
#python -u train.py --task pos --ds ctb --mode train --gpu 1 --lr 3e-4 --w_decay 2e-5 --lr_decay .95 --drop 0.3 --ep 25 --bsz 64 > ctb_pos101.log 2>&1 &
#python -u train.py --task cls --ds sst --mode train --gpu 2 --lr 1e-4 --w_decay 1e-5 --lr_decay 0.9 --drop 0.5 --ep 50 --bsz 128 > sst_cls201.log &
python -u train.py --task cls --ds sst --mode train --gpu 0 --lr 1e-4 --w_decay 5e-5 --lr_decay 1.0 --drop 0.4 --ep 20 --bsz 64 > sst_cls.log &
#python -u train.py --task nli --ds snli --mode train --gpu 1 --lr 1e-4 --w_decay 1e-5 --lr_decay 0.9 --drop 0.4 --ep 120 --bsz 128 > snli_nli201.log &
python -u train.py --task ner --ds conll --mode train --gpu 0 --lr 1e-4 --w_decay 1e-5 --lr_decay 0.9 --drop 0.4 --ep 120 --bsz 64 > conll_ner201.log &
#python -u train.py --task ner --ds conll --mode train --gpu 0 --lr 1e-4 --w_decay 1e-5 --lr_decay 0.9 --drop 0.4 --ep 120 --bsz 64 > conll_ner201.log &

+ 38
- 21
reproduction/Star_transformer/train.py View File

@@ -1,4 +1,6 @@
from util import get_argparser, set_gpu, set_rng_seeds, add_model_args
seed = set_rng_seeds(15360)
print('RNG SEED {}'.format(seed))
from datasets import load_seqtag, load_sst, load_snli, EmbedLoader, MAX_LEN
import torch.nn as nn
import torch
@@ -7,8 +9,8 @@ import fastNLP as FN
from fastNLP.models.star_transformer import STSeqLabel, STSeqCls, STNLICls
from fastNLP.core.const import Const as C
import sys
sys.path.append('/remote-home/yfshao/workdir/dev_fastnlp/')
#sys.path.append('/remote-home/yfshao/workdir/dev_fastnlp/')
pre_dir = '/home/ec2-user/fast_data/'


g_model_select = {
    'pos': STSeqLabel,
@@ -17,8 +19,8 @@ g_model_select = {
    'nli': STNLICls,
}


g_emb_file_path = {'en': '/remote-home/yfshao/workdir/datasets/word_vector/glove.840B.300d.txt',
'zh': '/remote-home/yfshao/workdir/datasets/word_vector/cc.zh.300.vec'}
g_emb_file_path = {'en': pre_dir + 'glove.840B.300d.txt',
'zh': pre_dir + 'cc.zh.300.vec'}


g_args = None
g_model_cfg = None
@@ -53,7 +55,7 @@ def get_conll2012_ner():




def get_sst():
    path = '/remote-home/yfshao/workdir/datasets/SST'
    path = pre_dir + 'sst'
    files = ['train.txt', 'dev.txt', 'test.txt']
    return load_sst(path, files)


@@ -94,6 +96,7 @@ class MyCallback(FN.core.callback.Callback):
        nn.utils.clip_grad.clip_grad_norm_(self.model.parameters(), 5.0)


    def on_step_end(self):
        return
        warm_steps = 6000
        # learning rate warm-up & decay
        if self.step <= warm_steps:
@@ -108,12 +111,11 @@ class MyCallback(FN.core.callback.Callback):




def train():
    seed = set_rng_seeds(1234)
    print('RNG SEED {}'.format(seed))
    print('loading data')
    ds_list, word_v, tag_v = g_datasets['{}-{}'.format(
        g_args.ds, g_args.task)]()
    print(ds_list[0][:2])
    print(len(ds_list[0]), len(ds_list[1]), len(ds_list[2]))
    embed = load_pretrain_emb(word_v, lang='zh' if g_args.ds == 'ctb' else 'en')
    g_model_cfg['num_cls'] = len(tag_v)
    print(g_model_cfg)
@@ -123,11 +125,14 @@ def train():
    def init_model(model):
        for p in model.parameters():
            if p.size(0) != len(word_v):
nn.init.normal_(p, 0.0, 0.05)
if len(p.size())<2:
nn.init.constant_(p, 0.0)
else:
nn.init.normal_(p, 0.0, 0.05)
    init_model(model)
    train_data = ds_list[0]
    dev_data = ds_list[2]
    test_data = ds_list[1]
    dev_data = ds_list[1]
    test_data = ds_list[2]
    print(tag_v.word2idx)


    if g_args.task in ['pos', 'ner']:
@@ -145,14 +150,26 @@ def train():
    }
    metric_key, metric = metrics[g_args.task]
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
ex_param = [x for x in model.parameters(
) if x.requires_grad and x.size(0) != len(word_v)]
optim_cfg = [{'params': model.enc.embedding.parameters(), 'lr': g_args.lr*0.1},
{'params': ex_param, 'lr': g_args.lr, 'weight_decay': g_args.w_decay}, ]
trainer = FN.Trainer(train_data=train_data, model=model, optimizer=torch.optim.Adam(optim_cfg), loss=loss,
batch_size=g_args.bsz, n_epochs=g_args.ep, print_every=10, dev_data=dev_data, metrics=metric,
metric_key=metric_key, validate_every=3000, save_path=g_args.log, use_tqdm=False,
device=device, callbacks=[MyCallback()])

params = [(x,y) for x,y in list(model.named_parameters()) if y.requires_grad and y.size(0) != len(word_v)]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
print([n for n,p in params])
optim_cfg = [
#{'params': model.enc.embedding.parameters(), 'lr': g_args.lr*0.1},
{'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'lr': g_args.lr, 'weight_decay': 1.0*g_args.w_decay},
{'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'lr': g_args.lr, 'weight_decay': 0.0*g_args.w_decay}
]

print(model)
trainer = FN.Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=loss, metrics=metric, metric_key=metric_key,
optimizer=torch.optim.Adam(optim_cfg),
n_epochs=g_args.ep, batch_size=g_args.bsz, print_every=100, validate_every=1000,
device=device,
use_tqdm=False, prefetch=False,
save_path=g_args.log,
sampler=FN.BucketSampler(100, g_args.bsz, C.INPUT_LEN),
callbacks=[MyCallback()])


    trainer.train()
    tester = FN.Tester(data=test_data, model=model, metrics=metric,
@@ -195,12 +212,12 @@ def main():
        'init_embed': (None, 300),
        'num_cls': None,
        'hidden_size': g_args.hidden,
        'num_layers': 4,
        'num_layers': 2,
        'num_head': g_args.nhead,
        'head_dim': g_args.hdim,
        'max_len': MAX_LEN,
'cls_hidden_size': 600,
'emb_dropout': 0.3,
'cls_hidden_size': 200,
'emb_dropout': g_args.drop,
        'dropout': g_args.drop,
    }
    run_select[g_args.mode.lower()]()
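
The new optimizer setup above excludes biases and LayerNorm parameters from weight decay. A hedged sketch of that grouping pattern with a hypothetical stand-in module (the embedding-size filter from the diff is omitted):

```python
import torch
import torch.nn as nn

class Toy(nn.Module):
    # hypothetical module; only the parameter names matter for the grouping
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)

    def forward(self, x):
        return self.LayerNorm(self.proj(x))

model = Toy()
lr, w_decay = 1e-4, 5e-5
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

params = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
optim_cfg = [
    {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
     'lr': lr, 'weight_decay': w_decay},
    {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
     'lr': lr, 'weight_decay': 0.0},
]
optimizer = torch.optim.Adam(optim_cfg)
print([n for n, _ in params])   # ['proj.weight', 'proj.bias', 'LayerNorm.weight', 'LayerNorm.bias']
```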


+ 0
- 0
reproduction/coreference_resolution/__init__.py View File


+ 0
- 0
reproduction/coreference_resolution/data_load/__init__.py View File


+ 68
- 0
reproduction/coreference_resolution/data_load/cr_loader.py View File

@@ -0,0 +1,68 @@
from fastNLP.io.dataset_loader import JsonLoader,DataSet,Instance
from fastNLP.io.file_reader import _read_json
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.base_loader import DataInfo
from reproduction.coreference_resolution.model.config import Config
import reproduction.coreference_resolution.model.preprocess as preprocess


class CRLoader(JsonLoader):
def __init__(self, fields=None, dropna=False):
super().__init__(fields, dropna)

def _load(self, path):
"""
        Load the data.
:param path:
:return:
"""
dataset = DataSet()
for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
if self.fields:
ins = {self.fields[k]: v for k, v in d.items()}
else:
ins = d
dataset.append(Instance(**ins))
return dataset

def process(self, paths, **kwargs):
data_info = DataInfo()
for name in ['train', 'test', 'dev']:
data_info.datasets[name] = self.load(paths[name])

config = Config()
vocab = Vocabulary().from_dataset(*data_info.datasets.values(), field_name='sentences')
vocab.build_vocab()
word2id = vocab.word2idx

char_dict = preprocess.get_char_dict(config.char_path)
data_info.vocabs = vocab

genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}

for name, ds in data_info.datasets.items():
ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
config.max_sentences, is_train=name=='train')[0],
new_field_name='doc_np')
ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
config.max_sentences, is_train=name=='train')[1],
new_field_name='char_index')
ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
config.max_sentences, is_train=name=='train')[2],
new_field_name='seq_len')
ds.apply(lambda x: preprocess.speaker2numpy(x["speakers"], config.max_sentences, is_train=name=='train'),
new_field_name='speaker_ids_np')
ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')

ds.set_ignore_type('clusters')
ds.set_padder('clusters', None)
ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
ds.set_target("clusters")

# train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
# train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)

return data_info
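
For context, a minimal sketch of the fastNLP calls this loader relies on (DataSet/Instance construction, `apply`, `set_input`); the import mirrors the one at the top of this file, and the toy fields below are made up rather than the OntoNotes schema:

```python
from fastNLP.io.dataset_loader import DataSet, Instance

ds = DataSet()
ds.append(Instance(sentences=[["Hello", "world"]], doc_key="nw01"))
ds.append(Instance(sentences=[["Another", "tiny", "doc"]], doc_key="bc02"))

# derive a new field, the same pattern used above for doc_np / genre / seq_len
ds.apply(lambda x: len(x['sentences'][0]), new_field_name='seq_len')
ds.set_input('sentences', 'seq_len')
print(ds[0])
```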




+ 0
- 0
reproduction/coreference_resolution/model/__init__.py View File


+ 54
- 0
reproduction/coreference_resolution/model/config.py View File

@@ -0,0 +1,54 @@
class Config():
def __init__(self):
self.is_training = True
# path
self.glove = 'data/glove.840B.300d.txt.filtered'
self.turian = 'data/turian.50d.txt'
self.train_path = "data/train.english.jsonlines"
self.dev_path = "data/dev.english.jsonlines"
self.test_path = "data/test.english.jsonlines"
self.char_path = "data/char_vocab.english.txt"

self.cuda = "0"
self.max_word = 1500
self.epoch = 200

# config
# self.use_glove = True
# self.use_turian = True #No
self.use_elmo = False
self.use_CNN = True
self.model_heads = True #Yes
self.use_width = True # Yes
self.use_distance = True #Yes
self.use_metadata = True #Yes

self.mention_ratio = 0.4
self.max_sentences = 50
self.span_width = 10
        self.feature_size = 20  # size of the span-width feature embedding
self.lr = 0.001
self.lr_decay = 1e-3
        self.max_antecedents = 100  # this parameter is not used in mention detection
self.atten_hidden_size = 150
self.mention_hidden_size = 150
self.sa_hidden_size = 150

self.char_emb_size = 8
self.filter = [3,4,5]


# decay = 1e-5

def __str__(self):
d = self.__dict__
out = 'config==============\n'
for i in list(d):
out += i+":"
out += str(d[i])+"\n"
out+="config==============\n"
return out

if __name__=="__main__":
config = Config()
print(config)

+ 163
- 0
reproduction/coreference_resolution/model/metric.py View File

@@ -0,0 +1,163 @@
from fastNLP.core.metrics import MetricBase

import numpy as np

from collections import Counter
from sklearn.utils.linear_assignment_ import linear_assignment

"""
Mostly borrowed from https://github.com/clarkkev/deep-coref/blob/master/evaluation.py
"""



class CRMetric(MetricBase):
def __init__(self):
super().__init__()
self.evaluators = [Evaluator(m) for m in (muc, b_cubed, ceafe)]

    # TODO rename to evaluate and adjust the inputs accordingly
def evaluate(self, predicted, mention_to_predicted,clusters):
for e in self.evaluators:
e.update(predicted,mention_to_predicted, clusters)

def get_f1(self):
return sum(e.get_f1() for e in self.evaluators) / len(self.evaluators)

def get_recall(self):
return sum(e.get_recall() for e in self.evaluators) / len(self.evaluators)

def get_precision(self):
return sum(e.get_precision() for e in self.evaluators) / len(self.evaluators)

    # TODO this was originally get_prf
def get_metric(self,reset=False):
res = {"pre":self.get_precision(), "rec":self.get_recall(), "f":self.get_f1()}
self.evaluators = [Evaluator(m) for m in (muc, b_cubed, ceafe)]
return res






class Evaluator():
def __init__(self, metric, beta=1):
self.p_num = 0
self.p_den = 0
self.r_num = 0
self.r_den = 0
self.metric = metric
self.beta = beta

def update(self, predicted,mention_to_predicted,gold):
gold = gold[0].tolist()
gold = [tuple(tuple(m) for m in gc) for gc in gold]
mention_to_gold = {}
for gc in gold:
for mention in gc:
mention_to_gold[mention] = gc

if self.metric == ceafe:
pn, pd, rn, rd = self.metric(predicted, gold)
else:
pn, pd = self.metric(predicted, mention_to_gold)
rn, rd = self.metric(gold, mention_to_predicted)
self.p_num += pn
self.p_den += pd
self.r_num += rn
self.r_den += rd

def get_f1(self):
return f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=self.beta)

def get_recall(self):
return 0 if self.r_num == 0 else self.r_num / float(self.r_den)

def get_precision(self):
return 0 if self.p_num == 0 else self.p_num / float(self.p_den)

def get_prf(self):
return self.get_precision(), self.get_recall(), self.get_f1()

def get_counts(self):
return self.p_num, self.p_den, self.r_num, self.r_den



def b_cubed(clusters, mention_to_gold):
num, dem = 0, 0

for c in clusters:
if len(c) == 1:
continue

gold_counts = Counter()
correct = 0
for m in c:
if m in mention_to_gold:
gold_counts[tuple(mention_to_gold[m])] += 1
for c2, count in gold_counts.items():
if len(c2) != 1:
correct += count * count

num += correct / float(len(c))
dem += len(c)

return num, dem


def muc(clusters, mention_to_gold):
tp, p = 0, 0
for c in clusters:
p += len(c) - 1
tp += len(c)
linked = set()
for m in c:
if m in mention_to_gold:
linked.add(mention_to_gold[m])
else:
tp -= 1
tp -= len(linked)
return tp, p


def phi4(c1, c2):
return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))


def ceafe(clusters, gold_clusters):
clusters = [c for c in clusters if len(c) != 1]
scores = np.zeros((len(gold_clusters), len(clusters)))
for i in range(len(gold_clusters)):
for j in range(len(clusters)):
scores[i, j] = phi4(gold_clusters[i], clusters[j])
matching = linear_assignment(-scores)
similarity = sum(scores[matching[:, 0], matching[:, 1]])
return similarity, len(clusters), similarity, len(gold_clusters)


def lea(clusters, mention_to_gold):
num, dem = 0, 0

for c in clusters:
if len(c) == 1:
continue

common_links = 0
all_links = len(c) * (len(c) - 1) / 2.0
for i, m in enumerate(c):
if m in mention_to_gold:
for m2 in c[i + 1:]:
if m2 in mention_to_gold and mention_to_gold[m] == mention_to_gold[m2]:
common_links += 1

num += len(c) * common_links / float(all_links)
dem += len(c)

return num, dem

def f1(p_num, p_den, r_num, r_den, beta=1):
p = 0 if p_den == 0 else p_num / float(p_den)
r = 0 if r_den == 0 else r_num / float(r_den)
return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)
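
A toy MUC computation that mirrors the `muc`/`f1` functions above (the clusters and mention spans are invented for illustration):

```python
predicted = [((0, 0), (2, 2)), ((5, 5), (6, 6), (7, 7))]
gold = [((0, 0), (2, 2), (3, 3)), ((5, 5), (6, 6))]

mention_to_gold = {m: c for c in gold for m in c}
mention_to_predicted = {m: c for c in predicted for m in c}

def muc_counts(clusters, mention_to_other):
    # same link-based counting as muc() above
    tp, p = 0, 0
    for c in clusters:
        p += len(c) - 1
        tp += len(c)
        linked = set()
        for m in c:
            if m in mention_to_other:
                linked.add(mention_to_other[m])
            else:
                tp -= 1
        tp -= len(linked)
    return tp, p

p_num, p_den = muc_counts(predicted, mention_to_gold)   # 2, 3
r_num, r_den = muc_counts(gold, mention_to_predicted)   # 2, 3
precision, recall = p_num / p_den, r_num / r_den
print(precision, recall, 2 * precision * recall / (precision + recall))   # 0.667 0.667 0.667
```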

+ 576
- 0
reproduction/coreference_resolution/model/model_re.py View File

@@ -0,0 +1,576 @@
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from allennlp.commands.elmo import ElmoEmbedder
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from reproduction.coreference_resolution.model import preprocess
from fastNLP.io.embed_loader import EmbedLoader
import random

# set random seeds
torch.manual_seed(0) # cpu
torch.cuda.manual_seed(0) # gpu
np.random.seed(0) # numpy
random.seed(0)


class ffnn(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(ffnn, self).__init__()

self.f = nn.Sequential(
            # number of layers
nn.Linear(input_size, hidden_size),
nn.ReLU(inplace=True),
nn.Dropout(p=0.2),
nn.Linear(hidden_size, hidden_size),
nn.ReLU(inplace=True),
nn.Dropout(p=0.2),
nn.Linear(hidden_size, output_size)
)
self.reset_param()

def reset_param(self):
for name, param in self.named_parameters():
if param.dim() > 1:
nn.init.xavier_normal_(param)
# param.data = torch.tensor(np.random.randn(*param.shape)).float()
else:
nn.init.zeros_(param)

def forward(self, input):
return self.f(input).squeeze()


class Model(BaseModel):
def __init__(self, vocab, config):
word2id = vocab.word2idx
super(Model, self).__init__()
vocab_num = len(word2id)
self.word2id = word2id
self.config = config
self.char_dict = preprocess.get_char_dict('data/char_vocab.english.txt')
self.genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
self.device = torch.device("cuda:" + config.cuda)

self.emb = nn.Embedding(vocab_num, 350)

emb1 = EmbedLoader().load_with_vocab(config.glove, vocab,normalize=False)
emb2 = EmbedLoader().load_with_vocab(config.turian, vocab ,normalize=False)
pre_emb = np.concatenate((emb1, emb2), axis=1)
pre_emb /= (np.linalg.norm(pre_emb, axis=1, keepdims=True) + 1e-12)

if pre_emb is not None:
self.emb.weight = nn.Parameter(torch.from_numpy(pre_emb).float())
for param in self.emb.parameters():
param.requires_grad = False
self.emb_dropout = nn.Dropout(inplace=True)


if config.use_elmo:
self.elmo = ElmoEmbedder(options_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json',
weight_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
cuda_device=int(config.cuda))
print("elmo load over.")
self.elmo_args = torch.randn((3), requires_grad=True).to(self.device)

self.char_emb = nn.Embedding(len(self.char_dict), config.char_emb_size)
self.conv1 = nn.Conv1d(config.char_emb_size, 50, 3)
self.conv2 = nn.Conv1d(config.char_emb_size, 50, 4)
self.conv3 = nn.Conv1d(config.char_emb_size, 50, 5)

self.feature_emb = nn.Embedding(config.span_width, config.feature_size)
self.feature_emb_dropout = nn.Dropout(p=0.2, inplace=True)

self.mention_distance_emb = nn.Embedding(10, config.feature_size)
self.distance_drop = nn.Dropout(p=0.2, inplace=True)

self.genre_emb = nn.Embedding(7, config.feature_size)
self.speaker_emb = nn.Embedding(2, config.feature_size)

self.bilstm = VarLSTM(input_size=350+150*config.use_CNN+config.use_elmo*1024,hidden_size=200,bidirectional=True,batch_first=True,hidden_dropout=0.2)
# self.bilstm = nn.LSTM(input_size=500, hidden_size=200, bidirectional=True, batch_first=True)
self.h0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
self.c0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
self.bilstm_drop = nn.Dropout(p=0.2, inplace=True)

self.atten = ffnn(input_size=400, hidden_size=config.atten_hidden_size, output_size=1)
self.mention_score = ffnn(input_size=1320, hidden_size=config.mention_hidden_size, output_size=1)
self.sa = ffnn(input_size=3980+40*config.use_metadata, hidden_size=config.sa_hidden_size, output_size=1)
self.mention_start_np = None
self.mention_end_np = None

def _reorder_lstm(self, word_emb, seq_lens):
sort_ind = sorted(range(len(seq_lens)), key=lambda i: seq_lens[i], reverse=True)
seq_lens_re = [seq_lens[i] for i in sort_ind]
emb_seq = self.reorder_sequence(word_emb, sort_ind, batch_first=True)
packed_seq = nn.utils.rnn.pack_padded_sequence(emb_seq, seq_lens_re, batch_first=True)

h0 = self.h0.repeat(1, len(seq_lens), 1)
c0 = self.c0.repeat(1, len(seq_lens), 1)
packed_out, final_states = self.bilstm(packed_seq, (h0, c0))

lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
back_map = {ind: i for i, ind in enumerate(sort_ind)}
reorder_ind = [back_map[i] for i in range(len(seq_lens_re))]
lstm_out = self.reorder_sequence(lstm_out, reorder_ind, batch_first=True)
return lstm_out

def reorder_sequence(self, sequence_emb, order, batch_first=True):
"""
sequence_emb: [T, B, D] if not batch_first
        order: list of indices giving the desired order along the batch dimension
"""
batch_dim = 0 if batch_first else 1
assert len(order) == sequence_emb.size()[batch_dim]

order = torch.LongTensor(order)
order = order.to(sequence_emb).long()

sorted_ = sequence_emb.index_select(index=order, dim=batch_dim)

del order
return sorted_

def flat_lstm(self, lstm_out, seq_lens):
batch = lstm_out.shape[0]
seq = lstm_out.shape[1]
dim = lstm_out.shape[2]
l = [j + i * seq for i, seq_len in enumerate(seq_lens) for j in range(seq_len)]
flatted = torch.index_select(lstm_out.view(batch * seq, dim), 0, torch.LongTensor(l).to(self.device))
return flatted

def potential_mention_index(self, word_index, max_sent_len):
        # get mention indices; [3,2] means the first sentence has 3 words and the second has 2
# [0,0,0,1,1] --> [[0, 0], [0, 1], [1, 1], [1, 2], [2, 2], [3, 3], [3, 4], [4, 4]] (max =2)
potential_mention = []
for i in range(len(word_index)):
for j in range(i, i + max_sent_len):
if (j < len(word_index) and word_index[i] == word_index[j]):
potential_mention.append([i, j])
return potential_mention

def get_mention_start_end(self, seq_lens):
        # convert sentence lengths into a per-word sentence index
# [3,2] --> [0,0,0,1,1]
word_index = [0] * sum(seq_lens)
sent_index = 0
index = 0
for length in seq_lens:
for l in range(length):
word_index[index] = sent_index
index += 1
sent_index += 1

# [0,0,0,1,1]-->[[0,0],[0,1],[0,2]....]
mention_id = self.potential_mention_index(word_index, self.config.span_width)
mention_start = np.array(mention_id, dtype=int)[:, 0]
mention_end = np.array(mention_id, dtype=int)[:, 1]
return mention_start, mention_end

def get_mention_emb(self, flatten_lstm, mention_start, mention_end):
mention_start_tensor = torch.from_numpy(mention_start).to(self.device)
mention_end_tensor = torch.from_numpy(mention_end).to(self.device)
emb_start = flatten_lstm.index_select(dim=0, index=mention_start_tensor) # [mention_num,embed]
emb_end = flatten_lstm.index_select(dim=0, index=mention_end_tensor) # [mention_num,embed]
return emb_start, emb_end

def get_mask(self, mention_start, mention_end):
# big mask for attention
mention_num = mention_start.shape[0]
mask = np.zeros((mention_num, self.config.span_width)) # [mention_num,span_width]
for i in range(mention_num):
start = mention_start[i]
end = mention_end[i]
            # j iterates over the span width
for j in range(end - start + 1):
mask[i][j] = 1
mask = torch.from_numpy(mask) # [mention_num,max_mention]
# 0-->-inf 1-->0
log_mask = torch.log(mask)
return log_mask

def get_mention_index(self, mention_start, max_mention):
        # TODO may need to change later
assert len(mention_start.shape) == 1
mention_start_tensor = torch.from_numpy(mention_start)
num_mention = mention_start_tensor.shape[0]
mention_index = mention_start_tensor.expand(max_mention, num_mention).transpose(0,
1) # [num_mention,max_mention]
assert mention_index.shape[0] == num_mention
assert mention_index.shape[1] == max_mention
range_add = torch.arange(0, max_mention).expand(num_mention, max_mention).long() # [num_mention,max_mention]
mention_index = mention_index + range_add
mention_index = torch.min(mention_index, torch.LongTensor([mention_start[-1]]).expand(num_mention, max_mention))
return mention_index.to(self.device)

def sort_mention(self, mention_start, mention_end, candidate_mention_emb, candidate_mention_score, seq_lens):
        # sort by score so that higher-scoring mentions come first
mention_score, mention_ids = torch.sort(candidate_mention_score, descending=True)
preserve_mention_num = int(self.config.mention_ratio * sum(seq_lens))
mention_ids = mention_ids[0:preserve_mention_num]
mention_score = mention_score[0:preserve_mention_num]

mention_start_tensor = torch.from_numpy(mention_start).to(self.device).index_select(dim=0,
index=mention_ids) # [lamda*word_num]
mention_end_tensor = torch.from_numpy(mention_end).to(self.device).index_select(dim=0,
index=mention_ids) # [lamda*word_num]
mention_emb = candidate_mention_emb.index_select(index=mention_ids, dim=0) # [lamda*word_num,emb]
assert mention_score.shape[0] == preserve_mention_num
assert mention_start_tensor.shape[0] == preserve_mention_num
assert mention_end_tensor.shape[0] == preserve_mention_num
assert mention_emb.shape[0] == preserve_mention_num
        # TODO overlapping (crossing) spans are not handled

        # re-sort by start position so mentions appear in document order
        # TODO only the start positions are considered here, not the ends
mention_start_tensor, temp_index = torch.sort(mention_start_tensor)
mention_end_tensor = mention_end_tensor.index_select(dim=0, index=temp_index)
mention_emb = mention_emb.index_select(dim=0, index=temp_index)
mention_score = mention_score.index_select(dim=0, index=temp_index)
return mention_start_tensor, mention_end_tensor, mention_score, mention_emb

def get_antecedents(self, mention_starts, max_antecedents):
num_mention = mention_starts.shape[0]
max_antecedents = min(max_antecedents, num_mention)
        # for each mention, the indices of its candidate antecedent mentions
antecedents = np.zeros((num_mention, max_antecedents), dtype=int) # [num_mention,max_an]
        # record how many antecedents each mention actually has
antecedents_len = [0] * num_mention
for i in range(num_mention):
ante_count = 0
for j in range(max(0, i - max_antecedents), i):
antecedents[i, ante_count] = j
ante_count += 1
            # pad the remaining slots
for j in range(ante_count, max_antecedents):
antecedents[i, j] = 0
antecedents_len[i] = ante_count
assert antecedents.shape[1] == max_antecedents
return antecedents, antecedents_len

def get_antecedents_score(self, span_represent, mention_score, antecedents, antecedents_len, mention_speakers_ids,
genre):
num_mention = mention_score.shape[0]
max_antecedent = antecedents.shape[1]

pair_emb = self.get_pair_emb(span_represent, antecedents, mention_speakers_ids, genre) # [span_num,max_ant,emb]
antecedent_scores = self.sa(pair_emb)
mask01 = self.sequence_mask(antecedents_len, max_antecedent)
maskinf = torch.log(mask01).to(self.device)
assert maskinf.shape[1] <= max_antecedent
assert antecedent_scores.shape[0] == num_mention
antecedent_scores = antecedent_scores + maskinf
antecedents = torch.from_numpy(antecedents).to(self.device)
mention_scoreij = mention_score.unsqueeze(1) + torch.gather(
mention_score.unsqueeze(0).expand(num_mention, num_mention), dim=1, index=antecedents)
antecedent_scores += mention_scoreij

antecedent_scores = torch.cat([torch.zeros([mention_score.shape[0], 1]).to(self.device), antecedent_scores],
1) # [num_mentions, max_ant + 1]
return antecedent_scores

##############################
def distance_bin(self, mention_distance):
bins = torch.zeros(mention_distance.size()).byte().to(self.device)
rg = [[1, 1], [2, 2], [3, 3], [4, 4], [5, 7], [8, 15], [16, 31], [32, 63], [64, 300]]
for t, k in enumerate(rg):
i, j = k[0], k[1]
b = torch.LongTensor([i]).unsqueeze(-1).expand(mention_distance.size()).to(self.device)
m1 = torch.ge(mention_distance, b)
e = torch.LongTensor([j]).unsqueeze(-1).expand(mention_distance.size()).to(self.device)
m2 = torch.le(mention_distance, e)
bins = bins + (t + 1) * (m1 & m2)
return bins.long()

def get_distance_emb(self, antecedents_tensor):
num_mention = antecedents_tensor.shape[0]
max_ant = antecedents_tensor.shape[1]

assert max_ant <= self.config.max_antecedents
source = torch.arange(0, num_mention).expand(max_ant, num_mention).transpose(0,1).to(self.device) # [num_mention,max_ant]
mention_distance = source - antecedents_tensor
mention_distance_bin = self.distance_bin(mention_distance)
distance_emb = self.mention_distance_emb(mention_distance_bin)
distance_emb = self.distance_drop(distance_emb)
return distance_emb

def get_pair_emb(self, span_emb, antecedents, mention_speakers_ids, genre):
emb_dim = span_emb.shape[1]
num_span = span_emb.shape[0]
max_ant = antecedents.shape[1]
assert span_emb.shape[0] == antecedents.shape[0]
antecedents = torch.from_numpy(antecedents).to(self.device)

# [num_span,max_ant,emb]
antecedent_emb = torch.gather(span_emb.unsqueeze(0).expand(num_span, num_span, emb_dim), dim=1,
index=antecedents.unsqueeze(2).expand(num_span, max_ant, emb_dim))
# [num_span,max_ant,emb]
target_emb_tiled = span_emb.expand((max_ant, num_span, emb_dim))
target_emb_tiled = target_emb_tiled.transpose(0, 1)

similarity_emb = antecedent_emb * target_emb_tiled

pair_emb_list = [target_emb_tiled, antecedent_emb, similarity_emb]

# get speakers and genre
if self.config.use_metadata:
antecedent_speaker_ids = mention_speakers_ids.unsqueeze(0).expand(num_span, num_span).gather(dim=1,
index=antecedents)
same_speaker = torch.eq(mention_speakers_ids.unsqueeze(1).expand(num_span, max_ant),
antecedent_speaker_ids) # [num_mention,max_ant]
speaker_embedding = self.speaker_emb(same_speaker.long().to(self.device)) # [mention_num.max_ant,emb]
genre_embedding = self.genre_emb(
torch.LongTensor([genre]).expand(num_span, max_ant).to(self.device)) # [mention_num,max_ant,emb]
pair_emb_list.append(speaker_embedding)
pair_emb_list.append(genre_embedding)

# get distance emb
if self.config.use_distance:
distance_emb = self.get_distance_emb(antecedents)
pair_emb_list.append(distance_emb)

pair_emb = torch.cat(pair_emb_list, 2)
return pair_emb

def sequence_mask(self, len_list, max_len):
x = np.zeros((len(len_list), max_len))
for i in range(len(len_list)):
l = len_list[i]
for j in range(l):
x[i][j] = 1
return torch.from_numpy(x).float()

def logsumexp(self, value, dim=None, keepdim=False):
"""Numerically stable implementation of the operation

value.exp().sum(dim, keepdim).log()
"""
# TODO: torch.max(value, dim=None) threw an error at time of writing
if dim is not None:
m, _ = torch.max(value, dim=dim, keepdim=True)
value0 = value - m
if keepdim is False:
m = m.squeeze(dim)
return m + torch.log(torch.sum(torch.exp(value0),
dim=dim, keepdim=keepdim))
else:
m = torch.max(value)
sum_exp = torch.sum(torch.exp(value - m))

return m + torch.log(sum_exp)

def softmax_loss(self, antecedent_scores, antecedent_labels):
antecedent_labels = torch.from_numpy(antecedent_labels * 1).to(self.device)
gold_scores = antecedent_scores + torch.log(antecedent_labels.float()) # [num_mentions, max_ant + 1]
marginalized_gold_scores = self.logsumexp(gold_scores, 1) # [num_mentions]
log_norm = self.logsumexp(antecedent_scores, 1) # [num_mentions]
return torch.sum(log_norm - marginalized_gold_scores) # [num_mentions]reduce_logsumexp

def get_predicted_antecedents(self, antecedents, antecedent_scores):
predicted_antecedents = []
for i, index in enumerate(np.argmax(antecedent_scores.detach(), axis=1) - 1):
if index < 0:
predicted_antecedents.append(-1)
else:
predicted_antecedents.append(antecedents[i, index])
return predicted_antecedents

def get_predicted_clusters(self, mention_starts, mention_ends, predicted_antecedents):
mention_to_predicted = {}
predicted_clusters = []
for i, predicted_index in enumerate(predicted_antecedents):
if predicted_index < 0:
continue
assert i > predicted_index
predicted_antecedent = (int(mention_starts[predicted_index]), int(mention_ends[predicted_index]))
if predicted_antecedent in mention_to_predicted:
predicted_cluster = mention_to_predicted[predicted_antecedent]
else:
predicted_cluster = len(predicted_clusters)
predicted_clusters.append([predicted_antecedent])
mention_to_predicted[predicted_antecedent] = predicted_cluster

mention = (int(mention_starts[i]), int(mention_ends[i]))
predicted_clusters[predicted_cluster].append(mention)
mention_to_predicted[mention] = predicted_cluster

predicted_clusters = [tuple(pc) for pc in predicted_clusters]
mention_to_predicted = {m: predicted_clusters[i] for m, i in mention_to_predicted.items()}

return predicted_clusters, mention_to_predicted

def evaluate_coref(self, mention_starts, mention_ends, predicted_antecedents, gold_clusters, evaluator):
gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
mention_to_gold = {}
for gc in gold_clusters:
for mention in gc:
mention_to_gold[mention] = gc
predicted_clusters, mention_to_predicted = self.get_predicted_clusters(mention_starts, mention_ends,
predicted_antecedents)
evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
return predicted_clusters


def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
"""
        All inputs arrive as tensors.
        :param sentences: the sentences, converted to numpy by fastNLP
        :param doc_np: converted to a Tensor by fastNLP
        :param speaker_ids_np: converted to a Tensor by fastNLP
        :param genre: converted to a Tensor by fastNLP
        :param char_index: converted to a Tensor by fastNLP
        :param seq_len: converted to a Tensor by fastNLP
:return:
"""
# change for fastNLP
sentences = sentences[0].tolist()
doc_tensor = doc_np[0]
speakers_tensor = speaker_ids_np[0]
genre = genre[0].item()
char_index = char_index[0]
seq_len = seq_len[0].cpu().numpy()

        # type conversions

# doc_tensor = torch.from_numpy(doc_np).to(self.device)
# speakers_tensor = torch.from_numpy(speaker_ids_np).to(self.device)
mention_emb_list = []

word_emb = self.emb(doc_tensor)
word_emb_list = [word_emb]
if self.config.use_CNN:
# [batch, length, char_length, char_dim]
char = self.char_emb(char_index)
char_size = char.size()
# first transform to [batch *length, char_length, char_dim]
# then transpose to [batch * length, char_dim, char_length]
char = char.view(char_size[0] * char_size[1], char_size[2], char_size[3]).transpose(1, 2)

# put into cnn [batch*length, char_filters, char_length]
# then put into maxpooling [batch * length, char_filters]
char_over_cnn, _ = self.conv1(char).max(dim=2)
# reshape to [batch, length, char_filters]
char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
word_emb_list.append(char_over_cnn)

char_over_cnn, _ = self.conv2(char).max(dim=2)
char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
word_emb_list.append(char_over_cnn)

char_over_cnn, _ = self.conv3(char).max(dim=2)
char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
word_emb_list.append(char_over_cnn)

# word_emb = torch.cat(word_emb_list, dim=2)

# use elmo or not
if self.config.use_elmo:
            # if the document was actually truncated
if doc_tensor.shape[0] == 50 and len(sentences) > 50:
sentences = sentences[0:50]
elmo_embedding, elmo_mask = self.elmo.batch_to_embeddings(sentences)
elmo_embedding = elmo_embedding.to(
self.device) # [sentence_num,max_sent_len,3,1024]--[sentence_num,max_sent,1024]
elmo_embedding = elmo_embedding[:, 0, :, :] * self.elmo_args[0] + elmo_embedding[:, 1, :, :] * \
self.elmo_args[1] + elmo_embedding[:, 2, :, :] * self.elmo_args[2]
word_emb_list.append(elmo_embedding)
# print(word_emb_list[0].shape)
# print(word_emb_list[1].shape)
# print(word_emb_list[2].shape)
# print(word_emb_list[3].shape)
# print(word_emb_list[4].shape)

word_emb = torch.cat(word_emb_list, dim=2)

word_emb = self.emb_dropout(word_emb)
# word_emb_elmo = self.emb_dropout(word_emb_elmo)
lstm_out = self._reorder_lstm(word_emb, seq_len)
flatten_lstm = self.flat_lstm(lstm_out, seq_len) # [word_num,emb]
flatten_lstm = self.bilstm_drop(flatten_lstm)
        # TODO not implemented exactly as in the paper
flatten_word_emb = self.flat_lstm(word_emb, seq_len) # [word_num,emb]

mention_start, mention_end = self.get_mention_start_end(seq_len) # [mention_num]
self.mention_start_np = mention_start # [mention_num] np
self.mention_end_np = mention_end
mention_num = mention_start.shape[0]
emb_start, emb_end = self.get_mention_emb(flatten_lstm, mention_start, mention_end) # [mention_num,emb]

# list
mention_emb_list.append(emb_start)
mention_emb_list.append(emb_end)

if self.config.use_width:
mention_width_index = mention_end - mention_start
mention_width_tensor = torch.from_numpy(mention_width_index).to(self.device) # [mention_num]
mention_width_emb = self.feature_emb(mention_width_tensor)
mention_width_emb = self.feature_emb_dropout(mention_width_emb)
mention_emb_list.append(mention_width_emb)

if self.config.model_heads:
mention_index = self.get_mention_index(mention_start, self.config.span_width) # [mention_num,max_mention]
log_mask_tensor = self.get_mask(mention_start, mention_end).float().to(
self.device) # [mention_num,max_mention]
alpha = self.atten(flatten_lstm).to(self.device) # [word_num]

            # compute the attention weights
mention_head_score = torch.gather(alpha.expand(mention_num, -1), 1,
mention_index).float().to(self.device) # [mention_num,max_mention]
mention_attention = F.softmax(mention_head_score + log_mask_tensor, dim=1) # [mention_num,max_mention]

            # TODO flatten lstm
word_num = flatten_lstm.shape[0]
lstm_emb = flatten_lstm.shape[1]
emb_num = flatten_word_emb.shape[1]

# [num_mentions, max_mention_width, emb]
mention_text_emb = torch.gather(
flatten_word_emb.unsqueeze(1).expand(word_num, self.config.span_width, emb_num),
0, mention_index.unsqueeze(2).expand(mention_num, self.config.span_width,
emb_num))
# [mention_num,emb]
mention_head_emb = torch.sum(
mention_attention.unsqueeze(2).expand(mention_num, self.config.span_width, emb_num) * mention_text_emb,
dim=1)
mention_emb_list.append(mention_head_emb)

candidate_mention_emb = torch.cat(mention_emb_list, 1) # [candidate_mention_num,emb]
candidate_mention_score = self.mention_score(candidate_mention_emb) # [candidate_mention_num]

antecedent_scores, antecedents, mention_start_tensor, mention_end_tensor = (None, None, None, None)
mention_start_tensor, mention_end_tensor, mention_score, mention_emb = \
self.sort_mention(mention_start, mention_end, candidate_mention_emb, candidate_mention_score, seq_len)
mention_speakers_ids = speakers_tensor.index_select(dim=0, index=mention_start_tensor) # num_mention

antecedents, antecedents_len = self.get_antecedents(mention_start_tensor, self.config.max_antecedents)
antecedent_scores = self.get_antecedents_score(mention_emb, mention_score, antecedents, antecedents_len,
mention_speakers_ids, genre)

ans = {"candidate_mention_score": candidate_mention_score, "antecedent_scores": antecedent_scores,
"antecedents": antecedents, "mention_start_tensor": mention_start_tensor,
"mention_end_tensor": mention_end_tensor}

return ans

def predict(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
ans = self(sentences,
doc_np,
speaker_ids_np,
genre,
char_index,
seq_len)

predicted_antecedents = self.get_predicted_antecedents(ans["antecedents"], ans["antecedent_scores"])
predicted_clusters, mention_to_predicted = self.get_predicted_clusters(ans["mention_start_tensor"],
ans["mention_end_tensor"],
predicted_antecedents)

return {'predicted':predicted_clusters,"mention_to_predicted":mention_to_predicted}


if __name__ == '__main__':
pass
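
The candidate-span enumeration in `potential_mention_index` can be checked in isolation; the standalone re-implementation below reproduces the example given in its comment (`[0,0,0,1,1]` with a maximum width of 2):

```python
def potential_mention_index(word_index, span_width):
    # every (i, j) pair inside a single sentence whose width is at most span_width
    spans = []
    for i in range(len(word_index)):
        for j in range(i, i + span_width):
            if j < len(word_index) and word_index[i] == word_index[j]:
                spans.append([i, j])
    return spans

# a 3-word sentence followed by a 2-word sentence
print(potential_mention_index([0, 0, 0, 1, 1], 2))
# [[0, 0], [0, 1], [1, 1], [1, 2], [2, 2], [3, 3], [3, 4], [4, 4]]
```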

+ 225
- 0
reproduction/coreference_resolution/model/preprocess.py View File

@@ -0,0 +1,225 @@
import json
import numpy as np
from . import util
import collections

def load(path):
"""
    Load examples from a jsonlines file.
:param path:
:return: examples with many example(dict): {"clusters":[[[mention],[mention]],[another cluster]],
"doc_key":"str","speakers":[[,,,],[]...],"sentence":[[][]]}
"""
with open(path) as f:
train_examples = [json.loads(jsonline) for jsonline in f.readlines()]
return train_examples

def get_vocab():
"""
    Build the final vocabulary from all sentences; called by main and covers train, dev and test, not just train.
:param examples:
:return: word2id & id2word
"""
word2id = {'PAD':0,'UNK':1}
id2word = {0:'PAD',1:'UNK'}
index = 2
data = [load("../data/train.english.jsonlines"),load("../data/dev.english.jsonlines"),load("../data/test.english.jsonlines")]
for examples in data:
for example in examples:
for sent in example["sentences"]:
for word in sent:
if(word not in word2id):
word2id[word]=index
id2word[index] = word
index += 1
return word2id,id2word

def normalize(v):
norm = np.linalg.norm(v)
if norm > 0:
return v / norm
else:
return v

# load GloVe (and Turian) vectors to build the embedding matrix
def get_emb(id2word,embedding_size):
glove_oov = 0
turian_oov = 0
both = 0
glove_emb_path = "../data/glove.840B.300d.txt.filtered"
turian_emb_path = "../data/turian.50d.txt"
word_num = len(id2word)
emb = np.zeros((word_num,embedding_size))
glove_emb_dict = util.load_embedding_dict(glove_emb_path,300,"txt")
turian_emb_dict = util.load_embedding_dict(turian_emb_path,50,"txt")
for i in range(word_num):
if id2word[i] in glove_emb_dict:
word_embedding = glove_emb_dict.get(id2word[i])
emb[i][0:300] = np.array(word_embedding)
else:
# print(id2word[i])
glove_oov += 1
if id2word[i] in turian_emb_dict:
word_embedding = turian_emb_dict.get(id2word[i])
emb[i][300:350] = np.array(word_embedding)
else:
# print(id2word[i])
turian_oov += 1
if id2word[i] not in glove_emb_dict and id2word[i] not in turian_emb_dict:
both += 1
emb[i] = normalize(emb[i])
print("embedding num:"+str(word_num))
print("glove num:"+str(glove_oov))
print("glove oov rate:"+str(glove_oov/word_num))
print("turian num:"+str(turian_oov))
print("turian oov rate:"+str(turian_oov/word_num))
print("both num:"+str(both))
return emb


def _doc2vec(doc,word2id,char_dict,max_filter,max_sentences,is_train):
max_len = 0
max_word_length = 0
docvex = []
length = []
if is_train:
sent_num = min(max_sentences,len(doc))
else:
sent_num = len(doc)

for i in range(sent_num):
sent = doc[i]
length.append(len(sent))
if (len(sent) > max_len):
max_len = len(sent)
sent_vec =[]
for j,word in enumerate(sent):
if len(word)>max_word_length:
max_word_length = len(word)
if word in word2id:
sent_vec.append(word2id[word])
else:
sent_vec.append(word2id["UNK"])
docvex.append(sent_vec)

char_index = np.zeros((sent_num, max_len, max_word_length),dtype=int)
for i in range(sent_num):
sent = doc[i]
for j,word in enumerate(sent):
char_index[i, j, :len(word)] = [char_dict[c] for c in word]

return docvex,char_index,length,max_len

# TODO the interface changed; make sure every call site has been updated
def doc2numpy(doc,word2id,chardict,max_filter,max_sentences,is_train):
docvec, char_index, length, max_len = _doc2vec(doc,word2id,chardict,max_filter,max_sentences,is_train)
assert max(length) == max_len
assert char_index.shape[0]==len(length)
assert char_index.shape[1]==max_len
doc_np = np.zeros((len(docvec), max_len), int)
for i in range(len(docvec)):
for j in range(len(docvec[i])):
doc_np[i][j] = docvec[i][j]
return doc_np,char_index,length

# TODO not tested
def speaker2numpy(speakers_raw,max_sentences,is_train):
if is_train and len(speakers_raw)> max_sentences:
speakers_raw = speakers_raw[0:max_sentences]
speakers = flatten(speakers_raw)
speaker_dict = {s: i for i, s in enumerate(set(speakers))}
speaker_ids = np.array([speaker_dict[s] for s in speakers])
return speaker_ids


def flat_cluster(clusters):
flatted = []
for cluster in clusters:
for item in cluster:
flatted.append(item)
return flatted

def get_right_mention(clusters,mention_start_np,mention_end_np):
flatted = flat_cluster(clusters)
cluster_num = len(flatted)
mention_num = mention_start_np.shape[0]
right_mention = np.zeros(mention_num,dtype=int)
for i in range(mention_num):
if [mention_start_np[i],mention_end_np[i]] in flatted:
right_mention[i]=1
return right_mention,cluster_num

def handle_cluster(clusters):
gold_mentions = sorted(tuple(m) for m in flatten(clusters))
gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
cluster_ids = np.zeros(len(gold_mentions), dtype=int)
for cluster_id, cluster in enumerate(clusters):
for mention in cluster:
cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id
gold_starts, gold_ends = tensorize_mentions(gold_mentions)
return cluster_ids, gold_starts, gold_ends

# flatten a nested list
def flatten(l):
return [item for sublist in l for item in sublist]

# split mentions into start and end arrays
def tensorize_mentions(mentions):
if len(mentions) > 0:
starts, ends = zip(*mentions)
else:
starts, ends = [], []
return np.array(starts), np.array(ends)

def get_char_dict(path):
vocab = ["<UNK>"]
with open(path) as f:
vocab.extend(c.strip() for c in f.readlines())
char_dict = collections.defaultdict(int)
char_dict.update({c: i for i, c in enumerate(vocab)})
return char_dict

def get_labels(clusters,mention_starts,mention_ends,max_antecedents):
cluster_ids, gold_starts, gold_ends = handle_cluster(clusters)
num_mention = mention_starts.shape[0]
num_gold = gold_starts.shape[0]
max_antecedents = min(max_antecedents, num_mention)
mention_indices = {}

for i in range(num_mention):
mention_indices[(mention_starts[i].detach().item(), mention_ends[i].detach().item())] = i
    # records which mentions are correct: -1 means wrong, a non-negative value is the id of the gold cluster it belongs to
mention_cluster_ids = [-1] * num_mention
# test
right_mention_count = 0
for i in range(num_gold):
right_mention = mention_indices.get((gold_starts[i], gold_ends[i]))
if (right_mention != None):
right_mention_count += 1
mention_cluster_ids[right_mention] = cluster_ids[i]

    # whether mentions i and j belong to the same cluster
labels = np.zeros((num_mention, max_antecedents + 1), dtype=bool) # [num_mention,max_an+1]
for i in range(num_mention):
ante_count = 0
null_label = True
for j in range(max(0, i - max_antecedents), i):
if (mention_cluster_ids[i] >= 0 and mention_cluster_ids[i] == mention_cluster_ids[j]):
labels[i, ante_count + 1] = True
null_label = False
else:
labels[i, ante_count + 1] = False
ante_count += 1
for j in range(ante_count, max_antecedents):
labels[i, j + 1] = False
labels[i, 0] = null_label
return labels

# test===========================


if __name__=="__main__":
word2id,id2word = get_vocab()
get_emb(id2word,350)
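
A small self-contained illustration of what `handle_cluster`/`tensorize_mentions` above compute, on invented gold clusters:

```python
import numpy as np

clusters = [[[0, 1], [4, 4]],    # e.g. "the cat ... it"
            [[7, 8]]]            # a second, singleton cluster

gold_mentions = sorted(tuple(m) for cluster in clusters for m in cluster)
gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
cluster_ids = np.zeros(len(gold_mentions), dtype=int)
for cluster_id, cluster in enumerate(clusters):
    for mention in cluster:
        cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id
gold_starts, gold_ends = map(np.array, zip(*gold_mentions))

print(gold_starts)    # [0 4 7]
print(gold_ends)      # [1 4 8]
print(cluster_ids)    # [0 0 1]
```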



+ 32
- 0
reproduction/coreference_resolution/model/softmax_loss.py View File

@@ -0,0 +1,32 @@
from fastNLP.core.losses import LossBase

from reproduction.coreference_resolution.model.preprocess import get_labels
from reproduction.coreference_resolution.model.config import Config
import torch


class SoftmaxLoss(LossBase):
"""
    Cross-entropy loss.
    Supports multi-label classification.
"""

def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None):
"""

:param pred:
:param target:
"""
super().__init__()
self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters,
mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor)

def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor):
antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor,
Config().max_antecedents)

antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda))
gold_scores = antecedent_scores + torch.log(antecedent_labels.float()).to(torch.device("cuda:" + Config().cuda)) # [num_mentions, max_ant + 1]
marginalized_gold_scores = gold_scores.logsumexp(dim=1) # [num_mentions]
log_norm = antecedent_scores.logsumexp(dim=1) # [num_mentions]
return torch.sum(log_norm - marginalized_gold_scores)
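
The loss above marginalizes over all gold antecedents: log-sum-exp over every candidate score minus log-sum-exp over the gold-labelled scores. A torch-only sketch of that arithmetic on made-up scores and labels:

```python
import torch

# scores for the dummy antecedent (column 0) plus three candidates, for two mentions
antecedent_scores = torch.tensor([[0.0, 2.0, -1.0, 0.5],
                                  [0.0, -0.5, 1.5, 0.2]])
# gold antecedent indicator; column 0 means "no antecedent"
antecedent_labels = torch.tensor([[0., 1., 0., 1.],
                                  [1., 0., 0., 0.]])

gold_scores = antecedent_scores + torch.log(antecedent_labels)   # -inf on non-gold entries
marginalized_gold = gold_scores.logsumexp(dim=1)                 # [num_mentions]
log_norm = antecedent_scores.logsumexp(dim=1)                    # [num_mentions]
loss = torch.sum(log_norm - marginalized_gold)
print(loss)
```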

+ 101
- 0
reproduction/coreference_resolution/model/util.py View File

@@ -0,0 +1,101 @@
import os
import errno
import collections
import torch
import numpy as np
import pyhocon



# flatten the list
def flatten(l):
return [item for sublist in l for item in sublist]


def get_config(filename):
return pyhocon.ConfigFactory.parse_file(filename)


# safely create directories
def mkdirs(path):
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise
return path


def load_char_dict(char_vocab_path):
vocab = ["<unk>"]
with open(char_vocab_path) as f:
vocab.extend(c.strip() for c in f.readlines())
char_dict = collections.defaultdict(int)
char_dict.update({c: i for i, c in enumerate(vocab)})
return char_dict

# load embeddings from file
def load_embedding_dict(embedding_path, embedding_size, embedding_format):
print("Loading word embeddings from {}...".format(embedding_path))
default_embedding = np.zeros(embedding_size)
embedding_dict = collections.defaultdict(lambda: default_embedding)
skip_first = embedding_format == "vec"
with open(embedding_path) as f:
for i, line in enumerate(f.readlines()):
if skip_first and i == 0:
continue
splits = line.split()
assert len(splits) == embedding_size + 1
word = splits[0]
embedding = np.array([float(s) for s in splits[1:]])
embedding_dict[word] = embedding
print("Done loading word embeddings.")
return embedding_dict


# safe divide
def maybe_divide(x, y):
return 0 if y == 0 else x / float(y)


def shape(x, dim):
return x.get_shape()[dim].value or torch.shape(x)[dim]


def normalize(v):
norm = np.linalg.norm(v)
if norm > 0:
return v / norm
else:
return v


class RetrievalEvaluator(object):
def __init__(self):
self._num_correct = 0
self._num_gold = 0
self._num_predicted = 0

def update(self, gold_set, predicted_set):
self._num_correct += len(gold_set & predicted_set)
self._num_gold += len(gold_set)
self._num_predicted += len(predicted_set)

def recall(self):
return maybe_divide(self._num_correct, self._num_gold)

def precision(self):
return maybe_divide(self._num_correct, self._num_predicted)

def metrics(self):
recall = self.recall()
precision = self.precision()
f1 = maybe_divide(2 * recall * precision, precision + recall)
return recall, precision, f1



if __name__=="__main__":
print(load_char_dict("../data/char_vocab.english.txt"))
embedding_dict = load_embedding_dict("../data/glove.840B.300d.txt.filtered",300,"txt")
print("hello")

+ 49
- 0
reproduction/coreference_resolution/readme.md View File

@@ -0,0 +1,49 @@
# Coreference Resolution (reproduction)
## Introduction
Coreference resolution is the task of finding all expressions in a text that refer to the same real-world entity.
It is an important step for many higher-level NLP tasks that involve natural language understanding,
such as document summarization, question answering, and information extraction.
The implementation is mainly based on [End-to-End Coreference Resolution (Lee et al., 2017)](https://arxiv.org/pdf/1707.07045).


## Data acquisition and preprocessing
The paper achieved the state-of-the-art result of its time on the [OntoNotes 5.0](https://allennlp.org/models) dataset.
Due to licensing restrictions we cannot provide a download of the dataset; please obtain it yourself.
The raw data is in CoNLL format; see the dataset's official documentation for details.

The code uses the preprocessing released by the paper's author (Lee); see [this script](https://github.com/kentonl/e2e-coref/blob/e2e/setup_training.sh) for the details.
The preprocessed data is in json format, for example:
```
{
"clusters": [],
"doc_key": "nw",
"sentences": [["This", "is", "the", "first", "sentence", "."], ["This", "is", "the", "second", "."]],
"speakers": [["spk1", "spk1", "spk1", "spk1", "spk1", "spk1"], ["spk2", "spk2", "spk2", "spk2", "spk2"]]
}
```

### Embedding downloads
[turian embedding](https://lil.cs.washington.edu/coref/turian.50d.txt)

[glove embedding](https://nlp.stanford.edu/data/glove.840B.300d.zip)



## Running
```
# training
CUDA_VISIBLE_DEVICES=0 python train.py
# evaluation
CUDA_VISIBLE_DEVICES=0 python valid.py
```

## Results
The original authors report 67.2% F1 on the test set; the AllenNLP reimplementation reports [63.0%](https://allennlp.org/models).
The AllenNLP version was trained without speaker information, without variational dropout, and with only 100 antecedents instead of 250.

With the same hyper-parameters and configuration as AllenNLP, this reimplementation reaches an F1 of 63.6%.


## Questions
If you have any questions or feedback, please open an issue or contact me by e-mail:
yexu_i@qq.com

+ 0
- 0
reproduction/coreference_resolution/test/__init__.py View File


+ 14
- 0
reproduction/coreference_resolution/test/test_dataloader.py View File

@@ -0,0 +1,14 @@
import unittest
from ..data_load.cr_loader import CRLoader

class Test_CRLoader(unittest.TestCase):
def test_cr_loader(self):
train_path = 'data/train.english.jsonlines.mini'
dev_path = 'data/dev.english.jsonlines.minid'
test_path = 'data/test.english.jsonlines'
cr = CRLoader()
data_info = cr.process({'train':train_path,'dev':dev_path,'test':test_path})

print(data_info.datasets['train'][0])
print(data_info.datasets['dev'][0])
print(data_info.datasets['test'][0])

+ 69
- 0
reproduction/coreference_resolution/train.py View File

@@ -0,0 +1,69 @@
import sys
sys.path.append('../..')

import torch
from torch.optim import Adam

from fastNLP.core.callback import Callback, GradientClipCallback
from fastNLP.core.trainer import Trainer

from reproduction.coreference_resolution.data_load.cr_loader import CRLoader
from reproduction.coreference_resolution.model.config import Config
from reproduction.coreference_resolution.model.model_re import Model
from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss
from reproduction.coreference_resolution.model.metric import CRMetric
from fastNLP import SequentialSampler
from fastNLP import cache_results


# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True

class LRCallback(Callback):
def __init__(self, parameters, decay_rate=1e-3):
super().__init__()
self.paras = parameters
self.decay_rate = decay_rate

def on_step_end(self):
if self.step % 100 == 0:
for para in self.paras:
para['lr'] = para['lr'] * (1 - self.decay_rate)


if __name__ == "__main__":
config = Config()

print(config)

@cache_results('cache.pkl')
def cache():
cr_train_dev_test = CRLoader()

data_info = cr_train_dev_test.process({'train': config.train_path, 'dev': config.dev_path,
'test': config.test_path})
return data_info
data_info = cache()
print("数据集划分:\ntrain:", str(len(data_info.datasets["train"])),
"\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
# print(data_info)
model = Model(data_info.vocabs, config)
print(model)

loss = SoftmaxLoss()

metric = CRMetric()

optim = Adam(model.parameters(), lr=config.lr)

lr_decay_callback = LRCallback(optim.param_groups, config.lr_decay)

trainer = Trainer(model=model, train_data=data_info.datasets["train"], dev_data=data_info.datasets["dev"],
loss=loss, metrics=metric, check_code_level=-1,sampler=None,
batch_size=1, device=torch.device("cuda:" + config.cuda), metric_key='f', n_epochs=config.epoch,
optimizer=optim,
save_path='/remote-home/xxliu/pycharm/fastNLP/fastNLP/reproduction/coreference_resolution/save',
callbacks=[lr_decay_callback, GradientClipCallback(clip_value=5)])
print()

trainer.train()

+ 24
- 0
reproduction/coreference_resolution/valid.py View File

@@ -0,0 +1,24 @@
import torch
from reproduction.coreference_resolution.model.config import Config
from reproduction.coreference_resolution.model.metric import CRMetric
from reproduction.coreference_resolution.data_load.cr_loader import CRLoader
from fastNLP import Tester
import argparse


if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--path')
args = parser.parse_args()
cr_loader = CRLoader()
config = Config()
data_info = cr_loader.process({'train': config.train_path, 'dev': config.dev_path,
'test': config.test_path})
metric = CRMetric()
model = torch.load(args.path)
tester = Tester(data_info.datasets['test'],model,metric,batch_size=1,device="cuda:0")
tester.test()
print('test over')



+ 105
- 0
reproduction/matching/matching_cntn.py View File

@@ -0,0 +1,105 @@
import argparse
import torch
import os

from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
from fastNLP.modules.encoder.embedding import StaticEmbedding

from reproduction.matching.data.MatchingDataLoader import QNLILoader, RTELoader, SNLILoader, MNLILoader
from reproduction.matching.model.cntn import CNTNModel

# define hyper-parameters
argument = argparse.ArgumentParser()
argument.add_argument('--embedding', choices=['glove', 'word2vec'], default='glove')
argument.add_argument('--batch-size-per-gpu', type=int, default=256)
argument.add_argument('--n-epochs', type=int, default=200)
argument.add_argument('--lr', type=float, default=1e-5)
argument.add_argument('--seq-len-type', choices=['mask', 'seq_len'], default='mask')
argument.add_argument('--save-dir', type=str, default=None)
argument.add_argument('--cntn-depth', type=int, default=1)
argument.add_argument('--cntn-ns', type=int, default=200)
argument.add_argument('--cntn-k-top', type=int, default=10)
argument.add_argument('--cntn-r', type=int, default=5)
argument.add_argument('--dataset', choices=['qnli', 'rte', 'snli', 'mnli'], default='qnli')
argument.add_argument('--max-len', type=int, default=50)
arg = argument.parse_args()

# dataset dict
dev_dict = {
'qnli': 'dev',
'rte': 'dev',
'snli': 'dev',
'mnli': 'dev_matched',
}

test_dict = {
'qnli': 'dev',
'rte': 'dev',
'snli': 'test',
'mnli': 'dev_matched',
}

# set num_labels
if arg.dataset == 'qnli' or arg.dataset == 'rte':
num_labels = 2
else:
num_labels = 3

# load data set
if arg.dataset == 'qnli':
data_info = QNLILoader().process(
paths='path/to/qnli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
get_index=True, concat=False, auto_pad_length=arg.max_len)
elif arg.dataset == 'rte':
data_info = RTELoader().process(
paths='path/to/rte/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
get_index=True, concat=False, auto_pad_length=arg.max_len)
elif arg.dataset == 'snli':
data_info = SNLILoader().process(
paths='path/to/snli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
get_index=True, concat=False, auto_pad_length=arg.max_len)
elif arg.dataset == 'mnli':
data_info = MNLILoader().process(
paths='path/to/mnli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
get_index=True, concat=False, auto_pad_length=arg.max_len)
else:
raise ValueError(f'now we only support [qnli,rte,snli,mnli] dataset for cntn model!')

# load embedding
if arg.embedding == 'word2vec':
embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], model_dir_or_name='en-word2vec-300', requires_grad=True)
elif arg.embedding == 'glove':
embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], model_dir_or_name='en-glove-840b-300',
requires_grad=True)
else:
raise ValueError(f'now we only support word2vec or glove embedding for cntn model!')

# define model
model = CNTNModel(embedding, ns=arg.cntn_ns, k_top=arg.cntn_k_top, num_labels=num_labels, depth=arg.cntn_depth,
r=arg.cntn_r)
print(model)

# define trainer
trainer = Trainer(train_data=data_info.datasets['train'], model=model,
optimizer=Adam(lr=arg.lr, model_params=model.parameters()),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
n_epochs=arg.n_epochs, print_every=-1,
dev_data=data_info.datasets[dev_dict[arg.dataset]],
metrics=AccuracyMetric(), metric_key='acc',
device=[i for i in range(torch.cuda.device_count())],
check_code_level=-1)

# train model
trainer.train(load_best_model=True)

# define tester
tester = Tester(
data=data_info.datasets[test_dict[arg.dataset]],
model=model,
metrics=AccuracyMetric(),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
device=[i for i in range(torch.cuda.device_count())]
)

# test model
tester.test()

+ 120
- 0
reproduction/matching/model/cntn.py View File

@@ -0,0 +1,120 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from torch.nn import CrossEntropyLoss

from fastNLP.models import BaseModel
from fastNLP.modules.encoder.embedding import TokenEmbedding
from fastNLP.core.const import Const


class DynamicKMaxPooling(nn.Module):
"""
:param k_top: Fixed number of pooling output features for the topmost convolutional layer.
:param l: Number of convolutional layers.
"""

def __init__(self, k_top, l):
super(DynamicKMaxPooling, self).__init__()
self.k_top = k_top
self.L = l

def forward(self, x, l):
"""
:param x: Input sequence.
:param l: Current convolutional layers.
"""
s = x.size()[3]
k_ll = ((self.L - l) / self.L) * s
k_l = int(round(max(self.k_top, np.ceil(k_ll))))
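# e.g. with L=2 layers, k_top=10 and s=40 input columns: layer 1 keeps k_1 = max(10, ceil(20)) = 20 columns, layer 2 keeps k_top = 10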
out = F.adaptive_max_pool2d(x, (x.size()[2], k_l))
return out


class CNTNModel(BaseModel):
"""
使用CNN进行问答匹配的模型
'Qiu, Xipeng, and Xuanjing Huang.
Convolutional neural tensor network architecture for community-based question answering.
Twenty-Fourth International Joint Conference on Artificial Intelligence. 2015.'

:param init_embedding: Embedding.
:param ns: Sentence embedding size.
:param k_top: Fixed number of pooling output features for the topmost convolutional layer.
:param num_labels: Number of labels.
:param depth: Number of convolutional layers.
:param r: Number of weight tensor slices.
:param dropout_rate: Dropout rate.
"""

def __init__(self, init_embedding: TokenEmbedding, ns=200, k_top=10, num_labels=2, depth=2, r=5,
dropout_rate=0.3):
super(CNTNModel, self).__init__()
self.embedding = init_embedding
self.depth = depth
self.kmaxpooling = DynamicKMaxPooling(k_top, depth)
self.conv_q = nn.ModuleList()
self.conv_a = nn.ModuleList()
width = self.embedding.embed_size
for i in range(depth):
self.conv_q.append(nn.Sequential(
nn.Dropout(p=dropout_rate),
nn.Conv2d(
in_channels=1,
out_channels=width // 2,
kernel_size=(width, 3),
padding=(0, 2))
))
self.conv_a.append(nn.Sequential(
nn.Dropout(p=dropout_rate),
nn.Conv2d(
in_channels=1,
out_channels=width // 2,
kernel_size=(width, 3),
padding=(0, 2))
))
width = width // 2

self.fc_q = nn.Sequential(nn.Dropout(p=dropout_rate), nn.Linear(width * k_top, ns))
self.fc_a = nn.Sequential(nn.Dropout(p=dropout_rate), nn.Linear(width * k_top, ns))
self.weight_M = nn.Bilinear(ns, ns, r)
self.weight_V = nn.Linear(2 * ns, r)
self.weight_u = nn.Sequential(nn.Dropout(p=dropout_rate), nn.Linear(r, num_labels))

def forward(self, words1, words2, seq_len1, seq_len2, target=None):
"""
:param words1: [batch, seq_len, emb_size] Question.
:param words2: [batch, seq_len, emb_size] Answer.
:param seq_len1: [batch]
:param seq_len2: [batch]
:param target: [batch] Gold labels.
:return:
"""
in_q = self.embedding(words1)
in_a = self.embedding(words2)
in_q = in_q.permute(0, 2, 1).unsqueeze(1)
in_a = in_a.permute(0, 2, 1).unsqueeze(1)

for i in range(self.depth):
in_q = F.relu(self.conv_q[i](in_q))
in_q = in_q.squeeze().unsqueeze(1)
in_q = self.kmaxpooling(in_q, i + 1)
in_a = F.relu(self.conv_a[i](in_a))
in_a = in_a.squeeze().unsqueeze(1)
in_a = self.kmaxpooling(in_a, i + 1)

in_q = self.fc_q(in_q.view(in_q.size(0), -1))
in_a = self.fc_a(in_a.view(in_a.size(0), -1))
score = torch.tanh(self.weight_u(self.weight_M(in_q, in_a) + self.weight_V(torch.cat((in_q, in_a), -1))))

if target is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(score, target)
return {Const.LOSS: loss, Const.OUTPUT: score}
else:
return {Const.OUTPUT: score}

def predict(self, **kwargs):
return self.forward(**kwargs)

+ 0
- 93
reproduction/seqence_labelling/ner/data/Conll2003Loader.py View File

@@ -1,93 +0,0 @@

from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2


class Conll2003DataLoader(DataSetLoader):
def __init__(self, task:str='ner', encoding_type:str='bioes'):
"""
加载Conll2003格式的英语语料,该数据集的信息可以在https://www.clips.uantwerpen.be/conll2003/ner/找到。当task为pos
时,返回的DataSet中target取值于第2列; 当task为chunk时,返回的DataSet中target取值于第3列;当task为ner时,返回
的DataSet中target取值于第4列。所有"-DOCSTART- -X- O O"将被忽略,这会导致数据的数量少于很多文献报道的值,但
鉴于"-DOCSTART- -X- O O"只是用于文档分割的符号,并不应该作为预测对象,所以我们忽略了数据中的-DOCTSTART-开头的行
ner与chunk任务读取后的数据的target将为encoding_type类型。pos任务读取后就是pos列的数据。

:param task: 指定需要标注任务。可选ner, pos, chunk
"""
assert task in ('ner', 'pos', 'chunk')
index = {'ner':3, 'pos':1, 'chunk':2}[task]
self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index])
self._tag_converters = None
if task in ('ner', 'chunk'):
self._tag_converters = [iob2]
if encoding_type == 'bioes':
self._tag_converters.append(iob2bioes)

def load(self, path: str):
dataset = self._loader.load(path)
def convert_tag_schema(tags):
for converter in self._tag_converters:
tags = converter(tags)
return tags
if self._tag_converters:
dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET)
return dataset

def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=True):
"""
读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略

:param paths:
:param word_vocab_opt: vocabulary的初始化值
:param lower: 是否将所有字母转为小写
:return:
"""
# 读取数据
paths = check_dataloader_paths(paths)
data = DataInfo()
input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
target_fields = [Const.TARGET, Const.INPUT_LEN]
for name, path in paths.items():
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.words.lower()
data.datasets[name] = dataset

# 对construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words
cap_word_vocab = Vocabulary()
cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words',
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
input_fields.append('cap_words')
data.vocabs['cap_words'] = cap_word_vocab

# 对target建vocab
target_vocab = Vocabulary(unknown=None, padding=None)
target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
data.vocabs[Const.TARGET] = target_vocab

for name, dataset in data.datasets.items():
dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
dataset.set_input(*input_fields)
dataset.set_target(*target_fields)

return data

if __name__ == '__main__':
pass

+ 0
- 152
reproduction/seqence_labelling/ner/data/OntoNoteLoader.py View File

@@ -1,152 +0,0 @@
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import DataSet
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2

class OntoNoteNERDataLoader(DataSetLoader):
"""
用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。

"""
def __init__(self, encoding_type:str='bioes'):
assert encoding_type in ('bioes', 'bio')
self.encoding_type = encoding_type
if encoding_type=='bioes':
self.encoding_method = iob2bioes
else:
self.encoding_method = iob2

def load(self, path:str)->DataSet:
"""
给定一个文件路径,读取数据。返回的DataSet包含以下的field
raw_words: List[str]
target: List[str]

:param path:
:return:
"""
dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path)
def convert_to_bio(tags):
bio_tags = []
flag = None
for tag in tags:
label = tag.strip("()*")
if '(' in tag:
bio_label = 'B-' + label
flag = label
elif flag:
bio_label = 'I-' + flag
else:
bio_label = 'O'
if ')' in tag:
flag = None
bio_tags.append(bio_label)
return self.encoding_method(bio_tags)

def convert_word(words):
converted_words = []
for word in words:
word = word.replace('/.', '.') # 有些结尾的.是/.形式的
if not word.startswith('-'):
converted_words.append(word)
continue
# 以下是由于这些符号被转义了,再转回来
tfrs = {'-LRB-':'(',
'-RRB-': ')',
'-LSB-': '[',
'-RSB-': ']',
'-LCB-': '{',
'-RCB-': '}'
}
if word in tfrs:
converted_words.append(tfrs[word])
else:
converted_words.append(word)
return converted_words

dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words')
dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target')

return dataset

def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None,
lower:bool=True)->DataInfo:
"""
读取并处理数据。返回的DataInfo包含以下的内容
vocabs:
word: Vocabulary
target: Vocabulary
datasets:
train: DataSet
words: List[int], 被设置为input
target: int. label,被同时设置为input和target
seq_len: int. 句子的长度,被同时设置为input和target
raw_words: List[str]
xxx(根据传入的paths可能有所变化)

:param paths:
:param word_vocab_opt: vocabulary的初始化值
:param lower: 是否使用小写
:return:
"""
paths = check_dataloader_paths(paths)
data = DataInfo()
input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
target_fields = [Const.TARGET, Const.INPUT_LEN]
for name, path in paths.items():
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.words.lower()
data.datasets[name] = dataset

# 对construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words
cap_word_vocab = Vocabulary()
cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words')
cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
input_fields.append('cap_words')
data.vocabs['cap_words'] = cap_word_vocab

# 对target建vocab
target_vocab = Vocabulary(unknown=None, padding=None)
target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
data.vocabs[Const.TARGET] = target_vocab

for name, dataset in data.datasets.items():
dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
dataset.set_input(*input_fields)
dataset.set_target(*target_fields)

return data


if __name__ == '__main__':
loader = OntoNoteNERDataLoader()
dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt')
print(dataset.target.value_count())
print(dataset[:4])


"""
train 115812 2200752
development 15680 304684
test 12217 230111

train 92403 1901772
valid 13606 279180
test 10258 204135
"""

+ 0
- 49
reproduction/seqence_labelling/ner/data/utils.py View File

@@ -1,49 +0,0 @@
from typing import List

def iob2(tags:List[str])->List[str]:
"""
检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。

:param tags: 需要转换的tags
"""
for i, tag in enumerate(tags):
if tag == "O":
continue
split = tag.split("-")
if len(split) != 2 or split[0] not in ["I", "B"]:
raise TypeError("The encoding schema is not a valid IOB type.")
if split[0] == "B":
continue
elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2
tags[i] = "B" + tag[1:]
elif tags[i - 1][1:] == tag[1:]:
continue
else: # conversion IOB1 to IOB2
tags[i] = "B" + tag[1:]
return tags

def iob2bioes(tags:List[str])->List[str]:
"""
将iob的tag转换为bmeso编码
:param tags:
:return:
"""
new_tags = []
for i, tag in enumerate(tags):
if tag == 'O':
new_tags.append(tag)
else:
split = tag.split('-')[0]
if split == 'B':
if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('B-', 'S-'))
elif split == 'I':
if i + 1<len(tags) and tags[i+1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('I-', 'E-'))
else:
raise TypeError("Invalid IOB format.")
return new_tags

+ 142
- 0
reproduction/seqence_labelling/ner/model/dilated_cnn.py View File

@@ -0,0 +1,142 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastNLP.modules.decoder import ConditionalRandomField
from fastNLP.modules.encoder import Embedding
from fastNLP.core.utils import seq_len_to_mask
from fastNLP.core.const import Const as C


class IDCNN(nn.Module):
def __init__(self,
init_embed,
char_embed,
num_cls,
repeats, num_layers, num_filters, kernel_size,
use_crf=False, use_projection=False, block_loss=False,
input_dropout=0.3, hidden_dropout=0.2, inner_dropout=0.0):
super(IDCNN, self).__init__()
self.word_embeddings = Embedding(init_embed)

if char_embed is None:
self.char_embeddings = None
embedding_size = self.word_embeddings.embedding_dim
else:
self.char_embeddings = Embedding(char_embed)
embedding_size = self.word_embeddings.embedding_dim + \
self.char_embeddings.embedding_dim

self.conv0 = nn.Sequential(
nn.Conv1d(in_channels=embedding_size,
out_channels=num_filters,
kernel_size=kernel_size,
stride=1, dilation=1,
padding=kernel_size//2,
bias=True),
nn.ReLU(),
)

block = []
for layer_i in range(num_layers):
dilated = 2 ** layer_i if layer_i+1 < num_layers else 1
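# dilation doubles with depth (1, 2, 4, ...) and the last layer in the block resets to 1, e.g. num_layers=3 gives dilations 1, 2, 1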
block.append(nn.Conv1d(
in_channels=num_filters,
out_channels=num_filters,
kernel_size=kernel_size,
stride=1, dilation=dilated,
padding=(kernel_size//2) * dilated,
bias=True))
block.append(nn.ReLU())
self.block = nn.Sequential(*block)

if use_projection:
self.projection = nn.Sequential(
nn.Conv1d(
in_channels=num_filters,
out_channels=num_filters//2,
kernel_size=1,
bias=True),
nn.ReLU(),)
encode_dim = num_filters // 2
else:
self.projection = None
encode_dim = num_filters

self.input_drop = nn.Dropout(input_dropout)
self.hidden_drop = nn.Dropout(hidden_dropout)
self.inner_drop = nn.Dropout(inner_dropout)
self.repeats = repeats
self.out_fc = nn.Conv1d(
in_channels=encode_dim,
out_channels=num_cls,
kernel_size=1,
bias=True)
self.crf = ConditionalRandomField(
num_tags=num_cls) if use_crf else None
self.block_loss = block_loss
self.reset_parameters()

def reset_parameters(self):
for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Linear)):
nn.init.xavier_normal_(m.weight, gain=1)
if m.bias is not None:
nn.init.normal_(m.bias, mean=0, std=0.01)

def forward(self, words, seq_len, target=None, chars=None):
if self.char_embeddings is None:
x = self.word_embeddings(words)
else:
if chars is None:
raise ValueError('must provide chars for model with char embedding')
e1 = self.word_embeddings(words)
e2 = self.char_embeddings(chars)
x = torch.cat((e1, e2), dim=-1) # b,l,h
mask = seq_len_to_mask(seq_len)

x = x.transpose(1, 2) # b,h,l
last_output = self.conv0(x)
output = []
for repeat in range(self.repeats):
last_output = self.block(last_output)
hidden = self.projection(last_output) if self.projection is not None else last_output
output.append(self.out_fc(hidden))

def compute_loss(y, t, mask):
if self.crf is not None and target is not None:
loss = self.crf(y.transpose(1, 2), t, mask)
else:
t.masked_fill_(mask == 0, -100)
loss = F.cross_entropy(y, t, ignore_index=-100)
return loss

if target is not None:
if self.block_loss:
losses = [compute_loss(o, target, mask) for o in output]
loss = sum(losses)
else:
loss = compute_loss(output[-1], target, mask)
else:
loss = None

scores = output[-1]
if self.crf is not None:
pred, _ = self.crf.viterbi_decode(scores.transpose(1, 2), mask)
else:
pred = scores.max(1)[1] * mask.long()

return {
C.LOSS: loss,
C.OUTPUT: pred,
}

def predict(self, words, seq_len, chars=None):
res = self.forward(
words=words,
seq_len=seq_len,
chars=chars,
target=None
)[C.OUTPUT]
return {
C.OUTPUT: res
}

+ 99
- 0
reproduction/seqence_labelling/ner/train_idcnn.py View File

@@ -0,0 +1,99 @@
from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
from fastNLP.core.callback import FitlogCallback, LRScheduler
from fastNLP import GradientClipCallback
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
from torch.optim import SGD, Adam
from fastNLP import Const
from fastNLP import RandomSampler, BucketSampler
from fastNLP import SpanFPreRecMetric
from fastNLP import Trainer
from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
from fastNLP.core.utils import Option
from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
from fastNLP.core.utils import cache_results
import sys
import torch.cuda
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

encoding_type = 'bioes'


def get_path(path):
return os.path.join(os.environ['HOME'], path)

data_path = get_path('workdir/datasets/ontonotes-v4')

ops = Option(
batch_size=128,
num_epochs=100,
lr=3e-4,
repeats=3,
num_layers=3,
num_filters=400,
use_crf=True,
gradient_clip=5,
)

@cache_results('ontonotes-cache')
def load_data():

data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(data_path,
lower=True)

# char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
# kernel_sizes=[3])

word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
model_dir_or_name='en-glove-840b-300',
requires_grad=True)
return data, [word_embed]

data, embeds = load_data()
print(data.datasets['train'][0])
print(list(data.vocabs.keys()))

for ds in data.datasets.values():
ds.rename_field('cap_words', 'chars')
ds.set_input('chars')

word_embed = embeds[0]
char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
# for ds in data.datasets:
# ds.rename_field('')

print(data.vocabs[Const.TARGET].word2idx)

model = IDCNN(init_embed=word_embed,
char_embed=char_embed,
num_cls=len(data.vocabs[Const.TARGET]),
repeats=ops.repeats,
num_layers=ops.num_layers,
num_filters=ops.num_filters,
kernel_size=3,
use_crf=ops.use_crf, use_projection=True,
block_loss=True,
input_dropout=0.33, hidden_dropout=0.2, inner_dropout=0.2)

print(model)

callbacks = [GradientClipCallback(clip_value=ops.gradient_clip, clip_type='norm'),]

optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=0)
# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
# callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 15)))
# optimizer = SWATS(model.parameters(), verbose=True)
# optimizer = Adam(model.parameters(), lr=0.005)

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer,
sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size),
device=device, dev_data=data.datasets['dev'], batch_size=ops.batch_size,
metrics=SpanFPreRecMetric(
tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
check_code_level=-1,
callbacks=callbacks, num_workers=2, n_epochs=ops.num_epochs)
trainer.train()

+ 26
- 0
reproduction/text_classification/README.md View File

@@ -0,0 +1,26 @@
# Text classification model reproduction
The following models are reproduced with fastNLP:

char_cnn: paper [Character-level Convolutional Networks for Text Classification](https://arxiv.org/pdf/1509.01626v3.pdf)

dpcnn: paper [Deep Pyramid Convolutional Neural Networks for Text Categorization](https://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf)

HAN: paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf)

LSTM+self_attention: paper [A Structured Self-attentive Sentence Embedding](https://arxiv.org/pdf/1703.03130.pdf)

AWD-LSTM: paper [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/pdf/1708.02182.pdf)

# Datasets and results summary

Results reproduced with fastNLP vs. results reported in the papers (the number before the slash is the fastNLP implementation, the one after is the paper; "-" means the paper does not report a result on that dataset)

model name | yelp_p | yelp_f | sst-2 | IMDB
:---: | :---: | :---: | :---: | :---:
char_cnn | 93.80/95.12 | - | - |-
dpcnn | 95.50/97.36 | - | - |-
HAN |- | - | - |-
LSTM| 95.74/- |- |- |88.52/-
AWD-LSTM| 95.96/- |- |- |88.91/-
LSTM+self_attention| 96.34/- | - | - |89.53/-


+ 110
- 0
reproduction/text_classification/data/IMDBLoader.py View File

@@ -0,0 +1,110 @@
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict, List, Iterator
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial


class IMDBLoader(DataSetLoader):
"""
读取IMDB数据集,DataSet包含以下fields:

words: list(str), 需要分类的文本
target: str, 文本的标签


"""

def __init__(self):
super(IMDBLoader, self).__init__()

def _load(self, path):
dataset = DataSet()
with open(path, 'r', encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
parts = line.split('\t')
target = parts[0]
words = parts[1].lower().split()
dataset.append(Instance(words=words, target=target))

if len(dataset)==0:
raise RuntimeError(f"{path} has no valid data.")

return dataset
def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
return chars

if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')

src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')

info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets

if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")

return info



if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
"test": "/remote-home/ygwang/IMDB_data/test.csv"}
datainfo=IMDBLoader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)


+ 5
- 1
reproduction/text_classification/data/MTL16Loader.py View File

@@ -32,7 +32,7 @@ class MTL16Loader(DataSetLoader):
continue
parts = line.split('\t')
target = parts[0]
words = parts[1].split()
words = parts[1].lower().split()
dataset.append(Instance(words=words, target=target))
if len(dataset)==0:
raise RuntimeError(f"{path} has no valid data.")
@@ -72,4 +72,8 @@ class MTL16Loader(DataSetLoader):
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")

return info

+ 187
- 0
reproduction/text_classification/data/SSTLoader.py View File

@@ -0,0 +1,187 @@
from typing import Iterable
from nltk import Tree
from fastNLP.io.base_loader import DataInfo, DataSetLoader
from fastNLP.core.vocabulary import VocabularyOption, Vocabulary
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
import csv
from typing import Union, Dict

class SSTLoader(DataSetLoader):
URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
DATA_DIR = 'sst/'

"""
别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.dataset_loader.SSTLoader`

读取SST数据集, DataSet包含fields::

words: list(str) 需要分类的文本
target: str 文本的标签

数据来源: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip

:param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False``
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""

def __init__(self, subtree=False, fine_grained=False):
self.subtree = subtree

tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral',
'3': 'positive', '4': 'very positive'}
if not fine_grained:
tag_v['0'] = tag_v['1']
tag_v['4'] = tag_v['3']
self.tag_v = tag_v

def _load(self, path):
"""

:param str path: 存储数据的路径
:return: 一个 :class:`~fastNLP.DataSet` 类型的对象
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
datas = []
for l in f:
datas.extend([(s, self.tag_v[t])
for s, t in self._get_one(l, self.subtree)])
ds = DataSet()
for words, tag in datas:
ds.append(Instance(words=words, target=tag))
return ds

@staticmethod
def _get_one(data, subtree):
tree = Tree.fromstring(data)
if subtree:
return [(t.leaves(), t.label()) for t in tree.subtrees()]
return [(tree.leaves(), tree.label())]

def process(self,
paths,
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,
src_embed_op: EmbeddingOption = None):
input_name, target_name = 'words', 'target'
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

info = DataInfo(datasets=self.load(paths))
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()
src_vocab.from_dataset(*_train_ds, field_name=input_name)
tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
src_vocab.index_dataset(
*info.datasets.values(),
field_name=input_name, new_field_name=input_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)
info.vocabs = {
input_name: src_vocab,
target_name: tgt_vocab
}

if src_embed_op is not None:
src_embed_op.vocab = src_vocab
init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
info.embeddings[input_name] = init_emb

for name, dataset in info.datasets.items():
dataset.set_input(input_name)
dataset.set_target(target_name)

return info

class sst2Loader(DataSetLoader):
'''
数据来源"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
'''
def __init__(self):
super(sst2Loader, self).__init__()

def _load(self, path: str) -> DataSet:
ds = DataSet()
all_count=0
csv_reader = csv.reader(open(path, encoding='utf-8'),delimiter='\t')
skip_row = 0
for idx,row in enumerate(csv_reader):
if idx<=skip_row:
continue
target = row[1]
words = row[0].split()
ds.append(Instance(words=words,target=target))
all_count+=1
print("all count:", all_count)
return ds

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):

paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs={}

# 就分隔为char形式
if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')


info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets


if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

return info

if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
"dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}
datainfo=sst2Loader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)

+ 187
- 0
reproduction/text_classification/data/sstLoader.py View File

@@ -0,0 +1,187 @@
from typing import Iterable
from nltk import Tree
from fastNLP.io.base_loader import DataInfo, DataSetLoader
from fastNLP.core.vocabulary import VocabularyOption, Vocabulary
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
import csv
from typing import Union, Dict

class SSTLoader(DataSetLoader):
URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
DATA_DIR = 'sst/'

"""
别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.dataset_loader.SSTLoader`

读取SST数据集, DataSet包含fields::

words: list(str) 需要分类的文本
target: str 文本的标签

数据来源: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip

:param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False``
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""

def __init__(self, subtree=False, fine_grained=False):
self.subtree = subtree

tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral',
'3': 'positive', '4': 'very positive'}
if not fine_grained:
tag_v['0'] = tag_v['1']
tag_v['4'] = tag_v['3']
self.tag_v = tag_v

def _load(self, path):
"""

:param str path: 存储数据的路径
:return: 一个 :class:`~fastNLP.DataSet` 类型的对象
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
datas = []
for l in f:
datas.extend([(s, self.tag_v[t])
for s, t in self._get_one(l, self.subtree)])
ds = DataSet()
for words, tag in datas:
ds.append(Instance(words=words, target=tag))
return ds

@staticmethod
def _get_one(data, subtree):
tree = Tree.fromstring(data)
if subtree:
return [(t.leaves(), t.label()) for t in tree.subtrees()]
return [(tree.leaves(), tree.label())]

def process(self,
paths,
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,
src_embed_op: EmbeddingOption = None):
input_name, target_name = 'words', 'target'
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

info = DataInfo(datasets=self.load(paths))
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()
src_vocab.from_dataset(*_train_ds, field_name=input_name)
tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
src_vocab.index_dataset(
*info.datasets.values(),
field_name=input_name, new_field_name=input_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)
info.vocabs = {
input_name: src_vocab,
target_name: tgt_vocab
}

if src_embed_op is not None:
src_embed_op.vocab = src_vocab
init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
info.embeddings[input_name] = init_emb

for name, dataset in info.datasets.items():
dataset.set_input(input_name)
dataset.set_target(target_name)

return info

class sst2Loader(DataSetLoader):
'''
数据来源"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
'''
def __init__(self):
super(sst2Loader, self).__init__()

def _load(self, path: str) -> DataSet:
ds = DataSet()
all_count=0
csv_reader = csv.reader(open(path, encoding='utf-8'),delimiter='\t')
skip_row = 0
for idx,row in enumerate(csv_reader):
if idx<=skip_row:
continue
target = row[1]
words = row[0].split()
ds.append(Instance(words=words,target=target))
all_count+=1
print("all count:", all_count)
return ds

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):

paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs={}

# 就分隔为char形式
if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')


info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets


if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

return info

if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
"dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}
datainfo=sst2Loader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)

+ 160
- 31
reproduction/text_classification/data/yelpLoader.py View File

@@ -1,18 +1,64 @@
import ast
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io import JsonLoader
from fastNLP.io.base_loader import DataInfo
from fastNLP.io.base_loader import DataInfo,DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths
from reproduction.utils import check_dataloader_paths, get_tokenizer

def clean_str(sentence, tokenizer, char_lower=False):
"""
heavily borrowed from github
https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
:param sentence: is a str
:return:
"""
if char_lower:
sentence = sentence.lower()
import re
nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
words = tokenizer(sentence)
words_collection = []
for word in words:
if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
continue
tt = nonalpnum.split(word)
t = ''.join(tt)
if t != '':
words_collection.append(t)

return words_collection




class yelpLoader(JsonLoader):
class yelpLoader(DataSetLoader):
""" """
读取Yelp_full/Yelp_polarity数据集, DataSet包含fields:
words: list(str), 需要分类的文本
target: str, 文本的标签
chars:list(str),未index的字符列表

数据集:yelp_full/yelp_polarity
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""
def __init__(self, fine_grained=False,lower=False):
super(yelpLoader, self).__init__()
tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
'4.0': 'positive', '5.0': 'very positive'}
if not fine_grained:
tag_v['1.0'] = tag_v['2.0']
tag_v['5.0'] = tag_v['4.0']
self.fine_grained = fine_grained
self.tag_v = tag_v
self.lower = lower
self.tokenizer = get_tokenizer()

'''
读取Yelp数据集, DataSet包含fields:
review_id: str, 22 character unique review id
@@ -27,20 +73,8 @@ class yelpLoader(JsonLoader):
数据来源: https://www.yelp.com/dataset/download
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""
def __init__(self, fine_grained=False):
super(yelpLoader, self).__init__()
tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
'4.0': 'positive', '5.0': 'very positive'}
if not fine_grained:
tag_v['1.0'] = tag_v['2.0']
tag_v['5.0'] = tag_v['4.0']
self.fine_grained = fine_grained
self.tag_v = tag_v
def _load(self, path):

def _load_json(self, path):
ds = DataSet()
for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
d = ast.literal_eval(d)
@@ -48,21 +82,116 @@ class yelpLoader(JsonLoader):
d["target"] = self.tag_v[str(d.pop("stars"))]
ds.append(Instance(**d))
return ds
def _load_yelp2015_broken(self,path):
ds = DataSet()
with open (path,encoding='ISO 8859-1') as f:
row=f.readline()
all_count=0
exp_count=0
while row:
row=row.split("\t\t")
all_count+=1
if len(row)>=3:
words=row[-1].split()
try:
target=self.tag_v[str(row[-2])+".0"]
ds.append(Instance(words=words, target=target))
except KeyError:
exp_count+=1
else:
exp_count+=1
row = f.readline()
print("error sample count:",exp_count)
print("all count:",all_count)
return ds
'''

def _load(self, path):
ds = DataSet()
csv_reader=csv.reader(open(path,encoding='utf-8'))
all_count=0
real_count=0
for row in csv_reader:
all_count+=1
if len(row)==2:
target=self.tag_v[row[0]+".0"]
words = clean_str(row[1], self.tokenizer, self.lower)
if len(words)!=0:
ds.append(Instance(words=words,target=target))
real_count += 1
print("all count:", all_count)
print("real count:", real_count)
return ds



def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None,
embed_opt: EmbeddingOption = None):

def process(self, paths: Union[str, Dict[str, str]],
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,
embed_opt: EmbeddingOption = None,
char_level_op=False):
paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
vocab.from_dataset(dataset, field_name="words")
info.vocabs = vocab
info.datasets = datasets
if embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
info.embeddings['words'] = embed
info = DataInfo(datasets=self.load(paths))
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()


def wordtochar(words):

chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs={}
#就分隔为char形式
if char_level_op:
for dataset in info.datasets.values():
dataset.apply_field(wordtochar, field_name="words",new_field_name='chars')
# if embed_opt is not None:
# embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
# info.embeddings['words'] = embed
else:
src_vocab.from_dataset(*_train_ds, field_name=input_name)
src_vocab.index_dataset(*info.datasets.values(),field_name=input_name, new_field_name=input_name)
info.vocabs[input_name]=src_vocab

tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)

info.vocabs[target_name]=tgt_vocab

info.datasets['train'],info.datasets['dev']=info.datasets['train'].split(0.1, shuffle=False)

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")

return info


if __name__=="__main__":
testloader=yelpLoader()
# datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
# "test": "/remote-home/ygwang/yelp_full/test.csv"}
#datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
datainfo=testloader.process(datapath,char_level_op=True)

len_count=0
for instance in datainfo.datasets["train"]:
len_count+=len(instance["chars"])

ave_len=len_count/len(datainfo.datasets["train"])
print(ave_len)

+ 109
- 0
reproduction/text_classification/model/HAN.py View File

@@ -0,0 +1,109 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
from fastNLP.modules.utils import get_embeddings
from fastNLP.core import Const as C


def pack_sequence(tensor_seq, padding_value=0.0):
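# pad a list of variable-length (len_i, dim) tensors into one (batch, max_len, dim) tensor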
if len(tensor_seq) <= 0:
return
length = [v.size(0) for v in tensor_seq]
max_len = max(length)
size = [len(tensor_seq), max_len]
size.extend(list(tensor_seq[0].size()[1:]))
ans = torch.Tensor(*size).fill_(padding_value)
if tensor_seq[0].data.is_cuda:
ans = ans.cuda()
ans = Variable(ans)
for i, v in enumerate(tensor_seq):
ans[i, :length[i], :] = v
return ans


class HANCLS(nn.Module):
def __init__(self, init_embed, num_cls):
super(HANCLS, self).__init__()

self.embed = get_embeddings(init_embed)
self.han = HAN(input_size=300,
output_size=num_cls,
word_hidden_size=50, word_num_layers=1, word_context_size=100,
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100
)

def forward(self, input_sents):
# input_sents [B, num_sents, seq-len] dtype long
# target
B, num_sents, seq_len = input_sents.size()
input_sents = input_sents.view(-1, seq_len) # flat
words_embed = self.embed(input_sents) # should be [B*num-sent, seqlen , word-dim]
words_embed = words_embed.view(B, num_sents, seq_len, -1) # recover # [B, num-sent, seqlen , word-dim]
out = self.han(words_embed)

return {C.OUTPUT: out}

def predict(self, input_sents):
x = self.forward(input_sents)[C.OUTPUT]
return {C.OUTPUT: torch.argmax(x, 1)}


class HAN(nn.Module):
def __init__(self, input_size, output_size,
word_hidden_size, word_num_layers, word_context_size,
sent_hidden_size, sent_num_layers, sent_context_size):
super(HAN, self).__init__()

self.word_layer = AttentionNet(input_size,
word_hidden_size,
word_num_layers,
word_context_size)
self.sent_layer = AttentionNet(2 * word_hidden_size,
sent_hidden_size,
sent_num_layers,
sent_context_size)
self.output_layer = nn.Linear(2 * sent_hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)

def forward(self, batch_doc):
# input is a sequence of matrix
doc_vec_list = []
for doc in batch_doc:
sent_mat = self.word_layer(doc) # doc's dim (num_sent, seq_len, word_dim)
doc_vec_list.append(sent_mat) # sent_mat's dim (num_sent, vec_dim)
doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
output = self.softmax(self.output_layer(doc_vec))
return output


class AttentionNet(nn.Module):
def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
super(AttentionNet, self).__init__()

self.input_size = input_size
self.gru_hidden_size = gru_hidden_size
self.gru_num_layers = gru_num_layers
self.context_vec_size = context_vec_size

# Encoder
self.gru = nn.GRU(input_size=input_size,
hidden_size=gru_hidden_size,
num_layers=gru_num_layers,
batch_first=True,
bidirectional=True)
# Attention
self.fc = nn.Linear(2 * gru_hidden_size, context_vec_size)
self.tanh = nn.Tanh()
self.softmax = nn.Softmax(dim=1)
# context vector
self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1))
self.context_vec.data.uniform_(-0.1, 0.1)

def forward(self, inputs):
# GRU part
h_t, hidden = self.gru(inputs) # inputs's dim (batch_size, seq_len, word_dim)
u = self.tanh(self.fc(h_t))
# Attention part
alpha = self.softmax(torch.matmul(u, self.context_vec)) # u's dim (batch_size, seq_len, context_vec_size)
output = torch.bmm(torch.transpose(h_t, 1, 2), alpha) # alpha's dim (batch_size, seq_len, 1)
return torch.squeeze(output, dim=2) # output's dim (batch_size, 2*hidden_size, 1)

+ 31
- 0
reproduction/text_classification/model/awd_lstm.py View File

@@ -0,0 +1,31 @@
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from .awdlstm_module import LSTM
from fastNLP.modules import encoder
from fastNLP.modules.decoder.mlp import MLP


class AWDLSTMSentiment(nn.Module):
def __init__(self, init_embed,
num_classes,
hidden_dim=256,
num_layers=1,
nfc=128,
wdrop=0.5):
super(AWDLSTMSentiment,self).__init__()
self.embed = encoder.Embedding(init_embed)
self.lstm = LSTM(input_size=self.embed.embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=True, wdrop=wdrop)
self.mlp = MLP(size_layer=[hidden_dim* 2, nfc, num_classes])

def forward(self, words):
x_emb = self.embed(words)
output, _ = self.lstm(x_emb)
output = self.mlp(output[:,-1,:])
return {C.OUTPUT: output}

def predict(self, words):
output = self(words)
_, predict = output[C.OUTPUT].max(dim=1)
return {C.OUTPUT: predict}


+ 86
- 0
reproduction/text_classification/model/awdlstm_module.py View File

@@ -0,0 +1,86 @@
"""
轻量封装的 Pytorch LSTM 模块.
可在 forward 时传入序列的长度, 自动对padding做合适的处理.
"""
__all__ = [
"LSTM"
]

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn

from fastNLP.modules.utils import initial_parameter
from torch import autograd
from .weight_drop import WeightDrop


class LSTM(nn.Module):
"""
别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM`

LSTM 模块, 轻量封装的Pytorch LSTM. 在提供seq_len的情况下,将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化
为1; 且可以应对DataParallel中LSTM的使用问题。

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度.
:param num_layers: rnn的层数. Default: 1
:param dropout: 层间dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
"""
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
bidirectional=False, bias=True, wdrop=0.5):
super(LSTM, self).__init__()
self.batch_first = batch_first
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
dropout=dropout, bidirectional=bidirectional)
self.lstm = WeightDrop(self.lstm, ['weight_hh_l0'], dropout=wdrop)
self.init_param()

def init_param(self):
for name, param in self.named_parameters():
if 'bias' in name:
# based on https://github.com/pytorch/pytorch/issues/750#issuecomment-280671871
param.data.fill_(0)
n = param.size(0)
start, end = n // 4, n // 2
param.data[start:end].fill_(1)
else:
nn.init.xavier_uniform_(param)

def forward(self, x, seq_len=None, h0=None, c0=None):
"""

:param x: [batch, seq_len, input_size] 输入序列
:param seq_len: [batch, ] 序列长度, 若为 ``None``, 所有输入看做一样长. Default: ``None``
:param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列
和 [batch, hidden_size*num_direction] 最后时刻隐状态.
"""
batch_size, max_len, _ = x.size()
if h0 is not None and c0 is not None:
hx = (h0, c0)
else:
hx = None
if seq_len is not None and not isinstance(x, rnn.PackedSequence):
sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
if self.batch_first:
x = x[sort_idx]
else:
x = x[:, sort_idx]
x = rnn.pack_padded_sequence(x, sort_lens, batch_first=self.batch_first)
output, hx = self.lstm(x, hx) # -> [N,L,C]
output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first, total_length=max_len)
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
if self.batch_first:
output = output[unsort_idx]
else:
output = output[:, unsort_idx]
else:
output, hx = self.lstm(x, hx)
return output, hx

+ 90
- 1
reproduction/text_classification/model/char_cnn.py View File

@@ -1 +1,90 @@
# TODO
'''
@author: https://github.com/ahmedbesbes/character-based-cnn
这里借鉴了上述链接中char-cnn model的代码,改动主要为将其改动为符合fastnlp的pipline
'''
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C

class CharacterLevelCNN(nn.Module):
def __init__(self, args,embedding):
super(CharacterLevelCNN, self).__init__()

self.config=args.char_cnn_config
self.embedding=embedding

conv_layers = []
for i, conv_layer_parameter in enumerate(self.config['model_parameters'][args.model_size]['conv']):
if i == 0:
#in_channels = args.number_of_characters + len(args.extra_characters)
in_channels = args.embedding_dim
out_channels = conv_layer_parameter[0]
else:
in_channels, out_channels = conv_layer_parameter[0], conv_layer_parameter[0]

if conv_layer_parameter[2] != -1:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU(),
nn.MaxPool1d(conv_layer_parameter[2]))
else:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU())
conv_layers.append(conv_layer)
self.conv_layers = nn.ModuleList(conv_layers)

input_shape = (args.batch_size, args.max_length,
args.number_of_characters + len(args.extra_characters))
dimension = self._get_conv_output(input_shape)

print('dimension :', dimension)

fc_layer_parameter = self.config['model_parameters'][args.model_size]['fc'][0]
fc_layers = nn.ModuleList([
nn.Sequential(
nn.Linear(dimension, fc_layer_parameter), nn.Dropout(0.5)),
nn.Sequential(nn.Linear(fc_layer_parameter,
fc_layer_parameter), nn.Dropout(0.5)),
nn.Linear(fc_layer_parameter, args.num_classes),
])

self.fc_layers = fc_layers

if args.model_size == 'small':
self._create_weights(mean=0.0, std=0.05)
elif args.model_size == 'large':
self._create_weights(mean=0.0, std=0.02)

def _create_weights(self, mean=0.0, std=0.05):
for module in self.modules():
if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean, std)

def _get_conv_output(self, shape):
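# infer the flattened size of the conv stack's output by pushing a dummy batch through it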
input = torch.rand(shape)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)
n_size = output.size(1)
return n_size

def forward(self, chars):
input=self.embedding(chars)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)

# forward pass through fc layers
for i in range(len(self.fc_layers)):
output = self.fc_layers[i](output)

return {C.OUTPUT: output}
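
Below is a hedged smoke-test sketch (not part of the diff) showing which attributes the class reads from args; the SimpleNamespace object and the 'small' configuration values are assumptions copied from the training config later in this PR:

from types import SimpleNamespace
import torch
import torch.nn as nn

config = {"model_parameters": {"small": {
    "conv": [[256, 7, 3], [256, 7, 3], [256, 3, -1], [256, 3, -1], [256, 3, -1], [256, 3, 3]],
    "fc": [1024, 1024]}}}
args = SimpleNamespace(char_cnn_config=config, model_size="small",
                       embedding_dim=69, number_of_characters=69, extra_characters="",
                       batch_size=2, max_length=1014, num_classes=2)
embedding = nn.Embedding(70, 69, padding_idx=69)   # char embedding; last index (69) reserved for <pad>/<unk>
model = CharacterLevelCNN(args, embedding)
chars = torch.randint(0, 70, (2, 1014))            # [batch, max_length] char indices
out = model(chars)                                 # dict keyed by Const.OUTPUT, value of shape [2, 2]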

+ 97
- 1
reproduction/text_classification/model/dpcnn.py View File

@@ -1 +1,97 @@
# TODO
import torch
import torch.nn as nn
from fastNLP.modules.utils import get_embeddings
from fastNLP.core import Const as C


class DPCNN(nn.Module):
def __init__(self, init_embed, num_cls, n_filters=256,
kernel_size=3, n_layers=7, embed_dropout=0.1, cls_dropout=0.1):
super().__init__()
self.region_embed = RegionEmbedding(
init_embed, out_dim=n_filters, kernel_sizes=[1, 3, 5])
embed_dim = self.region_embed.embedding_dim
self.conv_list = nn.ModuleList()
for i in range(n_layers):
self.conv_list.append(nn.Sequential(
nn.ReLU(),
nn.Conv1d(n_filters, n_filters, kernel_size,
padding=kernel_size//2),
nn.Conv1d(n_filters, n_filters, kernel_size,
padding=kernel_size//2),
))
self.pool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
self.embed_drop = nn.Dropout(embed_dropout)
self.classfier = nn.Sequential(
nn.Dropout(cls_dropout),
nn.Linear(n_filters, num_cls),
)
self.reset_parameters()

def reset_parameters(self):
for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Linear)):
nn.init.normal_(m.weight, mean=0, std=0.01)
if m.bias is not None:
nn.init.normal_(m.bias, mean=0, std=0.01)

def forward(self, words, seq_len=None):
words = words.long()
# get region embeddings
x = self.region_embed(words)
x = self.embed_drop(x)

# not pooling on first conv
x = self.conv_list[0](x) + x
for conv in self.conv_list[1:]:
x = self.pool(x)
x = conv(x) + x

# B, C, L => B, C
x, _ = torch.max(x, dim=2)
x = self.classfier(x)
return {C.OUTPUT: x}

def predict(self, words, seq_len=None):
x = self.forward(words, seq_len)[C.OUTPUT]
return {C.OUTPUT: torch.argmax(x, 1)}


class RegionEmbedding(nn.Module):
def __init__(self, init_embed, out_dim=300, kernel_sizes=None):
super().__init__()
if kernel_sizes is None:
kernel_sizes = [5, 9]
assert isinstance(
kernel_sizes, list), 'kernel_sizes should be List(int)'
self.embed = get_embeddings(init_embed)
try:
embed_dim = self.embed.embedding_dim
except Exception:
embed_dim = self.embed.embed_size
self.region_embeds = nn.ModuleList()
for ksz in kernel_sizes:
self.region_embeds.append(nn.Sequential(
nn.Conv1d(embed_dim, embed_dim, ksz, padding=ksz // 2),
))
self.linears = nn.ModuleList([nn.Conv1d(embed_dim, out_dim, 1)
for _ in range(len(kernel_sizes))])
self.embedding_dim = embed_dim

def forward(self, x):
x = self.embed(x)
x = x.transpose(1, 2)
# B, C, L
out = 0
for conv, fc in zip(self.region_embeds, self.linears):  # pair each region conv with its 1x1 projection
conv_i = conv(x)
out = out + fc(conv_i)
# B, C, L
return out


if __name__ == '__main__':
x = torch.randint(0, 10000, size=(5, 15), dtype=torch.long)
model = DPCNN((10000, 300), 20)
y = model(x)
print(y.size(), y.mean(1), y.std(1))

+ 30
- 0
reproduction/text_classification/model/lstm.py View File

@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.modules import encoder
from fastNLP.modules.decoder.mlp import MLP


class BiLSTMSentiment(nn.Module):
def __init__(self, init_embed,
num_classes,
hidden_dim=256,
num_layers=1,
nfc=128):
super(BiLSTMSentiment,self).__init__()
self.embed = encoder.Embedding(init_embed)
self.lstm = LSTM(input_size=self.embed.embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=True)
self.mlp = MLP(size_layer=[hidden_dim* 2, nfc, num_classes])

def forward(self, words):
x_emb = self.embed(words)
output, _ = self.lstm(x_emb)
output = self.mlp(output[:,-1,:])
return {C.OUTPUT: output}

def predict(self, words):
output = self(words)
_, predict = output[C.OUTPUT].max(dim=1)
return {C.OUTPUT: predict}
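
A quick smoke-test sketch (an assumption, not part of the diff): it relies on encoder.Embedding accepting a (vocab_size, embed_dim) tuple, the same convention get_embeddings uses in dpcnn.py above.

import torch

model = BiLSTMSentiment(init_embed=(10000, 100), num_classes=2)   # hypothetical vocab size / embed dim
words = torch.randint(0, 10000, (4, 20))                          # [batch, seq_len] word indices
scores = model(words)[C.OUTPUT]                                   # [4, 2] class scores
labels = model.predict(words)[C.OUTPUT]                           # [4] predicted class ids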


+ 35
- 0
reproduction/text_classification/model/lstm_self_attention.py View File

@@ -0,0 +1,35 @@
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.modules import encoder
from fastNLP.modules.aggregator.attention import SelfAttention
from fastNLP.modules.decoder.mlp import MLP


class BiLSTM_SELF_ATTENTION(nn.Module):
def __init__(self, init_embed,
num_classes,
hidden_dim=256,
num_layers=1,
attention_unit=256,
attention_hops=1,
nfc=128):
super(BiLSTM_SELF_ATTENTION,self).__init__()
self.embed = encoder.Embedding(init_embed)
self.lstm = LSTM(input_size=self.embed.embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=True)
self.attention = SelfAttention(input_size=hidden_dim * 2 , attention_unit=attention_unit, attention_hops=attention_hops)
self.mlp = MLP(size_layer=[hidden_dim* 2*attention_hops, nfc, num_classes])

def forward(self, words):
x_emb = self.embed(words)
output, _ = self.lstm(x_emb)
after_attention, penalty = self.attention(output,words)
after_attention =after_attention.view(after_attention.size(0),-1)
output = self.mlp(after_attention)
return {C.OUTPUT: output}

def predict(self, words):
output = self(words)
_, predict = output[C.OUTPUT].max(dim=1)
return {C.OUTPUT: predict}

+ 99
- 0
reproduction/text_classification/model/weight_drop.py View File

@@ -0,0 +1,99 @@
import torch
from torch.nn import Parameter
from functools import wraps

class WeightDrop(torch.nn.Module):
def __init__(self, module, weights, dropout=0, variational=False):
super(WeightDrop, self).__init__()
self.module = module
self.weights = weights
self.dropout = dropout
self.variational = variational
self._setup()

def widget_demagnetizer_y2k_edition(*args, **kwargs):
# We need to replace flatten_parameters with a nothing function
# It must be a function rather than a lambda as otherwise pickling explodes
# We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION!
# (╯°□°)╯︵ ┻━┻
return

def _setup(self):
# Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN
if issubclass(type(self.module), torch.nn.RNNBase):
self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

for name_w in self.weights:
print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
w = getattr(self.module, name_w)
del self.module._parameters[name_w]
self.module.register_parameter(name_w + '_raw', Parameter(w.data))

def _setweights(self):
for name_w in self.weights:
raw_w = getattr(self.module, name_w + '_raw')
w = None
if self.variational:
mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1))
if raw_w.is_cuda: mask = mask.cuda()
mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True)
w = mask.expand_as(raw_w) * raw_w
else:
w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
setattr(self.module, name_w, w)

def forward(self, *args):
self._setweights()
return self.module.forward(*args)

if __name__ == '__main__':
import torch
from weight_drop import WeightDrop

# Input is (seq, batch, input)
x = torch.autograd.Variable(torch.randn(2, 1, 10)).cuda()
h0 = None

###

print('Testing WeightDrop')
print('=-=-=-=-=-=-=-=-=-=')

###

print('Testing WeightDrop with Linear')

lin = WeightDrop(torch.nn.Linear(10, 10), ['weight'], dropout=0.9)
lin.cuda()
run1 = [x.sum() for x in lin(x).data]
run2 = [x.sum() for x in lin(x).data]

print('All items should be different')
print('Run 1:', run1)
print('Run 2:', run2)

assert run1[0] != run2[0]
assert run1[1] != run2[1]

print('---')

###

print('Testing WeightDrop with LSTM')

wdrnn = WeightDrop(torch.nn.LSTM(10, 10), ['weight_hh_l0'], dropout=0.9)
wdrnn.cuda()

run1 = [x.sum() for x in wdrnn(x, h0)[0].data]
run2 = [x.sum() for x in wdrnn(x, h0)[0].data]

print('First timesteps should be equal, all others should differ')
print('Run 1:', run1)
print('Run 2:', run2)

# First time step, not influenced by hidden to hidden weights, should be equal
assert run1[0] == run2[0]
# Second step should not
assert run1[1] != run2[1]

print('---')

+ 109
- 0
reproduction/text_classification/train_HAN.py View File

@@ -0,0 +1,109 @@
# The following paths need to be added to the environment variables first; since this is currently only open for internal testing, the paths have to be declared manually

import os
import sys
sys.path.append('../../')
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

from fastNLP.core.const import Const as C
from fastNLP.core import LRScheduler
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from reproduction.text_classification.data.yelpLoader import yelpLoader
from reproduction.text_classification.model.HAN import HANCLS
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
import torch.cuda
from torch.optim.lr_scheduler import CosineAnnealingLR


##hyper

class Config():
model_dir_or_name = "en-base-uncased"
embedding_grad = False
train_epoch = 30
batch_size = 100
num_classes = 5
task = "yelp"
#datadir = '/remote-home/lyli/fastNLP/yelp_polarity/'
datadir = '/remote-home/ygwang/yelp_polarity/'
datafile = {"train": "train.csv", "test": "test.csv"}
lr = 1e-3

def __init__(self):
self.datapath = {k: os.path.join(self.datadir, v)
for k, v in self.datafile.items()}


ops = Config()

## 1. Task-related information: load the dataInfo with the dataloader

datainfo = yelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test']))


# post process
def make_sents(words):
sents = [words]
return sents


for dataset in datainfo.datasets.values():
dataset.apply_field(make_sents, field_name='words', new_field_name='input_sents')

datainfo.datasets['train'].set_input('input_sents')
datainfo.datasets['test'].set_input('input_sents')
datainfo.datasets['train'].set_target('target')
datainfo.datasets['test'].set_target('target')

## 2. Or directly reuse a fastNLP model

vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
embedding = StaticEmbedding(vocab)

print(len(vocab))
print(len(datainfo.vocabs['target']))

# model = DPCNN(init_embed=embedding, num_cls=ops.num_classes)
model = HANCLS(init_embed=embedding, num_cls=ops.num_classes)

## 3. Declare the loss, metric, and optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad == True],
lr=ops.lr, momentum=0.9, weight_decay=0)

callbacks = []
callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

for ds in datainfo.datasets.values():
ds.apply_field(len, C.INPUT, C.INPUT_LEN)
ds.set_input(C.INPUT, C.INPUT_LEN)
ds.set_target(C.TARGET)


## 4. Define the train method
def train(model, datainfo, loss, metrics, optimizer, num_epochs=ops.train_epoch):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
metrics=[metrics], dev_data=datainfo.datasets['test'], device=device,
check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
n_epochs=num_epochs)

print(trainer.train())


if __name__ == "__main__":
train(model, datainfo, loss, metric, optimizer)

+ 69
- 0
reproduction/text_classification/train_awdlstm.py View File

@@ -0,0 +1,69 @@
# This model needs to run under pytorch==0.4; weight_drop does not support 1.0

# The following paths need to be added to the environment variables first; since this is currently only open for internal testing, the paths have to be declared manually
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'


import torch.nn as nn

from data.IMDBLoader import IMDBLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from model.awd_lstm import AWDLSTMSentiment

from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse


class Config():
train_epoch= 10
lr=0.001

num_classes=2
hidden_dim=256
num_layers=1
nfc=128
wdrop=0.5

task_name = "IMDB"
datapath={"train":"IMDB_data/train.csv", "test":"IMDB_data/test.csv"}
save_model_path="./result_IMDB_test/"

opt=Config()


# load data
dataloader=IMDBLoader()
datainfo=dataloader.process(opt.datapath)

# print(datainfo.datasets["train"])
# print(datainfo)


# define model
vocab=datainfo.vocabs['words']
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True)
model=AWDLSTMSentiment(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, nfc=opt.nfc, wdrop=opt.wdrop)


# define loss_function and metrics
loss=CrossEntropyLoss()
metrics=AccuracyMetric()
optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr)


def train(datainfo, model, optimizer, loss, metrics, opt):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=opt.train_epoch, save_path=opt.save_model_path)
trainer.train()


if __name__ == "__main__":
train(datainfo, model, optimizer, loss, metrics, opt)

+ 205
- 0
reproduction/text_classification/train_char_cnn.py View File

@@ -0,0 +1,205 @@
# The following paths need to be added to the environment variables first; since this is currently only open for internal testing, the paths have to be declared manually
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'

import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
import torch.nn as nn
from data.yelpLoader import yelpLoader
from data.sstLoader import sst2Loader
from data.IMDBLoader import IMDBLoader
from model.char_cnn import CharacterLevelCNN
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.modules.encoder.embedding import CNNCharEmbedding,StaticEmbedding,StackEmbedding,LSTMCharEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
from torch.autograd import Variable
import torch
from fastNLP import BucketSampler

##hyper
# TODO: add fastNLP logging here
class Config():
model_dir_or_name="en-base-uncased"
embedding_grad = False
bert_embedding_layers = '4,-2,-1'
train_epoch= 50
num_classes=2
task= "IMDB"
#yelp_p
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
#IMDB
#datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
# "test": "/remote-home/ygwang/IMDB_data/test.csv"}
# sst
# datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
# "dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}

lr=0.01
batch_size=128
model_size="large"
number_of_characters=69
extra_characters=''
max_length=1014

char_cnn_config={
"alphabet": {
"en": {
"lower": {
"alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 69
},
"both": {
"alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 95
}
}
},
"model_parameters": {
"small": {
"conv": [
# each entry is [channels, kernel_size, max_pooling_size]
[256,7,3],
[256,7,3],
[256,3,-1],
[256,3,-1],
[256,3,-1],
[256,3,3]
],
"fc": [1024,1024]
},
"large":{
"conv":[
[1024, 7, 3],
[1024, 7, 3],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, 3]
],
"fc": [2048,2048]
}
},
"data": {
"text_column": "SentimentText",
"label_column": "Sentiment",
"max_length": 1014,
"num_of_classes": 2,
"encoding": None,
"chunksize": 50000,
"max_rows": 100000,
"preprocessing_steps": ["lower", "remove_hashtags", "remove_urls", "remove_user_mentions"]
},
"training": {
"batch_size": 128,
"learning_rate": 0.01,
"epochs": 10,
"optimizer": "sgd"
}
}
ops = Config()


## 1. Task-related information: load the dataInfo with the dataloader
#dataloader=sst2Loader()
#dataloader=IMDBLoader()
dataloader=yelpLoader(fine_grained=True)
datainfo=dataloader.process(ops.datapath,char_level_op=True)
char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters=len(char_vocab)
ops.embedding_dim=ops.number_of_characters

#chartoindex
def chartoindex(chars):
max_seq_len=ops.max_length
zero_index=len(char_vocab)
char_index_list=[]
for char in chars:
if char in char_vocab:
char_index_list.append(char_vocab.index(char))
else:
# both <unk> and <pad> use the last index as their embedding
char_index_list.append(zero_index)
if len(char_index_list) > max_seq_len:
char_index_list = char_index_list[:max_seq_len]
elif 0 < len(char_index_list) < max_seq_len:
char_index_list = char_index_list+[zero_index]*(max_seq_len-len(char_index_list))
elif len(char_index_list) == 0:
char_index_list=[zero_index]*max_seq_len
return char_index_list
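# Illustrative example (hypothetical, with ops.max_length shortened to 4 for readability):
# chartoindex("abA") -> [0, 1, 69, 69]; 'A' is not in the lower-case alphabet, so it and the
# padding both map to the reserved last index (69).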

for dataset in datainfo.datasets.values():
dataset.apply_field(chartoindex,field_name='chars',new_field_name='chars')

datainfo.datasets['train'].set_input('chars')
datainfo.datasets['test'].set_input('chars')
datainfo.datasets['train'].set_target('target')
datainfo.datasets['test'].set_target('target')

##2. Define/assemble the model. Anything goes here: if it is a model already packaged by fastNLP, such as CNNText, just instantiate and call it. ModelFactory below is only a pseudo-skeleton placeholder showing how to build a model that follows fastNLP's input/output conventions
class ModelFactory(nn.Module):
"""
Assembles the embedding, encoder, and decoder, and defines the forward process.

:param embedding: embedding model
:param encoder: encoder model
:param decoder: decoder model

"""
def __init__(self,embedding,encoder,decoder,**kwargs):
super(ModelFactory,self).__init__()
self.embedding=embedding
self.encoder=encoder
self.decoder=decoder

def forward(self,x):
return {C.OUTPUT:None}

## 2. Or directly reuse a fastNLP model
#vocab=datainfo.vocabs['words']
vocab_label=datainfo.vocabs['target']
'''
# emded_char=CNNCharEmbedding(vocab)
# embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
# embedding=StackEmbedding([emded_char, embed_word])
# cnn_char_embed = CNNCharEmbedding(vocab)
# lstm_char_embed = LSTMCharEmbedding(vocab)
# embedding = StackEmbedding([cnn_char_embed, lstm_char_embed])
'''
#one-hot embedding
embedding_weight= Variable(torch.zeros(len(char_vocab)+1, len(char_vocab)))

for i in range(len(char_vocab)):
embedding_weight[i][i]=1
embedding=nn.Embedding(num_embeddings=len(char_vocab)+1,embedding_dim=len(char_vocab),padding_idx=len(char_vocab),_weight=embedding_weight)
for para in embedding.parameters():
para.requires_grad=False
# CNNText is too simple for this task
#model=CNNText(init_embed=embedding, num_classes=ops.num_classes)
model=CharacterLevelCNN(ops,embedding)

## 3. Declare the loss, metric, and optimizer
loss=CrossEntropyLoss
metric=AccuracyMetric
optimizer= SGD([param for param in model.parameters() if param.requires_grad==True], lr=ops.lr)

## 4. Define the train method
def train(model,datainfo,loss,metrics,optimizer,num_epochs=100):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss(target='target'),
metrics=[metrics(target='target')], dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=num_epochs)
print(trainer.train())



if __name__=="__main__":
#print(vocab_label)

#print(datainfo.datasets["train"])
train(model,datainfo,loss,metric,optimizer,num_epochs=ops.train_epoch)

+ 120
- 0
reproduction/text_classification/train_dpcnn.py View File

@@ -0,0 +1,120 @@
# The following paths need to be added to the environment variables first; since this is currently only open for internal testing, the paths have to be declared manually

import torch.cuda
from fastNLP.core.utils import cache_results
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN
from data.yelpLoader import yelpLoader
from fastNLP.core.sampler import BucketSampler
import torch.nn as nn
from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C
from fastNLP.core.vocabulary import VocabularyOption
from utils.util_init import set_rng_seeds
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


# hyper

class Config():
seed = 12345
model_dir_or_name = "dpcnn-yelp-p"
embedding_grad = True
train_epoch = 30
batch_size = 100
task = "yelp_p"
#datadir = 'workdir/datasets/SST'
datadir = 'workdir/datasets/yelp_polarity'
# datadir = 'workdir/datasets/yelp_full'
#datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
datafile = {"train": "train.csv", "test": "test.csv"}
lr = 1e-3
src_vocab_op = VocabularyOption(max_size=100000)
embed_dropout = 0.3
cls_dropout = 0.1
weight_decay = 1e-5

def __init__(self):
self.datadir = os.path.join(os.environ['HOME'], self.datadir)
self.datapath = {k: os.path.join(self.datadir, v)
for k, v in self.datafile.items()}


ops = Config()

set_rng_seeds(ops.seed)
print('RNG SEED: {}'.format(ops.seed))

# 1. Task-related information: load the dataInfo with the dataloader

#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])


@cache_results(ops.model_dir_or_name+'-data-cache')
def load_data():
datainfo = yelpLoader(fine_grained=True, lower=True).process(
paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op)
for ds in datainfo.datasets.values():
ds.apply_field(len, C.INPUT, C.INPUT_LEN)
ds.set_input(C.INPUT, C.INPUT_LEN)
ds.set_target(C.TARGET)
embedding = StaticEmbedding(
datainfo.vocabs['words'], model_dir_or_name='en-glove-840b-300', requires_grad=ops.embedding_grad,
normalize=False
)
return datainfo, embedding


datainfo, embedding = load_data()

# 2. Or directly reuse a fastNLP model

# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])

print(datainfo)
print(datainfo.datasets['train'][0])

model = DPCNN(init_embed=embedding, num_cls=len(datainfo.vocabs[C.TARGET]),
embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout)
print(model)

# 3. Declare the loss, metric, and optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad == True],
lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay)

callbacks = []
# callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
callbacks.append(
LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
ops.train_epoch * 0.8 else ops.lr * 0.1))
)

# callbacks.append(
# FitlogCallback(data=datainfo.datasets, verbose=1)
# )

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

# 4. Define the train method
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size),
metrics=[metric],
dev_data=datainfo.datasets['test'], device=device,
check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
n_epochs=ops.train_epoch, num_workers=4)



if __name__ == "__main__":
print(trainer.train())

+ 66
- 0
reproduction/text_classification/train_lstm.py View File

@@ -0,0 +1,66 @@
# The following paths need to be added to the environment variables first; since this is currently only open for internal testing, the paths have to be declared manually
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'


import torch.nn as nn

from data.IMDBLoader import IMDBLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from model.lstm import BiLSTMSentiment

from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse


class Config():
train_epoch= 10
lr=0.001

num_classes=2
hidden_dim=256
num_layers=1
nfc=128

task_name = "IMDB"
datapath={"train":"IMDB_data/train.csv", "test":"IMDB_data/test.csv"}
save_model_path="./result_IMDB_test/"

opt=Config()


# load data
dataloader=IMDBLoader()
datainfo=dataloader.process(opt.datapath)

# print(datainfo.datasets["train"])
# print(datainfo)


# define model
vocab=datainfo.vocabs['words']
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True)
model=BiLSTMSentiment(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, nfc=opt.nfc)


# define loss_function and metrics
loss=CrossEntropyLoss()
metrics=AccuracyMetric()
optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr)


def train(datainfo, model, optimizer, loss, metrics, opt):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=opt.train_epoch, save_path=opt.save_model_path)
trainer.train()


if __name__ == "__main__":
train(datainfo, model, optimizer, loss, metrics, opt)

+ 68
- 0
reproduction/text_classification/train_lstm_att.py View File

@@ -0,0 +1,68 @@
# The following paths need to be added to the environment variables first; since this is currently only open for internal testing, the paths have to be declared manually
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'


import torch.nn as nn

from data.IMDBLoader import IMDBLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from model.lstm_self_attention import BiLSTM_SELF_ATTENTION

from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse


class Config():
train_epoch= 10
lr=0.001

num_classes=2
hidden_dim=256
num_layers=1
attention_unit=256
attention_hops=1
nfc=128

task_name = "IMDB"
datapath={"train":"IMDB_data/train.csv", "test":"IMDB_data/test.csv"}
save_model_path="./result_IMDB_test/"

opt=Config()


# load data
dataloader=IMDBLoader()
datainfo=dataloader.process(opt.datapath)

# print(datainfo.datasets["train"])
# print(datainfo)


# define model
vocab=datainfo.vocabs['words']
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True)
model=BiLSTM_SELF_ATTENTION(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, attention_unit=opt.attention_unit, attention_hops=opt.attention_hops, nfc=opt.nfc)


# define loss_function and metrics
loss=CrossEntropyLoss()
metrics=AccuracyMetric()
optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr)


def train(datainfo, model, optimizer, loss, metrics, opt):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=opt.train_epoch, save_path=opt.save_model_path)
trainer.train()


if __name__ == "__main__":
train(datainfo, model, optimizer, loss, metrics, opt)

+ 11
- 0
reproduction/text_classification/utils/util_init.py View File

@@ -0,0 +1,11 @@
import numpy
import torch
import random


def set_rng_seeds(seed):
random.seed(seed)
numpy.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# print('RNG_SEED {}'.format(seed))

+ 14
- 3
reproduction/utils.py View File

@@ -29,13 +29,15 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]:
path_pair = ('train', filename)
if 'dev' in filename:
if path_pair:
- raise Exception("File:{} in {} contains bot `{}` and `dev`.".format(filename, paths, path_pair[0]))
+ raise Exception("File:{} in {} contains both `{}` and `dev`.".format(filename, paths, path_pair[0]))
path_pair = ('dev', filename)
if 'test' in filename:
if path_pair:
- raise Exception("File:{} in {} contains bot `{}` and `test`.".format(filename, paths, path_pair[0]))
+ raise Exception("File:{} in {} contains both `{}` and `test`.".format(filename, paths, path_pair[0]))
path_pair = ('test', filename)
if path_pair:
+ if path_pair[0] in files:
+ raise RuntimeError(f"Multiple file under {paths} have '{path_pair[0]}' in their filename.")
files[path_pair[0]] = os.path.join(paths, path_pair[1])
return files
else:
@@ -57,4 +59,13 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]:
else:
raise TypeError(f"paths only supports str and dict. not {type(paths)}.")



def get_tokenizer():
try:
import spacy
spacy.prefer_gpu()
en = spacy.load('en')
print('use spacy tokenizer')
return lambda x: [w.text for w in en.tokenizer(x)]
except Exception as e:
print('use raw tokenizer')
return lambda x: x.split()
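
For reference, a hedged usage sketch of the new helper (assuming the repository root is on sys.path; without spacy and its 'en' model installed, the whitespace fallback is returned):

from reproduction.utils import get_tokenizer

tokenize = get_tokenizer()
print(tokenize("fastNLP makes text classification easy."))
# with spacy: ['fastNLP', 'makes', 'text', 'classification', 'easy', '.']
# with the fallback: ['fastNLP', 'makes', 'text', 'classification', 'easy.']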
