@@ -60,8 +60,8 @@ class SSTLoader(DataSetLoader):
     def _get_one(self, data, subtree):
         tree = Tree.fromstring(data)
         if subtree:
-            return [([x.text for x in self.tokenizer(' '.join(t.leaves()))], t.label()) for t in tree.subtrees() ]
-        return [([x.text for x in self.tokenizer(' '.join(tree.leaves()))], tree.label())]
+            return [(self.tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees()]
+        return [(self.tokenizer(' '.join(tree.leaves())), tree.label())]

     def process(self,
                 paths, train_subtree=True,
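Review note: after this change `SSTLoader` assumes `self.tokenizer` returns a plain list of strings, which is why the `[x.text for x in ...]` unwrapping (needed for spaCy-style `Token` objects) could be dropped. A minimal sketch of the new contract, using NLTK's `Tree` (which `Tree.fromstring` here appears to be) and a hypothetical whitespace tokenizer as a stand-in for the real one:

```python
from nltk.tree import Tree

def tokenizer(text):
    # hypothetical stand-in: must return list-of-str, as the new code expects
    return text.split()

data = "(3 (2 An) (3 (2 amazing) (2 film)))"
tree = Tree.fromstring(data)
# equivalent of the new non-subtree branch:
sample = [(tokenizer(' '.join(tree.leaves())), tree.label())]
print(sample)  # [(['An', 'amazing', 'film'], '3')]
```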
@@ -46,8 +46,8 @@ class StarTransEnc(nn.Module):
         super(StarTransEnc, self).__init__()
         self.embedding = get_embeddings(init_embed)
         emb_dim = self.embedding.embedding_dim
-        #self.emb_fc = nn.Linear(emb_dim, hidden_size)
-        self.emb_drop = nn.Dropout(emb_dropout)
+        self.emb_fc = nn.Linear(emb_dim, hidden_size)
+        # self.emb_drop = nn.Dropout(emb_dropout)
         self.encoder = StarTransformer(hidden_size=hidden_size,
                                        num_layers=num_layers,
                                        num_head=num_head,

@@ -65,7 +65,7 @@ class StarTransEnc(nn.Module):
             [batch, hidden] the global relay node; see the paper for details
         """
         x = self.embedding(x)
-        #x = self.emb_fc(self.emb_drop(x))
+        x = self.emb_fc(x)
         nodes, relay = self.encoder(x, mask)
         return nodes, relay
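Review note: these two hunks flip which embedding preprocessing step is live — the `emb_fc` projection from `emb_dim` to `hidden_size` is re-enabled and `emb_drop` is disabled, so the Star-Transformer encoder always receives `hidden_size`-wide input even when the embeddings have a different width. A shape sketch with assumed dimensions:

```python
import torch
import torch.nn as nn

emb_dim, hidden_size = 300, 512          # assumed values for illustration
emb_fc = nn.Linear(emb_dim, hidden_size)

x = torch.randn(8, 40, emb_dim)          # [batch, seq_len, emb_dim] embeddings
x = emb_fc(x)                            # what the new forward does (no dropout now)
assert x.shape == (8, 40, hidden_size)   # encoder input is always hidden_size wide
```

The next hunk is consistent with this: the unused Conv2d `emb_fc` inside `StarTransformer` itself is commented out, so the projection now lives solely in `StarTransEnc`.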
@@ -34,8 +34,8 @@ class StarTransformer(nn.Module):
         super(StarTransformer, self).__init__()
         self.iters = num_layers

-        self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)])
-        self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
+        self.norm = nn.ModuleList([nn.LayerNorm(hidden_size, eps=1e-6) for _ in range(self.iters)])
+        # self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
         self.emb_drop = nn.Dropout(dropout)
         self.ring_att = nn.ModuleList(
             [_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0)
@@ -9,26 +9,3 @@ paper: [Star-Transformer](https://arxiv.org/abs/1902.09113)
 |Text Classification|SST|-|51.2|
 |Natural Language Inference|SNLI|-|83.76|

-## Usage
-``` python
-# for sequence labeling (ner, pos tagging, etc.)
-from fastNLP.models.star_transformer import STSeqLabel
-model = STSeqLabel(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-# for sequence classification
-from fastNLP.models.star_transformer import STSeqCls
-model = STSeqCls(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-# for natural language inference
-from fastNLP.models.star_transformer import STNLICls
-model = STNLICls(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-```
@@ -10,7 +10,8 @@ from fastNLP.models.star_transformer import STSeqLabel, STSeqCls, STNLICls
 from fastNLP.core.const import Const as C
 import sys
 #sys.path.append('/remote-home/yfshao/workdir/dev_fastnlp/')
-pre_dir = '/home/ec2-user/fast_data/'
+import os
+pre_dir = os.path.join(os.environ['HOME'], 'workdir/datasets/')

 g_model_select = {
     'pos': STSeqLabel,

@@ -19,7 +20,7 @@ g_model_select = {
     'nli': STNLICls,
 }

-g_emb_file_path = {'en': pre_dir + 'glove.840B.300d.txt',
+g_emb_file_path = {'en': pre_dir + 'word_vector/glove.840B.300d.txt',
                    'zh': pre_dir + 'cc.zh.300.vec'}

 g_args = None

@@ -55,7 +56,7 @@ def get_conll2012_ner():


 def get_sst():
-    path = pre_dir + 'sst'
+    path = pre_dir + 'SST'
     files = ['train.txt', 'dev.txt', 'test.txt']
     return load_sst(path, files)

@@ -171,10 +172,10 @@ def train():
                         sampler=FN.BucketSampler(100, g_args.bsz, C.INPUT_LEN),
                         callbacks=[MyCallback()])

-    trainer.train()
+    print(trainer.train())
     tester = FN.Tester(data=test_data, model=model, metrics=metric,
                        batch_size=128, device=device)
-    tester.test()
+    print(tester.test())


 def test():
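Review note: wrapping these calls in `print` relies on `Trainer.train()` and `Tester.test()` returning their results as dicts; previously the numbers were simply discarded. A hedged sketch of the shapes involved (exact keys depend on the fastNLP version in use — treat them as indicative only):

```python
# results = trainer.train()
# -> e.g. {'best_eval': {'AccuracyMetric': {'acc': 0.51}},
#          'best_epoch': 17, 'best_step': 3400, 'seconds': 1234.5}
# results = tester.test()
# -> e.g. {'AccuracyMetric': {'acc': 0.51}}
```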
@@ -106,7 +106,7 @@ class IDCNN(nn.Module):
             if self.crf is not None and target is not None:
                 loss = self.crf(y.transpose(1, 2), t, mask)
             else:
-                t.masked_fill_(mask == 0, -100)
+                # t.masked_fill_(mask == 0, -100)
                 loss = F.cross_entropy(y, t, ignore_index=-100)
             return loss
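Review note: with the in-place `masked_fill_` commented out, `F.cross_entropy` skips padded positions only if the target tensor `t` already carries `-100` there (set upstream in the data pipeline); otherwise pad tokens now contribute to the loss, which is worth double-checking. A minimal sketch of the `ignore_index` mechanics the surviving line relies on:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(1, 5, 4)              # [batch, num_classes, seq_len]
target = torch.tensor([[2, 0, 1, -100]])   # last position marked as padding
# positions where target == -100 contribute neither loss nor gradient
loss = F.cross_entropy(logits, target, ignore_index=-100)
print(loss)
```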
@@ -130,13 +130,3 @@ class IDCNN(nn.Module):
             C.OUTPUT: pred,
         }
-
-    def predict(self, words, seq_len, chars=None):
-        res = self.forward(
-            words=words,
-            seq_len=seq_len,
-            chars=chars,
-            target=None
-        )[C.OUTPUT]
-        return {
-            C.OUTPUT: res
-        }
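Review note: the deleted `predict` was a thin wrapper that re-packed what `forward` already returns under `C.OUTPUT`, so callers can do the same thing directly (sketch below, assuming prepared batch tensors). One caveat worth verifying: if fastNLP's `Predictor` prefers a `predict` method when one exists, inference now falls back to `forward` after this removal.

```python
import torch

# equivalent of the removed helper, assuming `model` is an IDCNN instance
# and `words` / `seq_len` are already-built batch tensors:
with torch.no_grad():
    pred = model(words=words, seq_len=seq_len, chars=None, target=None)[C.OUTPUT]
```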
@@ -1,4 +1,5 @@
 from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
+from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
 from fastNLP.core.callback import FitlogCallback, LRScheduler
 from fastNLP import GradientClipCallback
 from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR

@@ -7,10 +8,12 @@ from fastNLP import Const
 from fastNLP import RandomSampler, BucketSampler
 from fastNLP import SpanFPreRecMetric
 from fastNLP import Trainer
+from fastNLP.core.metrics import MetricBase
 from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
 from fastNLP.core.utils import Option
 from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
 from fastNLP.core.utils import cache_results
+from fastNLP.core.vocabulary import VocabularyOption
 import sys
 import torch.cuda
 import os
@@ -24,43 +27,53 @@ encoding_type = 'bioes'

 def get_path(path):
     return os.path.join(os.environ['HOME'], path)

-data_path = get_path('workdir/datasets/ontonotes-v4')

 ops = Option(
     batch_size=128,
     num_epochs=100,
-    lr=3e-4,
+    lr=5e-4,
     repeats=3,
     num_layers=3,
     num_filters=400,
-    use_crf=True,
+    use_crf=False,
     gradient_clip=5,
 )

-@cache_results('ontonotes-cache')
+@cache_results('ontonotes-min_freq0-case-cache')
 def load_data():
-    data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(data_path,
-                                                                      lower=True)
+    print('loading data')
+    # data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(
+    #     data_path=get_path('workdir/datasets/ontonotes-v4'),
+    #     lower=False,
+    #     word_vocab_opt=VocabularyOption(min_freq=0),
+    # )
+    data = Conll2003DataLoader(task='ner', encoding_type=encoding_type).process(
+        paths=get_path('workdir/datasets/conll03'),
+        lower=False, word_vocab_opt=VocabularyOption(min_freq=0)
+    )

     # char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
     #                               kernel_sizes=[3])
+    print('loading embedding')
     word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                  model_dir_or_name='en-glove-840b-300',
                                  requires_grad=True)
     return data, [word_embed]

 data, embeds = load_data()
+print(data)
 print(data.datasets['train'][0])
 print(list(data.vocabs.keys()))
-for ds in data.datasets.values():
-    ds.rename_field('cap_words', 'chars')
-    ds.set_input('chars')
+# for ds in data.datasets.values():
+#     ds.rename_field('cap_words', 'chars')
+#     ds.set_input('chars')
 word_embed = embeds[0]
-char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
+word_embed.embedding.weight.data /= word_embed.embedding.weight.data.std()
+# char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
+char_embed = None
 # for ds in data.datasets:
 #     ds.rename_field('')
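Review note: the new in-place division rescales the pretrained GloVe matrix to unit standard deviation, which keeps the input scale stable for the now-unregularized CNN stack (the DPCNN script below applies the same trick). A small numeric sketch:

```python
import torch

w = torch.randn(1000, 300) * 0.37   # stand-in for GloVe weights with a small std
w /= w.std()                        # in-place global rescale, as in the diff
print(w.std())                      # ~1.0; directions and ratios are unchanged
```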
@@ -75,13 +88,42 @@ model = IDCNN(init_embed=word_embed,
               kernel_size=3,
               use_crf=ops.use_crf, use_projection=True,
               block_loss=True,
-              input_dropout=0.33, hidden_dropout=0.2, inner_dropout=0.2)
+              input_dropout=0.5, hidden_dropout=0.0, inner_dropout=0.0)

 print(model)
 callbacks = [GradientClipCallback(clip_value=ops.gradient_clip, clip_type='norm'),]

+metrics = []
+metrics.append(
+    SpanFPreRecMetric(
+        tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type,
+        pred=Const.OUTPUT, target=Const.TARGET, seq_len=Const.INPUT_LEN,
+    )
+)
+
+class LossMetric(MetricBase):
+    def __init__(self, loss=None):
+        super(LossMetric, self).__init__()
+        self._init_param_map(loss=loss)
+        self.total_loss = 0.0
+        self.steps = 0
+
+    def evaluate(self, loss):
+        self.total_loss += float(loss)
+        self.steps += 1
+
+    def get_metric(self, reset=True):
+        result = {'loss': self.total_loss / (self.steps + 1e-12)}
+        if reset:
+            self.total_loss = 0.0
+            self.steps = 0
+        return result
+
+metrics.append(
+    LossMetric(loss=Const.LOSS)
+)
+
-optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=0)
+optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=1e-4)
 # scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
 # callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 15)))
 # optimizer = SWATS(model.parameters(), verbose=True)
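Review note: `LossMetric` piggybacks on the metric machinery to report the average dev loss next to span F1. `_init_param_map(loss=loss)` tells `MetricBase` which model-output field (here `Const.LOSS`) to feed into `evaluate(loss=...)` on each evaluation batch; `get_metric` averages and, by default, resets. Note it is a per-batch mean, not a per-token one. A standalone sketch of the accumulate/reset cycle, calling `evaluate` directly and bypassing the fastNLP field mapping:

```python
metric = LossMetric(loss='loss')         # maps the output field named 'loss'
for batch_loss in [0.9, 0.7, 0.5]:       # pretend per-batch dev losses
    metric.evaluate(loss=batch_loss)     # normally fastNLP calls this per batch
print(metric.get_metric())               # ~{'loss': 0.7}; counters reset afterwards
```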
@@ -92,8 +134,7 @@ device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer,
                   sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size),
                   device=device, dev_data=data.datasets['dev'], batch_size=ops.batch_size,
-                  metrics=SpanFPreRecMetric(
-                      tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
+                  metrics=metrics,
                   check_code_level=-1,
                   callbacks=callbacks, num_workers=2, n_epochs=ops.num_epochs)
 trainer.train()
@@ -11,7 +11,7 @@ from reproduction.text_classification.model.dpcnn import DPCNN
 from data.yelpLoader import yelpLoader
 from fastNLP.core.sampler import BucketSampler
 import torch.nn as nn
-from fastNLP.core import LRScheduler
+from fastNLP.core import LRScheduler, Callback
 from fastNLP.core.const import Const as C
 from fastNLP.core.vocabulary import VocabularyOption
 from utils.util_init import set_rng_seeds
@@ -25,14 +25,14 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

 class Config():
     seed = 12345
-    model_dir_or_name = "dpcnn-yelp-p"
+    model_dir_or_name = "dpcnn-yelp-f"
     embedding_grad = True
     train_epoch = 30
     batch_size = 100
-    task = "yelp_p"
+    task = "yelp_f"
     #datadir = 'workdir/datasets/SST'
-    datadir = 'workdir/datasets/yelp_polarity'
-    # datadir = 'workdir/datasets/yelp_full'
+    # datadir = 'workdir/datasets/yelp_polarity'
+    datadir = 'workdir/datasets/yelp_full'
     #datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
     datafile = {"train": "train.csv", "test": "test.csv"}
     lr = 1e-3
@@ -73,6 +73,8 @@ def load_data():

 datainfo, embedding = load_data()
+embedding.embedding.weight.data /= embedding.embedding.weight.data.std()
+print(embedding.embedding.weight.mean(), embedding.embedding.weight.std())

 # 2. Or directly reuse a fastNLP model
@@ -92,11 +94,12 @@ optimizer = SGD([param for param in model.parameters() if param.requires_grad ==
                 lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay)

 callbacks = []
-# callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
-callbacks.append(
-    LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
-                         ops.train_epoch * 0.8 else ops.lr * 0.1))
-)
+callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
+# callbacks.append(
+#     LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
+#                          ops.train_epoch * 0.8 else ops.lr * 0.1))
+# )

 # callbacks.append(
 #     FitlogCallback(data=datainfo.datasets, verbose=1)
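A note on the schedule swap: `LambdaLR` multiplies the *base* lr by whatever the lambda returns, so the removed schedule (returning `ops.lr` or `ops.lr * 0.1` instead of `1.0` / `0.1`) was scaling the learning rate down by an extra factor of `ops.lr`; the cosine schedule sidesteps that pitfall. With `CosineAnnealingLR(optimizer, 5)` (`T_max=5`, default `eta_min=0`), the lr follows a cosine from the base lr toward 0 over 5 epochs and climbs back over the next 5 if stepping continues. A quick sketch of the closed form PyTorch uses:

```python
import math

base_lr, T_max = 1e-3, 5
for epoch in range(11):
    # closed form of torch.optim.lr_scheduler.CosineAnnealingLR with eta_min=0
    lr = base_lr * (1 + math.cos(math.pi * epoch / T_max)) / 2
    print(epoch, lr)
# epochs 0..5 anneal 1e-3 -> 0, then climb back toward 1e-3 by epoch 10
```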