diff --git a/fastNLP/io/data_loader/sst.py b/fastNLP/io/data_loader/sst.py
index a7a35aee..05d63e2f 100644
--- a/fastNLP/io/data_loader/sst.py
+++ b/fastNLP/io/data_loader/sst.py
@@ -60,8 +60,8 @@ class SSTLoader(DataSetLoader):
     def _get_one(self, data, subtree):
         tree = Tree.fromstring(data)
         if subtree:
-            return [([x.text for x in self.tokenizer(' '.join(t.leaves()))], t.label()) for t in tree.subtrees() ]
-        return [([x.text for x in self.tokenizer(' '.join(tree.leaves()))], tree.label())]
+            return [(self.tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees() ]
+        return [(self.tokenizer(' '.join(tree.leaves())), tree.label())]
 
     def process(self, paths, train_subtree=True,
diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py
index 1aba5a8c..bb91a5b6 100644
--- a/fastNLP/models/star_transformer.py
+++ b/fastNLP/models/star_transformer.py
@@ -46,8 +46,8 @@ class StarTransEnc(nn.Module):
         super(StarTransEnc, self).__init__()
         self.embedding = get_embeddings(init_embed)
         emb_dim = self.embedding.embedding_dim
-        #self.emb_fc = nn.Linear(emb_dim, hidden_size)
-        self.emb_drop = nn.Dropout(emb_dropout)
+        self.emb_fc = nn.Linear(emb_dim, hidden_size)
+        # self.emb_drop = nn.Dropout(emb_dropout)
         self.encoder = StarTransformer(hidden_size=hidden_size,
                                        num_layers=num_layers,
                                        num_head=num_head,
@@ -65,7 +65,7 @@ class StarTransEnc(nn.Module):
             [batch, hidden] 全局 relay 节点, 详见论文
         """
         x = self.embedding(x)
-        #x = self.emb_fc(self.emb_drop(x))
+        x = self.emb_fc(x)
         nodes, relay = self.encoder(x, mask)
         return nodes, relay
 
diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py
index 76b7e922..097fbebb 100644
--- a/fastNLP/modules/encoder/star_transformer.py
+++ b/fastNLP/modules/encoder/star_transformer.py
@@ -34,8 +34,8 @@ class StarTransformer(nn.Module):
         super(StarTransformer, self).__init__()
         self.iters = num_layers
 
-        self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)])
-        self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
+        self.norm = nn.ModuleList([nn.LayerNorm(hidden_size, eps=1e-6) for _ in range(self.iters)])
+        # self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
         self.emb_drop = nn.Dropout(dropout)
         self.ring_att = nn.ModuleList(
             [_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0)
diff --git a/reproduction/Star_transformer/README.md b/reproduction/Star_transformer/README.md
index d07d5536..071e07e8 100644
--- a/reproduction/Star_transformer/README.md
+++ b/reproduction/Star_transformer/README.md
@@ -9,26 +9,3 @@ paper: [Star-Transformer](https://arxiv.org/abs/1902.09113)
 |Text Classification|SST|-|51.2|
 |Natural Language Inference|SNLI|-|83.76|
 
-## Usage
-``` python
-# for sequence labeling(ner, pos tagging, etc)
-from fastNLP.models.star_transformer import STSeqLabel
-model = STSeqLabel(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-
-# for sequence classification
-from fastNLP.models.star_transformer import STSeqCls
-model = STSeqCls(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-
-# for natural language inference
-from fastNLP.models.star_transformer import STNLICls
-model = STNLICls(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-```
diff --git a/reproduction/Star_transformer/train.py b/reproduction/Star_transformer/train.py
index 480748df..f1e5c2f9 100644
--- a/reproduction/Star_transformer/train.py
+++ b/reproduction/Star_transformer/train.py
@@ -10,7 +10,8 @@ from fastNLP.models.star_transformer import STSeqLabel, STSeqCls, STNLICls
 from fastNLP.core.const import Const as C
 import sys
 #sys.path.append('/remote-home/yfshao/workdir/dev_fastnlp/')
-pre_dir = '/home/ec2-user/fast_data/'
+import os
+pre_dir = os.path.join(os.environ['HOME'], 'workdir/datasets/')
 
 g_model_select = {
     'pos': STSeqLabel,
@@ -19,7 +20,7 @@ g_model_select = {
     'nli': STNLICls,
 }
 
-g_emb_file_path = {'en': pre_dir + 'glove.840B.300d.txt',
+g_emb_file_path = {'en': pre_dir + 'word_vector/glove.840B.300d.txt',
                    'zh': pre_dir + 'cc.zh.300.vec'}
 
 g_args = None
@@ -55,7 +56,7 @@ def get_conll2012_ner():
 
 
 def get_sst():
-    path = pre_dir + 'sst'
+    path = pre_dir + 'SST'
     files = ['train.txt', 'dev.txt', 'test.txt']
     return load_sst(path, files)
@@ -171,10 +172,10 @@ def train():
                          sampler=FN.BucketSampler(100, g_args.bsz, C.INPUT_LEN),
                          callbacks=[MyCallback()])
 
-    trainer.train()
+    print(trainer.train())
     tester = FN.Tester(data=test_data, model=model, metrics=metric,
                        batch_size=128, device=device)
-    tester.test()
+    print(tester.test())
 
 
 def test():
diff --git a/reproduction/seqence_labelling/ner/model/dilated_cnn.py b/reproduction/seqence_labelling/ner/model/dilated_cnn.py
index a4e02159..bf661354 100644
--- a/reproduction/seqence_labelling/ner/model/dilated_cnn.py
+++ b/reproduction/seqence_labelling/ner/model/dilated_cnn.py
@@ -106,7 +106,7 @@ class IDCNN(nn.Module):
         if self.crf is not None and target is not None:
             loss = self.crf(y.transpose(1, 2), t, mask)
         else:
-            t.masked_fill_(mask == 0, -100)
+            # t.masked_fill_(mask == 0, -100)
             loss = F.cross_entropy(y, t, ignore_index=-100)
         return loss
 
@@ -130,13 +130,3 @@ class IDCNN(nn.Module):
             C.OUTPUT: pred,
         }
 
-    def predict(self, words, seq_len, chars=None):
-        res = self.forward(
-            words=words,
-            seq_len=seq_len,
-            chars=chars,
-            target=None
-        )[C.OUTPUT]
-        return {
-            C.OUTPUT: res
-        }
diff --git a/reproduction/seqence_labelling/ner/train_idcnn.py b/reproduction/seqence_labelling/ner/train_idcnn.py
index 1781c763..7de8a61c 100644
--- a/reproduction/seqence_labelling/ner/train_idcnn.py
+++ b/reproduction/seqence_labelling/ner/train_idcnn.py
@@ -1,4 +1,5 @@
 from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
+from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
 from fastNLP.core.callback import FitlogCallback, LRScheduler
 from fastNLP import GradientClipCallback
 from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
@@ -7,10 +8,12 @@ from fastNLP import Const
 from fastNLP import RandomSampler, BucketSampler
 from fastNLP import SpanFPreRecMetric
 from fastNLP import Trainer
+from fastNLP.core.metrics import MetricBase
 from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
 from fastNLP.core.utils import Option
 from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
 from fastNLP.core.utils import cache_results
+from fastNLP.core.vocabulary import VocabularyOption
 import sys
 import torch.cuda
 import os
@@ -24,43 +27,53 @@ encoding_type = 'bioes'
 def get_path(path):
     return os.path.join(os.environ['HOME'], path)
 
-data_path = get_path('workdir/datasets/ontonotes-v4')
 
 ops = Option(
     batch_size=128,
     num_epochs=100,
-    lr=3e-4,
+    lr=5e-4,
     repeats=3,
     num_layers=3,
     num_filters=400,
-    use_crf=True,
+    use_crf=False,
     gradient_clip=5,
 )
 
-@cache_results('ontonotes-cache')
+@cache_results('ontonotes-min_freq0-case-cache')
 def load_data():
-
-    data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(data_path,
-                                                                      lower=True)
+    print('loading data')
+    # data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(
+    #     data_path = get_path('workdir/datasets/ontonotes-v4')
+    #     lower=False,
+    #     word_vocab_opt=VocabularyOption(min_freq=0),
+    # )
+    data = Conll2003DataLoader(task='ner', encoding_type=encoding_type).process(
+        paths=get_path('workdir/datasets/conll03'),
+        lower=False, word_vocab_opt=VocabularyOption(min_freq=0)
+    )
     # char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
     #                               kernel_sizes=[3])
-
+    print('loading embedding')
     word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                  model_dir_or_name='en-glove-840b-300',
                                  requires_grad=True)
     return data, [word_embed]
 
 data, embeds = load_data()
+print(data)
 print(data.datasets['train'][0])
 print(list(data.vocabs.keys()))
-for ds in data.datasets.values():
-    ds.rename_field('cap_words', 'chars')
-    ds.set_input('chars')
+# for ds in data.datasets.values():
+#     ds.rename_field('cap_words', 'chars')
+#     ds.set_input('chars')
 word_embed = embeds[0]
-char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
+word_embed.embedding.weight.data /= word_embed.embedding.weight.data.std()
+
+# char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
+char_embed = None
 # for ds in data.datasets:
 #     ds.rename_field('')
@@ -75,13 +88,42 @@ model = IDCNN(init_embed=word_embed,
               kernel_size=3, use_crf=ops.use_crf, use_projection=True,
               block_loss=True,
-              input_dropout=0.33, hidden_dropout=0.2, inner_dropout=0.2)
+              input_dropout=0.5, hidden_dropout=0.0, inner_dropout=0.0)
 
 print(model)
 
 callbacks = [GradientClipCallback(clip_value=ops.gradient_clip, clip_type='norm'),]
 
+metrics = []
+metrics.append(
+    SpanFPreRecMetric(
+        tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type,
+        pred=Const.OUTPUT, target=Const.TARGET, seq_len=Const.INPUT_LEN,
+    )
+)
+
+class LossMetric(MetricBase):
+    def __init__(self, loss=None):
+        super(LossMetric, self).__init__()
+        self._init_param_map(loss=loss)
+        self.total_loss = 0.0
+        self.steps = 0
+
+    def evaluate(self, loss):
+        self.total_loss += float(loss)
+        self.steps += 1
+
+    def get_metric(self, reset=True):
+        result = {'loss': self.total_loss / (self.steps + 1e-12)}
+        if reset:
+            self.total_loss = 0.0
+            self.steps = 0
+        return result
+
+metrics.append(
+    LossMetric(loss=Const.LOSS)
+)
 
-optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=0)
+optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=1e-4)
 # scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
 # callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 15)))
 # optimizer = SWATS(model.parameters(), verbose=True)
@@ -92,8 +134,7 @@ device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer,
                   sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size),
                   device=device, dev_data=data.datasets['dev'], batch_size=ops.batch_size,
-                  metrics=SpanFPreRecMetric(
-                      tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
+                  metrics=metrics,
                   check_code_level=-1,
                   callbacks=callbacks, num_workers=2, n_epochs=ops.num_epochs)
 trainer.train()
diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py
index 9664bf75..70570970 100644
--- a/reproduction/text_classification/train_dpcnn.py
+++ b/reproduction/text_classification/train_dpcnn.py
@@ -11,7 +11,7 @@ from reproduction.text_classification.model.dpcnn import DPCNN
 from data.yelpLoader import yelpLoader
 from fastNLP.core.sampler import BucketSampler
 import torch.nn as nn
-from fastNLP.core import LRScheduler
+from fastNLP.core import LRScheduler, Callback
 from fastNLP.core.const import Const as C
 from fastNLP.core.vocabulary import VocabularyOption
 from utils.util_init import set_rng_seeds
@@ -25,14 +25,14 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 
 class Config():
     seed = 12345
-    model_dir_or_name = "dpcnn-yelp-p"
+    model_dir_or_name = "dpcnn-yelp-f"
     embedding_grad = True
     train_epoch = 30
     batch_size = 100
-    task = "yelp_p"
+    task = "yelp_f"
     #datadir = 'workdir/datasets/SST'
-    datadir = 'workdir/datasets/yelp_polarity'
-    # datadir = 'workdir/datasets/yelp_full'
+    # datadir = 'workdir/datasets/yelp_polarity'
+    datadir = 'workdir/datasets/yelp_full'
     #datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
     datafile = {"train": "train.csv", "test": "test.csv"}
     lr = 1e-3
@@ -73,6 +73,8 @@ def load_data():
 
 datainfo, embedding = load_data()
+embedding.embedding.weight.data /= embedding.embedding.weight.data.std()
+print(embedding.embedding.weight.mean(), embedding.embedding.weight.std())
 
 
 # 2.或直接复用fastNLP的模型
@@ -92,11 +94,12 @@ optimizer = SGD([param for param in model.parameters() if param.requires_grad ==
                 lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay)
 
 callbacks = []
-# callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
-callbacks.append(
-    LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
-                ops.train_epoch * 0.8 else ops.lr * 0.1))
-)
+
+callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
+# callbacks.append(
+#     LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
+#                 ops.train_epoch * 0.8 else ops.lr * 0.1))
+# )
 # callbacks.append(
 #     FitlogCallback(data=datainfo.datasets, verbose=1)