
Merge branch 'dev0.5.0' of github.com:fastnlp/fastNLP into dev0.5.0

tags/v0.4.10
yh_cc 5 years ago
parent
commit bdc7b18cb3
8 changed files with 84 additions and 72 deletions
  1. fastNLP/io/data_loader/sst.py  (+2 -2)
  2. fastNLP/models/star_transformer.py  (+3 -3)
  3. fastNLP/modules/encoder/star_transformer.py  (+2 -2)
  4. reproduction/Star_transformer/README.md  (+0 -23)
  5. reproduction/Star_transformer/train.py  (+6 -5)
  6. reproduction/seqence_labelling/ner/model/dilated_cnn.py  (+1 -11)
  7. reproduction/seqence_labelling/ner/train_idcnn.py  (+57 -16)
  8. reproduction/text_classification/train_dpcnn.py  (+13 -10)

fastNLP/io/data_loader/sst.py  (+2 -2)

@@ -60,8 +60,8 @@ class SSTLoader(DataSetLoader):
     def _get_one(self, data, subtree):
         tree = Tree.fromstring(data)
         if subtree:
-            return [([x.text for x in self.tokenizer(' '.join(t.leaves()))], t.label()) for t in tree.subtrees() ]
-        return [([x.text for x in self.tokenizer(' '.join(tree.leaves()))], tree.label())]
+            return [(self.tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees() ]
+        return [(self.tokenizer(' '.join(tree.leaves())), tree.label())]

     def process(self,
                 paths, train_subtree=True,
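
The old lines unwrapped spaCy token objects with `x.text`; the new lines store whatever `self.tokenizer` returns directly. A minimal sketch of the new behaviour, assuming the tokenizer already yields plain strings (the whitespace splitter and the tiny SST-style tree below are illustrative stand-ins, not fastNLP's actual `get_tokenizer` or data):

```python
# Hypothetical stand-ins: a whitespace tokenizer and a tiny SST-style tree.
from nltk import Tree

tokenizer = str.split  # assumed to return a list of strings, as the new code expects

def get_one(data, subtree):
    tree = Tree.fromstring(data)
    if subtree:
        return [(tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees()]
    return [(tokenizer(' '.join(tree.leaves())), tree.label())]

sample = "(3 (2 An) (4 (3 effective) (2 film)))"
print(get_one(sample, subtree=False))  # [(['An', 'effective', 'film'], '3')]
```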


fastNLP/models/star_transformer.py  (+3 -3)

@@ -46,8 +46,8 @@ class StarTransEnc(nn.Module):
         super(StarTransEnc, self).__init__()
         self.embedding = get_embeddings(init_embed)
         emb_dim = self.embedding.embedding_dim
-        #self.emb_fc = nn.Linear(emb_dim, hidden_size)
-        self.emb_drop = nn.Dropout(emb_dropout)
+        self.emb_fc = nn.Linear(emb_dim, hidden_size)
+        # self.emb_drop = nn.Dropout(emb_dropout)
         self.encoder = StarTransformer(hidden_size=hidden_size,
                                        num_layers=num_layers,
                                        num_head=num_head,
@@ -65,7 +65,7 @@ class StarTransEnc(nn.Module):
             [batch, hidden] 全局 relay 节点, 详见论文
         """
         x = self.embedding(x)
-        #x = self.emb_fc(self.emb_drop(x))
+        x = self.emb_fc(x)
         nodes, relay = self.encoder(x, mask)
         return nodes, relay
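
Net effect: the embeddings are now projected by `emb_fc` before entering the encoder, and the embedding dropout is disabled. A minimal sketch of the new forward path, with made-up dimensions rather than the model's real configuration:

```python
# Sketch of the new embedding path in StarTransEnc; dimensions are illustrative.
import torch
import torch.nn as nn

emb_dim, hidden_size = 300, 256           # assumed values, not from the repo config
embedding = nn.Embedding(1000, emb_dim)
emb_fc = nn.Linear(emb_dim, hidden_size)

x = torch.randint(0, 1000, (2, 7))        # [batch, seq_len] token ids
h = emb_fc(embedding(x))                  # new path; the old path applied Dropout first
print(h.shape)                            # torch.Size([2, 7, 256])
```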




fastNLP/modules/encoder/star_transformer.py  (+2 -2)

@@ -34,8 +34,8 @@ class StarTransformer(nn.Module):
         super(StarTransformer, self).__init__()
         self.iters = num_layers

-        self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)])
-        self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
+        self.norm = nn.ModuleList([nn.LayerNorm(hidden_size, eps=1e-6) for _ in range(self.iters)])
+        # self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
         self.emb_drop = nn.Dropout(dropout)
         self.ring_att = nn.ModuleList(
             [_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0)
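
The `eps` argument only changes the normalisation denominator `sqrt(var + eps)`, so it matters mainly for near-constant features. A small self-contained sketch of the difference between PyTorch's default `eps=1e-5` and the `1e-6` used here (input values are made up):

```python
# Sketch of what the eps argument changes; the tighter eps normalises
# low-variance features more aggressively.
import torch
import torch.nn as nn

x = torch.tensor([[1.000, 1.001, 0.999, 1.000]])   # near-constant features
print(nn.LayerNorm(4)(x))            # default eps=1e-5
print(nn.LayerNorm(4, eps=1e-6)(x))  # value used in the new code
```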


reproduction/Star_transformer/README.md  (+0 -23)

@@ -9,26 +9,3 @@ paper: [Star-Transformer](https://arxiv.org/abs/1902.09113)
 |Text Classification|SST|-|51.2|
 |Natural Language Inference|SNLI|-|83.76|

-## Usage
-``` python
-# for sequence labeling(ner, pos tagging, etc)
-from fastNLP.models.star_transformer import STSeqLabel
-model = STSeqLabel(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-
-# for sequence classification
-from fastNLP.models.star_transformer import STSeqCls
-model = STSeqCls(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-
-# for natural language inference
-from fastNLP.models.star_transformer import STNLICls
-model = STNLICls(
-    vocab_size=10000, num_cls=50,
-    emb_dim=300)
-
-```

reproduction/Star_transformer/train.py  (+6 -5)

@@ -10,7 +10,8 @@ from fastNLP.models.star_transformer import STSeqLabel, STSeqCls, STNLICls
 from fastNLP.core.const import Const as C
 import sys
 #sys.path.append('/remote-home/yfshao/workdir/dev_fastnlp/')
-pre_dir = '/home/ec2-user/fast_data/'
+import os
+pre_dir = os.path.join(os.environ['HOME'], 'workdir/datasets/')

 g_model_select = {
     'pos': STSeqLabel,
@@ -19,7 +20,7 @@ g_model_select = {
     'nli': STNLICls,
 }

-g_emb_file_path = {'en': pre_dir + 'glove.840B.300d.txt',
+g_emb_file_path = {'en': pre_dir + 'word_vector/glove.840B.300d.txt',
                    'zh': pre_dir + 'cc.zh.300.vec'}

 g_args = None
@@ -55,7 +56,7 @@ def get_conll2012_ner():


 def get_sst():
-    path = pre_dir + 'sst'
+    path = pre_dir + 'SST'
     files = ['train.txt', 'dev.txt', 'test.txt']
     return load_sst(path, files)
@@ -171,10 +172,10 @@ def train():
                         sampler=FN.BucketSampler(100, g_args.bsz, C.INPUT_LEN),
                         callbacks=[MyCallback()])

-    trainer.train()
+    print(trainer.train())
     tester = FN.Tester(data=test_data, model=model, metrics=metric,
                        batch_size=128, device=device)
-    tester.test()
+    print(tester.test())


 def test():


reproduction/seqence_labelling/ner/model/dilated_cnn.py  (+1 -11)

@@ -106,7 +106,7 @@ class IDCNN(nn.Module):
         if self.crf is not None and target is not None:
             loss = self.crf(y.transpose(1, 2), t, mask)
         else:
-            t.masked_fill_(mask == 0, -100)
+            # t.masked_fill_(mask == 0, -100)
             loss = F.cross_entropy(y, t, ignore_index=-100)
         return loss
@@ -130,13 +130,3 @@ class IDCNN(nn.Module):
             C.OUTPUT: pred,
         }

-    def predict(self, words, seq_len, chars=None):
-        res = self.forward(
-            words=words,
-            seq_len=seq_len,
-            chars=chars,
-            target=None
-        )[C.OUTPUT]
-        return {
-            C.OUTPUT: res
-        }
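
With the in-place `masked_fill_` commented out, the loss relies on the targets already carrying `-100` at padded positions, which `F.cross_entropy(..., ignore_index=-100)` skips. A small self-contained sketch of that convention; the shapes (`[batch, num_tags, seq_len]` logits) and values are illustrative, not the model's real tensors:

```python
# Sketch of the ignore_index convention used by the loss above.
import torch
import torch.nn.functional as F

logits = torch.randn(1, 5, 4)              # [batch, num_tags, seq_len], 5 tags
target = torch.tensor([[1, 2, 0, -100]])   # last position marked as padding
loss = F.cross_entropy(logits, target, ignore_index=-100)
print(loss)                                # averaged over the 3 non-ignored positions
```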

reproduction/seqence_labelling/ner/train_idcnn.py  (+57 -16)

@@ -1,4 +1,5 @@
 from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
+from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
 from fastNLP.core.callback import FitlogCallback, LRScheduler
 from fastNLP import GradientClipCallback
 from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
@@ -7,10 +8,12 @@ from fastNLP import Const
 from fastNLP import RandomSampler, BucketSampler
 from fastNLP import SpanFPreRecMetric
 from fastNLP import Trainer
+from fastNLP.core.metrics import MetricBase
 from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
 from fastNLP.core.utils import Option
 from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
 from fastNLP.core.utils import cache_results
+from fastNLP.core.vocabulary import VocabularyOption
 import sys
 import torch.cuda
 import os
@@ -24,43 +27,53 @@ encoding_type = 'bioes'
 def get_path(path):
     return os.path.join(os.environ['HOME'], path)

-data_path = get_path('workdir/datasets/ontonotes-v4')
-
 ops = Option(
     batch_size=128,
     num_epochs=100,
-    lr=3e-4,
+    lr=5e-4,
     repeats=3,
     num_layers=3,
     num_filters=400,
-    use_crf=True,
+    use_crf=False,
     gradient_clip=5,
 )

-@cache_results('ontonotes-cache')
+@cache_results('ontonotes-min_freq0-case-cache')
 def load_data():
-    data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(data_path,
-                                                                      lower=True)
+    print('loading data')
+    # data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(
+    #     data_path = get_path('workdir/datasets/ontonotes-v4')
+    #     lower=False,
+    #     word_vocab_opt=VocabularyOption(min_freq=0),
+    # )
+    data = Conll2003DataLoader(task='ner', encoding_type=encoding_type).process(
+        paths=get_path('workdir/datasets/conll03'),
+        lower=False, word_vocab_opt=VocabularyOption(min_freq=0)
+    )

     # char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
     #                               kernel_sizes=[3])

+    print('loading embedding')
     word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                  model_dir_or_name='en-glove-840b-300',
                                  requires_grad=True)
     return data, [word_embed]

 data, embeds = load_data()
+print(data)
 print(data.datasets['train'][0])
 print(list(data.vocabs.keys()))

-for ds in data.datasets.values():
-    ds.rename_field('cap_words', 'chars')
-    ds.set_input('chars')
+# for ds in data.datasets.values():
+#     ds.rename_field('cap_words', 'chars')
+#     ds.set_input('chars')

 word_embed = embeds[0]
-char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
+word_embed.embedding.weight.data /= word_embed.embedding.weight.data.std()
+
+# char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
+char_embed = None
 # for ds in data.datasets:
 #     ds.rename_field('')
@@ -75,13 +88,42 @@ model = IDCNN(init_embed=word_embed,
               kernel_size=3,
               use_crf=ops.use_crf, use_projection=True,
               block_loss=True,
-              input_dropout=0.33, hidden_dropout=0.2, inner_dropout=0.2)
+              input_dropout=0.5, hidden_dropout=0.0, inner_dropout=0.0)

 print(model)

 callbacks = [GradientClipCallback(clip_value=ops.gradient_clip, clip_type='norm'),]
+metrics = []
+metrics.append(
+    SpanFPreRecMetric(
+        tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type,
+        pred=Const.OUTPUT, target=Const.TARGET, seq_len=Const.INPUT_LEN,
+    )
+)
+
+class LossMetric(MetricBase):
+    def __init__(self, loss=None):
+        super(LossMetric, self).__init__()
+        self._init_param_map(loss=loss)
+        self.total_loss = 0.0
+        self.steps = 0
+
+    def evaluate(self, loss):
+        self.total_loss += float(loss)
+        self.steps += 1
+
+    def get_metric(self, reset=True):
+        result = {'loss': self.total_loss / (self.steps + 1e-12)}
+        if reset:
+            self.total_loss = 0.0
+            self.steps = 0
+        return result
+
+metrics.append(
+    LossMetric(loss=Const.LOSS)
+)

-optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=0)
+optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=1e-4)
 # scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
 # callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 15)))
 # optimizer = SWATS(model.parameters(), verbose=True)
@@ -92,8 +134,7 @@ device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer,
                   sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size),
                   device=device, dev_data=data.datasets['dev'], batch_size=ops.batch_size,
-                  metrics=SpanFPreRecMetric(
-                      tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
+                  metrics=metrics,
                   check_code_level=-1,
                   callbacks=callbacks, num_workers=2, n_epochs=ops.num_epochs)
 trainer.train()
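
The new `LossMetric` reports the average evaluation loss alongside span F1. A framework-free re-statement of the same accumulate / average / reset contract (a simplified stand-in, not fastNLP's `MetricBase` machinery):

```python
# Simplified stand-in for the LossMetric above, without the fastNLP base class.
class AvgLoss:
    def __init__(self):
        self.total_loss = 0.0
        self.steps = 0

    def evaluate(self, loss):          # called once per evaluation batch
        self.total_loss += float(loss)
        self.steps += 1

    def get_metric(self, reset=True):  # called once per evaluation round
        result = {'loss': self.total_loss / (self.steps + 1e-12)}
        if reset:
            self.total_loss, self.steps = 0.0, 0
        return result

m = AvgLoss()
for batch_loss in (0.9, 0.7, 0.5):
    m.evaluate(batch_loss)
print(m.get_metric())  # {'loss': ~0.7}
```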

reproduction/text_classification/train_dpcnn.py  (+13 -10)

@@ -11,7 +11,7 @@ from reproduction.text_classification.model.dpcnn import DPCNN
 from data.yelpLoader import yelpLoader
 from fastNLP.core.sampler import BucketSampler
 import torch.nn as nn
-from fastNLP.core import LRScheduler
+from fastNLP.core import LRScheduler, Callback
 from fastNLP.core.const import Const as C
 from fastNLP.core.vocabulary import VocabularyOption
 from utils.util_init import set_rng_seeds
@@ -25,14 +25,14 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

 class Config():
     seed = 12345
-    model_dir_or_name = "dpcnn-yelp-p"
+    model_dir_or_name = "dpcnn-yelp-f"
     embedding_grad = True
     train_epoch = 30
     batch_size = 100
-    task = "yelp_p"
+    task = "yelp_f"
     #datadir = 'workdir/datasets/SST'
-    datadir = 'workdir/datasets/yelp_polarity'
-    # datadir = 'workdir/datasets/yelp_full'
+    # datadir = 'workdir/datasets/yelp_polarity'
+    datadir = 'workdir/datasets/yelp_full'
     #datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
     datafile = {"train": "train.csv", "test": "test.csv"}
     lr = 1e-3
@@ -73,6 +73,8 @@ def load_data():


 datainfo, embedding = load_data()
+embedding.embedding.weight.data /= embedding.embedding.weight.data.std()
+print(embedding.embedding.weight.mean(), embedding.embedding.weight.std())

 # 2.或直接复用fastNLP的模型

@@ -92,11 +94,12 @@ optimizer = SGD([param for param in model.parameters() if param.requires_grad ==
                 lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay)

 callbacks = []
-# callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
-callbacks.append(
-    LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
-                         ops.train_epoch * 0.8 else ops.lr * 0.1))
-)
+callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
+# callbacks.append(
+#     LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
+#                          ops.train_epoch * 0.8 else ops.lr * 0.1))
+# )

 # callbacks.append(
 #     FitlogCallback(data=datainfo.datasets, verbose=1)
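
The callback switch replaces the step-style `LambdaLR` drop with cosine annealing over 5 epochs. A small sketch of the raw PyTorch schedule being wrapped here; the dummy parameter and `lr=1e-3` are illustrative, and the fastNLP `LRScheduler` callback is assumed to step the wrapped scheduler once per epoch:

```python
# Sketch of the underlying PyTorch schedule; the parameter and lr are dummies.
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = SGD(params, lr=1e-3, momentum=0.9)
scheduler = CosineAnnealingLR(optimizer, T_max=5)

for epoch in range(10):
    print(epoch, scheduler.get_last_lr())  # lr follows a cosine curve with period 2*T_max
    optimizer.step()
    scheduler.step()
```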

