
Merge branch 'dev0.5.0' of github.com:fastnlp/fastNLP into dev0.5.0

tags/v0.4.10
yh_cc · 5 years ago
commit bdc7b18cb3
8 changed files with 84 additions and 72 deletions:

1. fastNLP/io/data_loader/sst.py (+2 -2)
2. fastNLP/models/star_transformer.py (+3 -3)
3. fastNLP/modules/encoder/star_transformer.py (+2 -2)
4. reproduction/Star_transformer/README.md (+0 -23)
5. reproduction/Star_transformer/train.py (+6 -5)
6. reproduction/seqence_labelling/ner/model/dilated_cnn.py (+1 -11)
7. reproduction/seqence_labelling/ner/train_idcnn.py (+57 -16)
8. reproduction/text_classification/train_dpcnn.py (+13 -10)

fastNLP/io/data_loader/sst.py (+2 -2)

@@ -60,8 +60,8 @@ class SSTLoader(DataSetLoader):
def _get_one(self, data, subtree):
tree = Tree.fromstring(data)
if subtree:
return [([x.text for x in self.tokenizer(' '.join(t.leaves()))], t.label()) for t in tree.subtrees() ]
return [([x.text for x in self.tokenizer(' '.join(tree.leaves()))], tree.label())]
return [(self.tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees() ]
return [(self.tokenizer(' '.join(tree.leaves())), tree.label())]

def process(self,
paths, train_subtree=True,
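The surviving pair of lines calls `self.tokenizer(...)` directly instead of unwrapping `.text` from each token, so the loader now assumes the tokenizer returns plain strings. A standalone sketch of that behaviour, with `str.split` standing in for the loader's tokenizer and a made-up SST-style tree string:

```python
# Sketch only: str.split stands in for self.tokenizer, which is assumed to
# return a list of plain str tokens after this change.
from nltk import Tree

def get_one(data, tokenizer, subtree=False):
    """Return (tokens, label) pairs from an SST-style bracketed tree string."""
    tree = Tree.fromstring(data)
    if subtree:
        return [(tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees()]
    return [(tokenizer(' '.join(tree.leaves())), tree.label())]

sample = "(3 (2 It) (4 (4 (2 's) (3 (2 a) (4 (4 lovely) (2 film)))) (2 .)))"
print(get_one(sample, str.split))                     # one pair for the whole sentence
print(len(get_one(sample, str.split, subtree=True)))  # one pair per labelled subtree
```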


fastNLP/models/star_transformer.py (+3 -3)

@@ -46,8 +46,8 @@ class StarTransEnc(nn.Module):
super(StarTransEnc, self).__init__()
self.embedding = get_embeddings(init_embed)
emb_dim = self.embedding.embedding_dim
#self.emb_fc = nn.Linear(emb_dim, hidden_size)
self.emb_drop = nn.Dropout(emb_dropout)
self.emb_fc = nn.Linear(emb_dim, hidden_size)
# self.emb_drop = nn.Dropout(emb_dropout)
self.encoder = StarTransformer(hidden_size=hidden_size,
num_layers=num_layers,
num_head=num_head,
@@ -65,7 +65,7 @@ class StarTransEnc(nn.Module):
[batch, hidden] global relay node; see the paper for details
"""
x = self.embedding(x)
#x = self.emb_fc(self.emb_drop(x))
x = self.emb_fc(x)
nodes, relay = self.encoder(x, mask)
return nodes, relay
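Taken together, the two StarTransEnc hunks re-enable the `emb_fc` Linear projection (in `__init__` and in `forward`) and leave the embedding Dropout commented out, so token embeddings of size `emb_dim` are projected to `hidden_size` before the encoder. A toy sketch of the resulting input path; the sizes are arbitrary and this is not the library module itself:

```python
# Assumed shapes for illustration: embed tokens, project emb_dim -> hidden_size,
# then hand the projected sequence to the encoder (omitted here).
import torch
import torch.nn as nn

emb_dim, hidden_size, batch, seq_len = 300, 512, 4, 20
embedding = nn.Embedding(10000, emb_dim)
emb_fc = nn.Linear(emb_dim, hidden_size)   # now active, as in the diff
# emb_drop = nn.Dropout(0.1)               # stays disabled, as in the diff

x = torch.randint(0, 10000, (batch, seq_len))
h = emb_fc(embedding(x))                   # [batch, seq_len, hidden_size]
print(h.shape)
```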



fastNLP/modules/encoder/star_transformer.py (+2 -2)

@@ -34,8 +34,8 @@ class StarTransformer(nn.Module):
super(StarTransformer, self).__init__()
self.iters = num_layers
self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)])
self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
self.norm = nn.ModuleList([nn.LayerNorm(hidden_size, eps=1e-6) for _ in range(self.iters)])
# self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1)
self.emb_drop = nn.Dropout(dropout)
self.ring_att = nn.ModuleList(
[_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0)
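The encoder hunk passes `eps=1e-6` to each `nn.LayerNorm` (PyTorch's default is 1e-5) and comments out the unused `emb_fc` Conv2d. A quick check of what the smaller epsilon changes, on arbitrary low-variance input:

```python
# eps only enters the denominator sqrt(var + eps), so the two settings differ
# noticeably only when the activations' variance is tiny.
import torch
import torch.nn as nn

x = torch.randn(2, 8) * 1e-3            # low-variance activations
ln_default = nn.LayerNorm(8)            # eps=1e-5 (PyTorch default)
ln_small = nn.LayerNorm(8, eps=1e-6)    # value used in the diff
print((ln_default(x) - ln_small(x)).abs().max())
```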


reproduction/Star_transformer/README.md (+0 -23)

@@ -9,26 +9,3 @@ paper: [Star-Transformer](https://arxiv.org/abs/1902.09113)
|Text Classification|SST|-|51.2|
|Natural Language Inference|SNLI|-|83.76|

## Usage
``` python
# for sequence labeling(ner, pos tagging, etc)
from fastNLP.models.star_transformer import STSeqLabel
model = STSeqLabel(
vocab_size=10000, num_cls=50,
emb_dim=300)


# for sequence classification
from fastNLP.models.star_transformer import STSeqCls
model = STSeqCls(
vocab_size=10000, num_cls=50,
emb_dim=300)


# for natural language inference
from fastNLP.models.star_transformer import STNLICls
model = STNLICls(
vocab_size=10000, num_cls=50,
emb_dim=300)

```

reproduction/Star_transformer/train.py (+6 -5)

@@ -10,7 +10,8 @@ from fastNLP.models.star_transformer import STSeqLabel, STSeqCls, STNLICls
from fastNLP.core.const import Const as C
import sys
#sys.path.append('/remote-home/yfshao/workdir/dev_fastnlp/')
pre_dir = '/home/ec2-user/fast_data/'
import os
pre_dir = os.path.join(os.environ['HOME'], 'workdir/datasets/')

g_model_select = {
'pos': STSeqLabel,
@@ -19,7 +20,7 @@ g_model_select = {
'nli': STNLICls,
}

g_emb_file_path = {'en': pre_dir + 'glove.840B.300d.txt',
g_emb_file_path = {'en': pre_dir + 'word_vector/glove.840B.300d.txt',
'zh': pre_dir + 'cc.zh.300.vec'}

g_args = None
@@ -55,7 +56,7 @@ def get_conll2012_ner():


def get_sst():
path = pre_dir + 'sst'
path = pre_dir + 'SST'
files = ['train.txt', 'dev.txt', 'test.txt']
return load_sst(path, files)

@@ -171,10 +172,10 @@ def train():
sampler=FN.BucketSampler(100, g_args.bsz, C.INPUT_LEN),
callbacks=[MyCallback()])

trainer.train()
print(trainer.train())
tester = FN.Tester(data=test_data, model=model, metrics=metric,
batch_size=128, device=device)
tester.test()
print(tester.test())


def test():
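The train.py changes swap the hard-coded `/home/ec2-user/fast_data/` prefix for one built from `$HOME`, move the English embeddings under `word_vector/`, rename the SST directory, and print whatever `trainer.train()` and `tester.test()` return. A minimal sketch of the new path layout; the directory names are taken from the diff and are only meaningful on a machine laid out this way:

```python
# Resolve dataset and embedding locations relative to $HOME instead of a
# machine-specific absolute path, mirroring the updated script.
import os

pre_dir = os.path.join(os.environ['HOME'], 'workdir/datasets/')
g_emb_file_path = {
    'en': pre_dir + 'word_vector/glove.840B.300d.txt',
    'zh': pre_dir + 'cc.zh.300.vec',
}
sst_path = pre_dir + 'SST'
print(sst_path)
print(g_emb_file_path['en'])
```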


reproduction/seqence_labelling/ner/model/dilated_cnn.py (+1 -11)

@@ -106,7 +106,7 @@ class IDCNN(nn.Module):
if self.crf is not None and target is not None:
loss = self.crf(y.transpose(1, 2), t, mask)
else:
t.masked_fill_(mask == 0, -100)
# t.masked_fill_(mask == 0, -100)
loss = F.cross_entropy(y, t, ignore_index=-100)
return loss

@@ -130,13 +130,3 @@ class IDCNN(nn.Module):
C.OUTPUT: pred,
}

def predict(self, words, seq_len, chars=None):
res = self.forward(
words=words,
seq_len=seq_len,
chars=chars,
target=None
)[C.OUTPUT]
return {
C.OUTPUT: res
}
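In the loss hunk above, the explicit `t.masked_fill_(mask == 0, -100)` is commented out before the `F.cross_entropy(..., ignore_index=-100)` call. That is only equivalent if padded positions in the target already carry the ignore index; the toy check below (made-up shapes, not the model's real tensors) shows the two variants agreeing in that case:

```python
# ignore_index=-100 makes cross_entropy skip any position whose target is -100,
# so pre-filling padded targets with -100 is redundant when they already are.
import torch
import torch.nn.functional as F

num_tags, seq_len = 5, 4
y = torch.randn(1, num_tags, seq_len)      # [batch, num_tags, seq_len] logits
mask = torch.tensor([[1, 1, 0, 0]])        # last two positions are padding

t = torch.tensor([[1, 2, -100, -100]])     # padding already set to the ignore index
loss_plain = F.cross_entropy(y, t, ignore_index=-100)

t2 = torch.tensor([[1, 2, 0, 0]])          # padding left as ordinary labels
t2.masked_fill_(mask == 0, -100)           # the line the diff comments out
loss_masked = F.cross_entropy(y, t2, ignore_index=-100)
print(torch.allclose(loss_plain, loss_masked))   # True
```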

reproduction/seqence_labelling/ner/train_idcnn.py (+57 -16)

@@ -1,4 +1,5 @@
from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
from fastNLP.core.callback import FitlogCallback, LRScheduler
from fastNLP import GradientClipCallback
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
@@ -7,10 +8,12 @@ from fastNLP import Const
from fastNLP import RandomSampler, BucketSampler
from fastNLP import SpanFPreRecMetric
from fastNLP import Trainer
from fastNLP.core.metrics import MetricBase
from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
from fastNLP.core.utils import Option
from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
from fastNLP.core.utils import cache_results
from fastNLP.core.vocabulary import VocabularyOption
import sys
import torch.cuda
import os
@@ -24,43 +27,53 @@ encoding_type = 'bioes'
def get_path(path):
return os.path.join(os.environ['HOME'], path)

data_path = get_path('workdir/datasets/ontonotes-v4')

ops = Option(
batch_size=128,
num_epochs=100,
lr=3e-4,
lr=5e-4,
repeats=3,
num_layers=3,
num_filters=400,
use_crf=True,
use_crf=False,
gradient_clip=5,
)

@cache_results('ontonotes-cache')
@cache_results('ontonotes-min_freq0-case-cache')
def load_data():

data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(data_path,
lower=True)
print('loading data')
# data = OntoNoteNERDataLoader(encoding_type=encoding_type).process(
# data_path = get_path('workdir/datasets/ontonotes-v4')
# lower=False,
# word_vocab_opt=VocabularyOption(min_freq=0),
# )
data = Conll2003DataLoader(task='ner', encoding_type=encoding_type).process(
paths=get_path('workdir/datasets/conll03'),
lower=False, word_vocab_opt=VocabularyOption(min_freq=0)
)

# char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
# kernel_sizes=[3])

print('loading embedding')
word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
model_dir_or_name='en-glove-840b-300',
requires_grad=True)
return data, [word_embed]

data, embeds = load_data()
print(data)
print(data.datasets['train'][0])
print(list(data.vocabs.keys()))

for ds in data.datasets.values():
ds.rename_field('cap_words', 'chars')
ds.set_input('chars')
# for ds in data.datasets.values():
# ds.rename_field('cap_words', 'chars')
# ds.set_input('chars')

word_embed = embeds[0]
char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
word_embed.embedding.weight.data /= word_embed.embedding.weight.data.std()

# char_embed = CNNCharEmbedding(data.vocabs['cap_words'])
char_embed = None
# for ds in data.datasets:
# ds.rename_field('')

@@ -75,13 +88,42 @@ model = IDCNN(init_embed=word_embed,
kernel_size=3,
use_crf=ops.use_crf, use_projection=True,
block_loss=True,
input_dropout=0.33, hidden_dropout=0.2, inner_dropout=0.2)
input_dropout=0.5, hidden_dropout=0.0, inner_dropout=0.0)

print(model)

callbacks = [GradientClipCallback(clip_value=ops.gradient_clip, clip_type='norm'),]
metrics = []
metrics.append(
SpanFPreRecMetric(
tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type,
pred=Const.OUTPUT, target=Const.TARGET, seq_len=Const.INPUT_LEN,
)
)

class LossMetric(MetricBase):
def __init__(self, loss=None):
super(LossMetric, self).__init__()
self._init_param_map(loss=loss)
self.total_loss = 0.0
self.steps = 0

def evaluate(self, loss):
self.total_loss += float(loss)
self.steps += 1

def get_metric(self, reset=True):
result = {'loss': self.total_loss / (self.steps + 1e-12)}
if reset:
self.total_loss = 0.0
self.steps = 0
return result

metrics.append(
LossMetric(loss=Const.LOSS)
)

optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=0)
optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=1e-4)
# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
# callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 15)))
# optimizer = SWATS(model.parameters(), verbose=True)
@@ -92,8 +134,7 @@ device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer,
sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size),
device=device, dev_data=data.datasets['dev'], batch_size=ops.batch_size,
metrics=SpanFPreRecMetric(
tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
metrics=metrics,
check_code_level=-1,
callbacks=callbacks, num_workers=2, n_epochs=ops.num_epochs)
trainer.train()
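The new `LossMetric` plugs into fastNLP's `MetricBase` so the dev-set loss is reported alongside the span F1. Stripped of the framework plumbing, the pattern it implements is a running mean that resets after each report; a standalone sketch:

```python
# Plain-Python version of the accumulate / report / reset cycle used by
# LossMetric above (no fastNLP dependency).
class RunningLoss:
    def __init__(self):
        self.total_loss = 0.0
        self.steps = 0

    def evaluate(self, loss):
        self.total_loss += float(loss)
        self.steps += 1

    def get_metric(self, reset=True):
        result = {'loss': self.total_loss / (self.steps + 1e-12)}
        if reset:
            self.total_loss, self.steps = 0.0, 0
        return result

m = RunningLoss()
for batch_loss in [0.9, 0.7, 0.5]:
    m.evaluate(batch_loss)
print(m.get_metric())   # {'loss': 0.7} (up to the 1e-12 smoothing term)
```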

reproduction/text_classification/train_dpcnn.py (+13 -10)

@@ -11,7 +11,7 @@ from reproduction.text_classification.model.dpcnn import DPCNN
from data.yelpLoader import yelpLoader
from fastNLP.core.sampler import BucketSampler
import torch.nn as nn
from fastNLP.core import LRScheduler
from fastNLP.core import LRScheduler, Callback
from fastNLP.core.const import Const as C
from fastNLP.core.vocabulary import VocabularyOption
from utils.util_init import set_rng_seeds
@@ -25,14 +25,14 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

class Config():
seed = 12345
model_dir_or_name = "dpcnn-yelp-p"
model_dir_or_name = "dpcnn-yelp-f"
embedding_grad = True
train_epoch = 30
batch_size = 100
task = "yelp_p"
task = "yelp_f"
#datadir = 'workdir/datasets/SST'
datadir = 'workdir/datasets/yelp_polarity'
# datadir = 'workdir/datasets/yelp_full'
# datadir = 'workdir/datasets/yelp_polarity'
datadir = 'workdir/datasets/yelp_full'
#datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
datafile = {"train": "train.csv", "test": "test.csv"}
lr = 1e-3
@@ -73,6 +73,8 @@ def load_data():


datainfo, embedding = load_data()
embedding.embedding.weight.data /= embedding.embedding.weight.data.std()
print(embedding.embedding.weight.mean(), embedding.embedding.weight.std())

# 2. Or directly reuse one of fastNLP's models

@@ -92,11 +94,12 @@ optimizer = SGD([param for param in model.parameters() if param.requires_grad ==
lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay)

callbacks = []
# callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
callbacks.append(
LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
ops.train_epoch * 0.8 else ops.lr * 0.1))
)

callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
# callbacks.append(
# LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
# ops.train_epoch * 0.8 else ops.lr * 0.1))
# )

# callbacks.append(
# FitlogCallback(data=datainfo.datasets, verbose=1)

