|
- from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
- from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
- from fastNLP.core.callback import FitlogCallback, LRScheduler
- from fastNLP import GradientClipCallback
- from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
- from torch.optim import SGD, Adam
- from fastNLP import Const
- from fastNLP import RandomSampler, BucketSampler
- from fastNLP import SpanFPreRecMetric
- from fastNLP import Trainer
- from fastNLP.core.metrics import MetricBase
- from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
- from fastNLP.core.utils import Option
- from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
- from fastNLP.core.utils import cache_results
- from fastNLP.core.vocabulary import VocabularyOption
- import sys
- import torch.cuda
- import os
# Process-wide environment configuration, applied at import time before any
# data or embedding is loaded.
# NOTE(review): the FASTNLP_* values are site-specific (an internal host and
# a home directory on a shared filesystem); presumably fastNLP reads them to
# locate downloads and its cache -- confirm before running elsewhere.
os.environ.update({
    'FASTNLP_BASE_URL': 'http://10.141.222.118:8888/file/download/',
    'FASTNLP_CACHE_DIR': '/remote-home/hyan01/fastnlp_caches',
    # Number CUDA devices by PCI bus position.
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
})

# Tagging scheme shared by the data loader and the span-level F1 metric.
encoding_type = 'bioes'
-
-
def get_path(path):
    """Return *path* resolved against the current user's home directory.

    The ``HOME`` environment variable is used when it is set and non-empty.
    Otherwise the home directory is obtained from ``os.path.expanduser('~')``
    so that the lookup does not raise ``KeyError`` on platforms or service
    environments where ``HOME`` is not defined.

    Args:
        path: Path relative to the home directory. An absolute path is
            returned unchanged, following ``os.path.join`` semantics.

    Returns:
        The joined path as a string.
    """
    home_dir = os.environ.get('HOME') or os.path.expanduser('~')
    return os.path.join(home_dir, path)
-
-
# Hyper-parameters for this run, collected in one place and exposed through
# attribute access on ``ops``.
_hyper_params = dict(
    # Optimisation / training loop.
    batch_size=128,
    num_epochs=100,
    lr=5e-4,
    # IDCNN architecture (forwarded to the model constructor).
    repeats=3,
    num_layers=3,
    num_filters=400,
    use_crf=False,
    # Threshold handed to the gradient-clipping callback.
    gradient_clip=5,
)
ops = Option(**_hyper_params)
-
# The cache name identifies the dataset that is actually loaded (CoNLL-2003).
# The previous name referred to OntoNotes, so a cache file left over from an
# OntoNotes run could be returned here in place of the CoNLL data.
@cache_results('conll03-min_freq0-case-cache')
def load_data():
    """Load the CoNLL-2003 NER data and the pretrained word embedding.

    The dataset is read from ``workdir/datasets/conll03`` under the user's
    home directory, keeping the original casing (``lower=False``) and using a
    word vocabulary built with ``min_freq=0``. The embedding is a
    ``StaticEmbedding`` initialised from ``en-glove-840b-300`` over the input
    vocabulary, with ``requires_grad=True``.

    The result is wrapped by ``cache_results``, which is expected to persist
    the return value under the given cache name and reuse it on later runs.

    To train on OntoNotes instead, swap in ``OntoNoteNERDataLoader`` pointed
    at ``workdir/datasets/ontonotes-v4`` and give the cache a different name.

    Returns:
        A ``(data, [word_embed])`` tuple: the processed data bundle and a
        one-element list holding the word embedding.
    """
    print('loading data')
    loader = Conll2003DataLoader(task='ner', encoding_type=encoding_type)
    data = loader.process(
        paths=get_path('workdir/datasets/conll03'),
        lower=False,
        word_vocab_opt=VocabularyOption(min_freq=0),
    )

    print('loading embedding')
    word_embed = StaticEmbedding(
        vocab=data.vocabs[Const.INPUT],
        model_dir_or_name='en-glove-840b-300',
        requires_grad=True,
    )
    return data, [word_embed]
-
# Obtain the data bundle and its embeddings, then print a short summary
# (bundle, first training instance, vocabulary names) as a sanity check.
data, embeds = load_data()
print(data)
print(data.datasets['train'][0])
print(list(data.vocabs.keys()))

# Rescale the pretrained word vectors in place so that the embedding matrix
# as a whole has unit standard deviation.
word_embed = embeds[0]
_word_weights = word_embed.embedding.weight.data
_word_weights.div_(_word_weights.std())

# Character-level embedding is disabled for this run; the model receives None.
char_embed = None

# Show the tag -> index mapping used for the targets.
print(data.vocabs[Const.TARGET].word2idx)
-
# Build the IDCNN tagger. Architecture sizes come from ``ops``; the number of
# output classes is the size of the target vocabulary.
model = IDCNN(
    init_embed=word_embed,
    char_embed=char_embed,
    num_cls=len(data.vocabs[Const.TARGET]),
    repeats=ops.repeats,
    num_layers=ops.num_layers,
    num_filters=ops.num_filters,
    kernel_size=3,
    use_crf=ops.use_crf,
    use_projection=True,
    block_loss=True,
    # Dropout is applied to the input only; hidden and inner rates are zero.
    input_dropout=0.5,
    hidden_dropout=0.0,
    inner_dropout=0.0,
)

print(model)
-
# Training callbacks: clip gradients by norm at the configured threshold.
callbacks = [
    GradientClipCallback(clip_value=ops.gradient_clip, clip_type='norm'),
]

# Evaluation metrics, starting with span-level precision/recall/F1 computed
# from the model output, the gold tags and the sequence lengths.
metrics = [
    SpanFPreRecMetric(
        tag_vocab=data.vocabs[Const.TARGET],
        encoding_type=encoding_type,
        pred=Const.OUTPUT,
        target=Const.TARGET,
        seq_len=Const.INPUT_LEN,
    ),
]
-
class LossMetric(MetricBase):
    """Metric that reports the mean of the loss values it has been fed.

    Each ``evaluate`` call adds one loss value to a running total and counts
    one step; ``get_metric`` returns the total divided by the step count.
    """

    def __init__(self, loss=None):
        """Create the metric.

        Args:
            loss: Name forwarded to ``_init_param_map`` for the ``loss``
                argument of ``evaluate`` (presumably the field the loss is
                read from -- confirm against MetricBase).
        """
        super().__init__()
        self._init_param_map(loss=loss)
        # Running sum of losses and number of evaluate() calls so far.
        self.total_loss, self.steps = 0.0, 0

    def evaluate(self, loss):
        """Accumulate one loss value, converted with ``float``."""
        self.steps += 1
        self.total_loss += float(loss)

    def get_metric(self, reset=True):
        """Return ``{'loss': mean_loss}`` and optionally clear the counters.

        Args:
            reset: When true, zero the running total and step count after
                the result has been computed.
        """
        # The small epsilon keeps the division defined when no step was seen.
        mean_loss = self.total_loss / (self.steps + 1e-12)
        if reset:
            self.total_loss, self.steps = 0.0, 0
        return {'loss': mean_loss}
-
# Track the loss alongside the span metric during evaluation.
metrics.append(LossMetric(loss=Const.LOSS))

# Adam with weight decay; no learning-rate scheduler is added here.
optimizer = Adam(model.parameters(), lr=ops.lr, weight_decay=1e-4)

# Use the first GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Assemble the trainer and start training. The dev split is passed as
# ``dev_data`` together with the metrics collected above.
trainer = Trainer(
    train_data=data.datasets['train'],
    model=model,
    optimizer=optimizer,
    sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size),
    device=device,
    dev_data=data.datasets['dev'],
    batch_size=ops.batch_size,
    metrics=metrics,
    check_code_level=-1,
    callbacks=callbacks,
    num_workers=2,
    n_epochs=ops.num_epochs,
)
trainer.train()
|