From 8df5bce93814fda4c8e616431106ed78318fc5cb Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 12 Jan 2019 19:15:20 +0800 Subject: [PATCH] fastNLP V0.3 --- .../chinese_word_segment/train_context.py | 169 ------------------ tutorials/README.md | 8 +- 2 files changed, 4 insertions(+), 173 deletions(-) delete mode 100644 reproduction/chinese_word_segment/train_context.py diff --git a/reproduction/chinese_word_segment/train_context.py b/reproduction/chinese_word_segment/train_context.py deleted file mode 100644 index b0d238b3..00000000 --- a/reproduction/chinese_word_segment/train_context.py +++ /dev/null @@ -1,169 +0,0 @@ - -from fastNLP.api.pipeline import Pipeline -from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor -from fastNLP.api.processor import SeqLenProcessor -from reproduction.chinese_word_segment.process.cws_processor import CWSCharSegProcessor -from reproduction.chinese_word_segment.process.cws_processor import CWSBMESTagProcessor -from reproduction.chinese_word_segment.process.cws_processor import Pre2Post2BigramProcessor -from reproduction.chinese_word_segment.process.cws_processor import VocabIndexerProcessor -from reproduction.chinese_word_segment.process.cws_processor import InputTargetProcessor - -from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader -from reproduction.chinese_word_segment.models.cws_model import CWSBiLSTMCRF - - -ds_name = 'msr' - -tr_filename = '/home/hyan/ctb3/train.conllx' -dev_filename = '/home/hyan/ctb3/dev.conllx' - - -reader = ConllCWSReader() - -tr_dataset = reader.load(tr_filename, cut_long_sent=True) -dev_dataset = reader.load(dev_filename) - -print("Train {}. Dev: {}".format(len(tr_dataset), len(dev_dataset))) - -# 1. 准备processor -fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence') - -char_proc = CWSCharSegProcessor('raw_sentence', 'chars_lst') -tag_proc = CWSBMESTagProcessor('raw_sentence', 'target') - -bigram_proc = Pre2Post2BigramProcessor('chars_lst', 'bigrams_lst') - -char_vocab_proc = VocabIndexerProcessor('chars_lst', new_added_filed_name='chars') -bigram_vocab_proc = VocabIndexerProcessor('bigrams_lst', new_added_filed_name='bigrams', min_freq=4) - -seq_len_proc = SeqLenProcessor('chars') - -# 2. 使用processor -fs2hs_proc(tr_dataset) - -char_proc(tr_dataset) -tag_proc(tr_dataset) -bigram_proc(tr_dataset) - -char_vocab_proc(tr_dataset) -bigram_vocab_proc(tr_dataset) -seq_len_proc(tr_dataset) - -# 2.1 处理dev_dataset -fs2hs_proc(dev_dataset) - -char_proc(dev_dataset) -tag_proc(dev_dataset) -bigram_proc(dev_dataset) - -char_vocab_proc(dev_dataset) -bigram_vocab_proc(dev_dataset) -seq_len_proc(dev_dataset) - -dev_dataset.set_input('target') -tr_dataset.set_input('target') - - -print("Finish preparing data.") - -# 3. 得到数据集可以用于训练了 -# TODO pretrain的embedding是怎么解决的? - -from torch import optim - - -tag_size = tag_proc.tag_size - -cws_model = CWSBiLSTMCRF(char_vocab_proc.get_vocab_size(), embed_dim=100, - bigram_vocab_num=bigram_vocab_proc.get_vocab_size(), - bigram_embed_dim=30, num_bigram_per_char=8, - hidden_size=200, bidirectional=True, embed_drop_p=0.3, - num_layers=1, tag_size=tag_size) -cws_model.cuda() - -num_epochs = 5 -optimizer = optim.Adagrad(cws_model.parameters(), lr=0.005) - -from fastNLP.core.trainer import Trainer -from fastNLP.core.sampler import BucketSampler -from fastNLP.core.metrics import BMESF1PreRecMetric - -metric = BMESF1PreRecMetric(target='tags') -trainer = Trainer(train_data=tr_dataset, model=cws_model, loss=None, metrics=metric, n_epochs=num_epochs, - batch_size=32, print_every=50, validate_every=-1, dev_data=dev_dataset, save_path=None, - optimizer=optimizer, check_code_level=0, metric_key='f', sampler=BucketSampler(), use_tqdm=True) - -trainer.train() - -# 4. 组装需要存下的内容 -pp = Pipeline() -pp.add_processor(fs2hs_proc) -# pp.add_processor(sp_proc) -pp.add_processor(char_proc) -pp.add_processor(tag_proc) -pp.add_processor(bigram_proc) -pp.add_processor(char_vocab_proc) -pp.add_processor(bigram_vocab_proc) -pp.add_processor(seq_len_proc) -# pp.add_processor(input_target_proc) - -# te_filename = '/hdd/fudanNLP/CWS/CWS_semiCRF/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name) -te_filename = '/home/hyan/ctb3/test.conllx' -te_dataset = reader.load(te_filename) -pp(te_dataset) - -from fastNLP.core.tester import Tester - -tester = Tester(data=te_dataset, model=cws_model, metrics=metric, batch_size=64, use_cuda=False, - verbose=1) -tester.test() -# -# batch_size = 64 -# te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False) -# pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher, type='bmes') -# print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1 * 100, -# pre * 100, -# rec * 100)) - -# TODO 这里貌似需要区分test pipeline与infer pipeline - -test_context_dict = {'pipeline': pp, - 'model': cws_model} -# torch.save(test_context_dict, 'models/test_context_crf.pkl') - - -# 5. dev的pp -# 4. 组装需要存下的内容 - -from fastNLP.api.processor import ModelProcessor -from reproduction.chinese_word_segment.process.cws_processor import BMES2OutputProcessor - -model_proc = ModelProcessor(cws_model) -output_proc = BMES2OutputProcessor(chars_field_name='chars_lst', tag_field_name='pred') - -pp = Pipeline() -pp.add_processor(fs2hs_proc) -# pp.add_processor(sp_proc) -pp.add_processor(char_proc) -pp.add_processor(bigram_proc) -char_vocab_proc.set_verbose(0) -pp.add_processor(char_vocab_proc) -bigram_vocab_proc.set_verbose(0) -pp.add_processor(bigram_vocab_proc) -pp.add_processor(seq_len_proc) - -pp.add_processor(model_proc) -pp.add_processor(output_proc) - - -# TODO 这里貌似需要区分test pipeline与infer pipeline -import torch -import datetime -now = datetime.datetime.now() -infer_context_dict = {'pipeline': pp, 'tag_proc': tag_proc} -torch.save(infer_context_dict, 'models/cws_crf_{}_{}.pkl'.format(now.month, now.day)) - - -# TODO 还需要考虑如何替换回原文的问题? -# 1. 不需要将特殊tag替换 -# 2. 需要将特殊tag替换回去 \ No newline at end of file diff --git a/tutorials/README.md b/tutorials/README.md index eafd2ad9..1de342e6 100644 --- a/tutorials/README.md +++ b/tutorials/README.md @@ -1,12 +1,12 @@ # fastNLP 教程 ### 上手教程 Quick Start -- 一分钟上手:`fastnlp_1min_tutorial.ipynb` ![Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/fastnlp_1min_tutorial.ipynb) -- 十分钟上手:`fastnlp_10min_tutorial.ipynb` ![Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/fastnlp_10min_tutorial.ipynb) +- 一分钟上手:`fastnlp_1min_tutorial.ipynb` [Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/fastnlp_1min_tutorial.ipynb) +- 十分钟上手:`fastnlp_10min_tutorial.ipynb` [Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/fastnlp_10min_tutorial.ipynb) ### 进阶教程 Advanced Tutorial -- `fastnlp_advanced_tutorial/advance_tutorial.ipynb` ![Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb) +- `fastnlp_advanced_tutorial/advance_tutorial.ipynb` [Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb) ### 开发者指南 Developer Guide -- `tutorial_for_developer.md` ![Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/tutorial_for_developer.md) +- `tutorial_for_developer.md` [Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/tutorial_for_developer.md)