From dc0124cf028503cb3ca5ec4f825c3cc3c70e3a34 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sat, 10 Nov 2018 11:10:14 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9model=E5=88=B0models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{model => models}/__init__.py | 0 .../{model => models}/cws_model.py | 0 .../chinese_word_segment/train_context.py | 21 ++++++++++++------- 3 files changed, 14 insertions(+), 7 deletions(-) rename reproduction/chinese_word_segment/{model => models}/__init__.py (100%) rename reproduction/chinese_word_segment/{model => models}/cws_model.py (100%) diff --git a/reproduction/chinese_word_segment/model/__init__.py b/reproduction/chinese_word_segment/models/__init__.py similarity index 100% rename from reproduction/chinese_word_segment/model/__init__.py rename to reproduction/chinese_word_segment/models/__init__.py diff --git a/reproduction/chinese_word_segment/model/cws_model.py b/reproduction/chinese_word_segment/models/cws_model.py similarity index 100% rename from reproduction/chinese_word_segment/model/cws_model.py rename to reproduction/chinese_word_segment/models/cws_model.py diff --git a/reproduction/chinese_word_segment/train_context.py b/reproduction/chinese_word_segment/train_context.py index de6513d3..c44294ee 100644 --- a/reproduction/chinese_word_segment/train_context.py +++ b/reproduction/chinese_word_segment/train_context.py @@ -3,11 +3,17 @@ from fastNLP.core.instance import Instance from fastNLP.core.dataset import DataSet from fastNLP.api.pipeline import Pipeline from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor - -from reproduction.chinese_word_segment.process.cws_processor import * -from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter +from fastNLP.api.processor import IndexerProcessor +from reproduction.chinese_word_segment.process.cws_processor import SpeicalSpanProcessor +from reproduction.chinese_word_segment.process.cws_processor import CWSCharSegProcessor +from reproduction.chinese_word_segment.process.cws_processor import CWSSegAppTagProcessor +from reproduction.chinese_word_segment.process.cws_processor import Pre2Post2BigramProcessor +from reproduction.chinese_word_segment.process.cws_processor import VocabProcessor + +from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter +from reproduction.chinese_word_segment.process.span_converter import DigitSpanConverter from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader - +from reproduction.chinese_word_segment.models.cws_model import CWSBiLSTMSegApp tr_filename = '' dev_filename = '' @@ -60,8 +66,8 @@ bigram_proc(tr_dataset) char_vocab_proc(tr_dataset) bigram_vocab_proc(tr_dataset) -char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list') -bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list') +char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'indexed_chars_list') +bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list','indexed_bigrams_list') char_index_proc(tr_dataset) bigram_index_proc(tr_dataset) @@ -81,7 +87,8 @@ bigram_index_proc(dev_dataset) # 3. 得到数据集可以用于训练了 # TODO pretrain的embedding是怎么解决的? - +cws_model = CWSBiLSTMSegApp(vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None, + hidden_size=200, bidirectional=True, embed_drop_p=None, num_layers=1, tag_size=2)