| @@ -3,11 +3,17 @@ from fastNLP.core.instance import Instance | |||||
| from fastNLP.core.dataset import DataSet | from fastNLP.core.dataset import DataSet | ||||
| from fastNLP.api.pipeline import Pipeline | from fastNLP.api.pipeline import Pipeline | ||||
| from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor | from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor | ||||
| from reproduction.chinese_word_segment.process.cws_processor import * | |||||
| from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter | |||||
| from fastNLP.api.processor import IndexerProcessor | |||||
| from reproduction.chinese_word_segment.process.cws_processor import SpeicalSpanProcessor | |||||
| from reproduction.chinese_word_segment.process.cws_processor import CWSCharSegProcessor | |||||
| from reproduction.chinese_word_segment.process.cws_processor import CWSSegAppTagProcessor | |||||
| from reproduction.chinese_word_segment.process.cws_processor import Pre2Post2BigramProcessor | |||||
| from reproduction.chinese_word_segment.process.cws_processor import VocabProcessor | |||||
| from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter | |||||
| from reproduction.chinese_word_segment.process.span_converter import DigitSpanConverter | |||||
| from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader | from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader | ||||
| from reproduction.chinese_word_segment.models.cws_model import CWSBiLSTMSegApp | |||||
| tr_filename = '' | tr_filename = '' | ||||
| dev_filename = '' | dev_filename = '' | ||||
| @@ -60,8 +66,8 @@ bigram_proc(tr_dataset) | |||||
| char_vocab_proc(tr_dataset) | char_vocab_proc(tr_dataset) | ||||
| bigram_vocab_proc(tr_dataset) | bigram_vocab_proc(tr_dataset) | ||||
| char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list') | |||||
| bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list') | |||||
| char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'indexed_chars_list') | |||||
| bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list','indexed_bigrams_list') | |||||
| char_index_proc(tr_dataset) | char_index_proc(tr_dataset) | ||||
| bigram_index_proc(tr_dataset) | bigram_index_proc(tr_dataset) | ||||
| @@ -81,7 +87,8 @@ bigram_index_proc(dev_dataset) | |||||
| # 3. The dataset is now ready for training | # 3. The dataset is now ready for training | ||||
| # TODO: how should pretrained embeddings be handled? | # TODO: how should pretrained embeddings be handled? | ||||
| cws_model = CWSBiLSTMSegApp(vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None, | |||||
| hidden_size=200, bidirectional=True, embed_drop_p=None, num_layers=1, tag_size=2) | |||||