|
@@ -3,11 +3,17 @@ from fastNLP.core.instance import Instance |
|
|
from fastNLP.core.dataset import DataSet |
|
|
from fastNLP.core.dataset import DataSet |
|
|
from fastNLP.api.pipeline import Pipeline |
|
|
from fastNLP.api.pipeline import Pipeline |
|
|
from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor |
|
|
from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor |
|
|
|
|
|
|
|
|
from reproduction.chinese_word_segment.process.cws_processor import * |
|
|
|
|
|
from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter |
|
|
|
|
|
|
|
|
from fastNLP.api.processor import IndexerProcessor |
|
|
|
|
|
from reproduction.chinese_word_segment.process.cws_processor import SpeicalSpanProcessor |
|
|
|
|
|
from reproduction.chinese_word_segment.process.cws_processor import CWSCharSegProcessor |
|
|
|
|
|
from reproduction.chinese_word_segment.process.cws_processor import CWSSegAppTagProcessor |
|
|
|
|
|
from reproduction.chinese_word_segment.process.cws_processor import Pre2Post2BigramProcessor |
|
|
|
|
|
from reproduction.chinese_word_segment.process.cws_processor import VocabProcessor |
|
|
|
|
|
|
|
|
|
|
|
from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter |
|
|
|
|
|
from reproduction.chinese_word_segment.process.span_converter import DigitSpanConverter |
|
|
from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader |
|
|
from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader |
|
|
|
|
|
|
|
|
|
|
|
from reproduction.chinese_word_segment.models.cws_model import CWSBiLSTMSegApp |
|
|
|
|
|
|
|
|
tr_filename = '' |
|
|
tr_filename = '' |
|
|
dev_filename = '' |
|
|
dev_filename = '' |
|
@@ -60,8 +66,8 @@ bigram_proc(tr_dataset) |
|
|
char_vocab_proc(tr_dataset) |
|
|
char_vocab_proc(tr_dataset) |
|
|
bigram_vocab_proc(tr_dataset) |
|
|
bigram_vocab_proc(tr_dataset) |
|
|
|
|
|
|
|
|
char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list') |
|
|
|
|
|
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list') |
|
|
|
|
|
|
|
|
char_index_proc = IndexerProcessor(char_vocab_proc.get_vocab(), 'chars_list', 'indexed_chars_list') |
|
|
|
|
|
bigram_index_proc = IndexerProcessor(bigram_vocab_proc.get_vocab(), 'bigrams_list','indexed_bigrams_list') |
|
|
|
|
|
|
|
|
char_index_proc(tr_dataset) |
|
|
char_index_proc(tr_dataset) |
|
|
bigram_index_proc(tr_dataset) |
|
|
bigram_index_proc(tr_dataset) |
|
@@ -81,7 +87,8 @@ bigram_index_proc(dev_dataset) |
|
|
|
|
|
|
|
|
# 3. 得到数据集可以用于训练了 |
|
|
# 3. 得到数据集可以用于训练了 |
|
|
# TODO pretrain的embedding是怎么解决的? |
|
|
# TODO pretrain的embedding是怎么解决的? |
|
|
|
|
|
|
|
|
|
|
|
cws_model = CWSBiLSTMSegApp(vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None, |
|
|
|
|
|
hidden_size=200, bidirectional=True, embed_drop_p=None, num_layers=1, tag_size=2) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|