diff --git a/reproduction/chinese_word_segment/cws.cfg b/reproduction/Chinese_word_segmentation/cws.cfg
similarity index 100%
rename from reproduction/chinese_word_segment/cws.cfg
rename to reproduction/Chinese_word_segmentation/cws.cfg
diff --git a/reproduction/chinese_word_segment/cws_io/__init__.py b/reproduction/Chinese_word_segmentation/cws_io/__init__.py
similarity index 100%
rename from reproduction/chinese_word_segment/cws_io/__init__.py
rename to reproduction/Chinese_word_segmentation/cws_io/__init__.py
diff --git a/reproduction/chinese_word_segment/cws_io/cws_reader.py b/reproduction/Chinese_word_segmentation/cws_io/cws_reader.py
similarity index 100%
rename from reproduction/chinese_word_segment/cws_io/cws_reader.py
rename to reproduction/Chinese_word_segmentation/cws_io/cws_reader.py
diff --git a/reproduction/chinese_word_segment/models/__init__.py b/reproduction/Chinese_word_segmentation/models/__init__.py
similarity index 100%
rename from reproduction/chinese_word_segment/models/__init__.py
rename to reproduction/Chinese_word_segmentation/models/__init__.py
diff --git a/reproduction/chinese_word_segment/models/cws_model.py b/reproduction/Chinese_word_segmentation/models/cws_model.py
similarity index 98%
rename from reproduction/chinese_word_segment/models/cws_model.py
rename to reproduction/Chinese_word_segmentation/models/cws_model.py
index c6cf6746..daefc380 100644
--- a/reproduction/chinese_word_segment/models/cws_model.py
+++ b/reproduction/Chinese_word_segmentation/models/cws_model.py
@@ -1,11 +1,11 @@
-from torch import nn
 import torch
-import torch.nn.functional as F
+from torch import nn
 
-from fastNLP.modules.decoder.MLP import MLP
 from fastNLP.models.base_model import BaseModel
-from reproduction.chinese_word_segment.utils import seq_lens_to_mask
+from fastNLP.modules.decoder.MLP import MLP
+from reproduction.Chinese_word_segmentation.utils import seq_lens_to_mask
+
 
 class CWSBiLSTMEncoder(BaseModel):
     def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,
diff --git a/reproduction/chinese_word_segment/process/__init__.py b/reproduction/Chinese_word_segmentation/process/__init__.py
similarity index 100%
rename from reproduction/chinese_word_segment/process/__init__.py
rename to reproduction/Chinese_word_segmentation/process/__init__.py
diff --git a/reproduction/chinese_word_segment/process/cws_processor.py b/reproduction/Chinese_word_segmentation/process/cws_processor.py
similarity index 99%
rename from reproduction/chinese_word_segment/process/cws_processor.py
rename to reproduction/Chinese_word_segmentation/process/cws_processor.py
index be6ca6b1..614d9ef5 100644
--- a/reproduction/chinese_word_segment/process/cws_processor.py
+++ b/reproduction/Chinese_word_segmentation/process/cws_processor.py
@@ -4,7 +4,7 @@ import re
 
 from fastNLP.api.processor import Processor
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.vocabulary import Vocabulary
-from reproduction.chinese_word_segment.process.span_converter import SpanConverter
+from reproduction.Chinese_word_segmentation.process.span_converter import SpanConverter
 
 _SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
diff --git a/reproduction/chinese_word_segment/process/span_converter.py b/reproduction/Chinese_word_segmentation/process/span_converter.py
similarity index 100%
rename from reproduction/chinese_word_segment/process/span_converter.py
rename to reproduction/Chinese_word_segmentation/process/span_converter.py
diff --git a/reproduction/chinese_word_segment/utils.py b/reproduction/Chinese_word_segmentation/utils.py
similarity index 100%
rename from reproduction/chinese_word_segment/utils.py
rename to reproduction/Chinese_word_segmentation/utils.py
diff --git a/reproduction/pos_tag_model/pos_processor.py b/reproduction/POS_tagging/pos_processor.py
similarity index 100%
rename from reproduction/pos_tag_model/pos_processor.py
rename to reproduction/POS_tagging/pos_processor.py
diff --git a/reproduction/pos_tag_model/pos_reader.py b/reproduction/POS_tagging/pos_reader.py
similarity index 100%
rename from reproduction/pos_tag_model/pos_reader.py
rename to reproduction/POS_tagging/pos_reader.py
diff --git a/reproduction/pos_tag_model/pos_tag.cfg b/reproduction/POS_tagging/pos_tag.cfg
similarity index 100%
rename from reproduction/pos_tag_model/pos_tag.cfg
rename to reproduction/POS_tagging/pos_tag.cfg
diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/POS_tagging/train_pos_tag.py
similarity index 100%
rename from reproduction/pos_tag_model/train_pos_tag.py
rename to reproduction/POS_tagging/train_pos_tag.py
diff --git a/reproduction/pos_tag_model/utils.py b/reproduction/POS_tagging/utils.py
similarity index 100%
rename from reproduction/pos_tag_model/utils.py
rename to reproduction/POS_tagging/utils.py
diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py
deleted file mode 100644
index e7804bae..00000000
--- a/reproduction/chinese_word_segment/run.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import os
-import sys
-
-sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
-
-from fastNLP.io.config_io import ConfigLoader, ConfigSection
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.io.dataset_loader import BaseLoader, TokenizeDataSetLoader
-from fastNLP.core.utils import load_pickle
-from fastNLP.io.model_io import ModelLoader, ModelSaver
-from fastNLP.core.tester import SeqLabelTester
-from fastNLP.models.sequence_modeling import AdvSeqLabel
-from fastNLP.core.predictor import SeqLabelInfer
-from fastNLP.core.utils import save_pickle
-from fastNLP.core.metrics import SeqLabelEvaluator
-
-# not in the file's dir
-if len(os.path.dirname(__file__)) != 0:
-    os.chdir(os.path.dirname(__file__))
-datadir = "/home/zyfeng/data/"
-cfgfile = './cws.cfg'
-
-cws_data_path = os.path.join(datadir, "pku_training.utf8")
-pickle_path = "save"
-data_infer_path = os.path.join(datadir, "infer.utf8")
-
-
-def infer():
-    # Config Loader
-    test_args = ConfigSection()
-    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})
-
-    # fetch dictionary size and number of labels from pickle files
-    word2index = load_pickle(pickle_path, "word2id.pkl")
-    test_args["vocab_size"] = len(word2index)
-    index2label = load_pickle(pickle_path, "label2id.pkl")
-    test_args["num_classes"] = len(index2label)
-
-    # Define the same model
-    model = AdvSeqLabel(test_args)
-
-    try:
-        ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
-        print('model loaded!')
-    except Exception as e:
-        print('cannot load model!')
-        raise
-
-    # Data Loader
-    infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
-    infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
-    print('data loaded')
-
-    # Inference interface
-    infer = SeqLabelInfer(pickle_path)
-    results = infer.predict(model, infer_data)
-
-    print(results)
-    print("Inference finished!")
-
-
-def train():
-    # Config Loader
-    train_args = ConfigSection()
-    test_args = ConfigSection()
-    ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})
-
-    print("loading data set...")
-    data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
-    data.load(cws_data_path)
-    data_train, data_dev = data.split(ratio=0.3)
-    train_args["vocab_size"] = len(data.word_vocab)
-    train_args["num_classes"] = len(data.label_vocab)
-    print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))
-
-    change_field_is_target(data_dev, "truth", True)
-    save_pickle(data_dev, "./save/", "data_dev.pkl")
-    save_pickle(data.word_vocab, "./save/", "word2id.pkl")
-    save_pickle(data.label_vocab, "./save/", "label2id.pkl")
-
-    # Trainer
-    trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"],
-                              validate=train_args["validate"],
-                              use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"],
-                              save_best_dev=True, print_every_step=10, model_name="trained_model.pkl",
-                              evaluator=SeqLabelEvaluator())
-
-    # Model
-    model = AdvSeqLabel(train_args)
-    try:
-        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
-        print('model parameter loaded!')
-    except Exception as e:
-        print("No saved model. Continue.")
-        pass
-
-    # Start training
-    trainer.train(model, data_train, data_dev)
-    print("Training finished!")
-
-    # Saver
-    saver = ModelSaver("./save/trained_model.pkl")
-    saver.save_pytorch(model)
-    print("Model saved!")
-
-
-def predict():
-    # Config Loader
-    test_args = ConfigSection()
-    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})
-
-    # fetch dictionary size and number of labels from pickle files
-    word2index = load_pickle(pickle_path, "word2id.pkl")
-    test_args["vocab_size"] = len(word2index)
-    index2label = load_pickle(pickle_path, "label2id.pkl")
-    test_args["num_classes"] = len(index2label)
-
-    # load dev data
-    dev_data = load_pickle(pickle_path, "data_dev.pkl")
-
-    # Define the same model
-    model = AdvSeqLabel(test_args)
-
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
-    print("model loaded!")
-
-    # Tester
-    test_args["evaluator"] = SeqLabelEvaluator()
-    tester = SeqLabelTester(**test_args.data)
-
-    # Start testing
-    tester.test(model, dev_data)
-
-
-if __name__ == "__main__":
-
-    import argparse
-
-    parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
-    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
-    args = parser.parse_args()
-    if args.mode == 'train':
-        train()
-    elif args.mode == 'test':
-        predict()
-    elif args.mode == 'infer':
-        infer()
-    else:
-        print('no mode specified for model!')
-        parser.print_help()
diff --git a/test/core/test_batch.py b/test/core/test_batch.py
index 08d803f1..1c4b22f8 100644
--- a/test/core/test_batch.py
+++ b/test/core/test_batch.py
@@ -1,6 +1,7 @@
 import unittest
 
 import numpy as np
+import torch
 
 from fastNLP.core.batch import Batch
 from fastNLP.core.dataset import DataSet
@@ -31,3 +32,47 @@ class TestCase1(unittest.TestCase):
         self.assertEqual(len(y["y"]), 4)
         self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
         self.assertListEqual(list(y["y"][-1]), [5, 6])
+
+    def test_list_padding(self):
+        ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
+                      "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
+        for x, y in iter:
+            self.assertEqual(x["x"].shape, (4, 4))
+            self.assertEqual(y["y"].shape, (4, 4))
+
+    def test_numpy_padding(self):
+        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
+                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
+        for x, y in iter:
+            self.assertEqual(x["x"].shape, (4, 4))
+            self.assertEqual(y["y"].shape, (4, 4))
+
+    def test_list_to_tensor(self):
+        ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
+                      "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
+        for x, y in iter:
+            self.assertTrue(isinstance(x["x"], torch.Tensor))
+            self.assertEqual(tuple(x["x"].shape), (4, 4))
+            self.assertTrue(isinstance(y["y"], torch.Tensor))
+            self.assertEqual(tuple(y["y"].shape), (4, 4))
+
+    def test_numpy_to_tensor(self):
+        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
+                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
+        ds.set_input("x")
+        ds.set_target("y")
+        iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
+        for x, y in iter:
+            self.assertTrue(isinstance(x["x"], torch.Tensor))
+            self.assertEqual(tuple(x["x"].shape), (4, 4))
+            self.assertTrue(isinstance(y["y"], torch.Tensor))
+            self.assertEqual(tuple(y["y"].shape), (4, 4))
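
Note: the four new tests pin down Batch's padding contract: list and numpy fields are padded per batch to the longest sequence in that batch, and as_numpy switches the output between np.ndarray and torch.Tensor. Below is a minimal standalone sketch of that behavior outside unittest. The Batch and DataSet import paths are taken from the hunks above; the SequentialSampler path is an assumption, since its import is not visible in this diff.

    # Sketch of the padding behavior covered by the new tests.
    # Assumed import path for SequentialSampler (not shown in the diff hunks).
    from fastNLP.core.batch import Batch
    from fastNLP.core.dataset import DataSet
    from fastNLP.core.sampler import SequentialSampler

    # Variable-length sequences: within each batch of 4, every field is
    # padded to the longest sequence in that batch (length 4 here).
    ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
                  "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
    ds.set_input("x")   # fields marked as input are yielded in batch_x
    ds.set_target("y")  # fields marked as target are yielded in batch_y

    # as_numpy=True yields padded np.ndarray batches; as_numpy=False yields
    # torch.Tensor batches, which is what the *_to_tensor tests assert.
    for batch_x, batch_y in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True):
        assert batch_x["x"].shape == (4, 4)
        assert batch_y["y"].shape == (4, 4)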