- add pad sequence for lstm

- add csv, conll, json filereader - update dataloader - remove useless dataloader - fix trainer loss print - fix tests
6 years ago · c344f7a2f9
--- a/fastNLP/api/api.py
+++ b/fastNLP/api/api.py
@@ -9,7 +9,7 @@ from fastNLP.core.dataset import DataSet

 from fastNLP.api.utils import load_url
 from fastNLP.api.processor import ModelProcessor
 from fastNLP.io.dataset_loader import ConllCWSReader, ConllxDataLoader
 from fastNLP.io.dataset_loader import cut_long_sentence, ConllLoader
 from fastNLP.core.instance import Instance
 from fastNLP.api.pipeline import Pipeline
 from fastNLP.core.metrics import SpanFPreRecMetric
@@ -23,6 +23,85 @@ model_urls = {
 }


 class ConllCWSReader(object):
    """Deprecated. Use ConllLoader for all types of conll-format files."""
    def __init__(self):
        pass

    def load(self, path, cut_long_sent=False):
        """
        返回的DataSet只包含raw_sentence这个field，内容为str。
        假定了输入为conll的格式，以空行隔开两个句子，每行共7列，即
        ::

            1	编者按	编者按	NN	O	11	nmod:topic
            2	：	：	PU	O	11	punct
            3	7月	7月	NT	DATE	4	compound:nn
            4	12日	12日	NT	DATE	11	nmod:tmod
            5	，	，	PU	O	11	punct

            1	这	这	DT	O	3	det
            2	款	款	M	O	1	mark:clf
            3	飞行	飞行	NN	O	8	nsubj
            4	从	从	P	O	5	case
            5	外型	外型	NN	O	8	nmod:prep

        """
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.strip().split())
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet()
        for sample in datalist:
            # print(sample)
            res = self.get_char_lst(sample)
            if res is None:
                continue
            line = ' '.join(res)
            if cut_long_sent:
                sents = cut_long_sentence(line)
            else:
                sents = [line]
            for raw_sentence in sents:
                ds.append(Instance(raw_sentence=raw_sentence))
        return ds

    def get_char_lst(self, sample):
        if len(sample) == 0:
            return None
        text = []
        for w in sample:
            t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
            if t3 == '_':
                return None
            text.append(t1)
        return text

 class ConllxDataLoader(ConllLoader):
    """返回“词级别”的标签信息，包括词、词性、（句法）头依赖、（句法）边标签。跟``ZhConllPOSReader``完全不同。

        Deprecated. Use ConllLoader for all types of conll-format files.
    """
    def __init__(self):
        headers = [
            'words', 'pos_tags', 'heads', 'labels',
        ]
        indexs = [
            1, 3, 6, 7,
        ]
        super(ConllxDataLoader, self).__init__(headers=headers, indexs=indexs)


 class API:
    def __init__(self):
        self.pipeline = None
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -373,6 +373,9 @@ class DataSet(object):
        :return dataset: the read data set

        """
        import warnings
        warnings.warn('read_csv is deprecated, use CSVLoader instead',
                      category=DeprecationWarning)
        with open(csv_path, "r") as f:
            start_idx = 0
            if headers is None:
@@ -398,9 +401,6 @@ class DataSet(object):
                    _dict[header].append(content)
        return cls(_dict)

    # def read_pos(self):
    #     return DataLoaderRegister.get_reader('read_pos')

    def save(self, path):
        """Save the DataSet object as pickle.

--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -268,8 +268,9 @@ class Trainer(object):
                    self.callback_manager.on_step_end()

                    if self.step % self.print_every == 0:
                        avg_loss = float(avg_loss) / self.print_every
                        if self.use_tqdm:
                            print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every)
                            print_output = "loss:{0:<6.5f}".format(avg_loss)
                            pbar.update(self.print_every)
                        else:
                            end = time.time()
--- a/fastNLP/io/dataset_loader.py
+++ b/fastNLP/io/dataset_loader.py
@@ -1,71 +1,13 @@
 import os
 import json
 from nltk.tree import Tree

 from fastNLP.core.dataset import DataSet
 from fastNLP.core.instance import Instance
 from fastNLP.io.base_loader import DataLoaderRegister
 from fastNLP.io.file_reader import read_csv, read_json, read_conll


 def convert_seq_dataset(data):
    """Create an DataSet instance that contains no labels.

    :param data: list of list of strings, [num_examples, *].
            Example::

                [
                    [word_11, word_12, ...],
                    ...
                ]

    :return: a DataSet.
    """
    dataset = DataSet()
    for word_seq in data:
        dataset.append(Instance(word_seq=word_seq))
    return dataset


 def convert_seq2tag_dataset(data):
    """Convert list of data into DataSet.

    :param data: list of list of strings, [num_examples, *].
            Example::

                [
                    [ [word_11, word_12, ...], label_1 ],
                    [ [word_21, word_22, ...], label_2 ],
                    ...
                ]

    :return: a DataSet.
    """
    dataset = DataSet()
    for sample in data:
        dataset.append(Instance(word_seq=sample[0], label=sample[1]))
    return dataset


 def convert_seq2seq_dataset(data):
    """Convert list of data into DataSet.

    :param data: list of list of strings, [num_examples, *].
            Example::

                [
                    [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                    [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                    ...
                ]

    :return: a DataSet.
    """
    dataset = DataSet()
    for sample in data:
        dataset.append(Instance(word_seq=sample[0], label_seq=sample[1]))
    return dataset


 def download_from_url(url, path):
 def _download_from_url(url, path):
    from tqdm import tqdm
    import requests

@@ -81,7 +23,7 @@ def download_from_url(url, path):
                t.update(len(chunk))
    return

 def uncompress(src, dst):
 def _uncompress(src, dst):
    import zipfile, gzip, tarfile, os

    def unzip(src, dst):
@@ -134,241 +76,6 @@ class DataSetLoader:
        raise NotImplementedError


 class NativeDataSetLoader(DataSetLoader):
    """A simple example of DataSetLoader

    """

    def __init__(self):
        super(NativeDataSetLoader, self).__init__()

    def load(self, path):
        ds = DataSet.read_csv(path, headers=("raw_sentence", "label"), sep="\t")
        ds.set_input("raw_sentence")
        ds.set_target("label")
        return ds


 DataLoaderRegister.set_reader(NativeDataSetLoader, 'read_naive')


 class RawDataSetLoader(DataSetLoader):
    """A simple example of raw data reader

    """

    def __init__(self):
        super(RawDataSetLoader, self).__init__()

    def load(self, data_path, split=None):
        with open(data_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        lines = lines if split is None else [l.split(split) for l in lines]
        lines = list(filter(lambda x: len(x) > 0, lines))
        return self.convert(lines)

    def convert(self, data):
        return convert_seq_dataset(data)


 DataLoaderRegister.set_reader(RawDataSetLoader, 'read_rawdata')


 class DummyPOSReader(DataSetLoader):
    """A simple reader for a dummy POS tagging dataset.

    In these datasets, each line are divided by "\t". The first Col is the vocabulary and the second
    Col is the label. Different sentence are divided by an empty line.
        E.g::

            Tom label1
            and label2
            Jerry   label1
            .   label3
            (separated by an empty line)
            Hello   label4
            world   label5
            !   label3

    In this example, there are two sentences "Tom and Jerry ." and "Hello world !". Each word has its own label.
    """

    def __init__(self):
        super(DummyPOSReader, self).__init__()

    def load(self, data_path):
        """
        :return data: three-level list
            Example::
                [
                    [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                    [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                    ...
                ]
        """
        with open(data_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        data = self.parse(lines)
        return self.convert(data)

    @staticmethod
    def parse(lines):
        data = []
        sentence = []
        for line in lines:
            line = line.strip()
            if len(line) > 1:
                sentence.append(line.split('\t'))
            else:
                words = []
                labels = []
                for tokens in sentence:
                    words.append(tokens[0])
                    labels.append(tokens[1])
                data.append([words, labels])
                sentence = []
        if len(sentence) != 0:
            words = []
            labels = []
            for tokens in sentence:
                words.append(tokens[0])
                labels.append(tokens[1])
            data.append([words, labels])
        return data

    def convert(self, data):
        """Convert lists of strings into Instances with Fields.
        """
        return convert_seq2seq_dataset(data)


 DataLoaderRegister.set_reader(DummyPOSReader, 'read_pos')


 class DummyCWSReader(DataSetLoader):
    """Load pku dataset for Chinese word segmentation.
    """
    def __init__(self):
        super(DummyCWSReader, self).__init__()

    def load(self, data_path, max_seq_len=32):
        """Load pku dataset for Chinese word segmentation.
        CWS (Chinese Word Segmentation) pku training dataset format:
        1. Each line is a sentence.
        2. Each word in a sentence is separated by space.
        This function convert the pku dataset into three-level lists with labels <BMES>.
        B: beginning of a word
        M: middle of a word
        E: ending of a word
        S: single character

        :param str data_path: path to the data set.
        :param max_seq_len: int, the maximum length of a sequence. If a sequence is longer than it, split it into
                several sequences.
        :return: three-level lists
        """
        assert isinstance(max_seq_len, int) and max_seq_len > 0
        with open(data_path, "r", encoding="utf-8") as f:
            sentences = f.readlines()
        data = []
        for sent in sentences:
            tokens = sent.strip().split()
            words = []
            labels = []
            for token in tokens:
                if len(token) == 1:
                    words.append(token)
                    labels.append("S")
                else:
                    words.append(token[0])
                    labels.append("B")
                    for idx in range(1, len(token) - 1):
                        words.append(token[idx])
                        labels.append("M")
                    words.append(token[-1])
                    labels.append("E")
            num_samples = len(words) // max_seq_len
            if len(words) % max_seq_len != 0:
                num_samples += 1
            for sample_idx in range(num_samples):
                start = sample_idx * max_seq_len
                end = (sample_idx + 1) * max_seq_len
                seq_words = words[start:end]
                seq_labels = labels[start:end]
                data.append([seq_words, seq_labels])
        return self.convert(data)

    def convert(self, data):
        return convert_seq2seq_dataset(data)


 class DummyClassificationReader(DataSetLoader):
    """Loader for a dummy classification data set"""

    def __init__(self):
        super(DummyClassificationReader, self).__init__()

    def load(self, data_path):
        assert os.path.exists(data_path)
        with open(data_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        data = self.parse(lines)
        return self.convert(data)

    @staticmethod
    def parse(lines):
        """每行第一个token是标签，其余是字/词；由空格分隔。

        :param lines: lines from dataset
        :return: list(list(list())): the three level of lists are words, sentence, and dataset
        """
        dataset = list()
        for line in lines:
            line = line.strip().split()
            label = line[0]
            words = line[1:]
            if len(words) <= 1:
                continue

            sentence = [words, label]
            dataset.append(sentence)
        return dataset

    def convert(self, data):
        return convert_seq2tag_dataset(data)


 class DummyLMReader(DataSetLoader):
    """A Dummy Language Model Dataset Reader
    """
    def __init__(self):
        super(DummyLMReader, self).__init__()

    def load(self, data_path):
        if not os.path.exists(data_path):
            raise FileNotFoundError("file {} not found.".format(data_path))
        with open(data_path, "r", encoding="utf=8") as f:
            text = " ".join(f.readlines())
        tokens = text.strip().split()
        data = self.sentence_cut(tokens)
        return self.convert(data)

    def sentence_cut(self, tokens, sentence_length=15):
        start_idx = 0
        data_set = []
        for idx in range(len(tokens) // sentence_length):
            x = tokens[start_idx * idx: start_idx * idx + sentence_length]
            y = tokens[start_idx * idx + 1: start_idx * idx + sentence_length + 1]
            if start_idx * idx + sentence_length + 1 >= len(tokens):
                # ad hoc
                y.extend(["<unk>"])
            data_set.append([x, y])
        return data_set

    def convert(self, data):
        pass


 class PeopleDailyCorpusLoader(DataSetLoader):
    """人民日报数据集
    """
@@ -448,8 +155,9 @@ class PeopleDailyCorpusLoader(DataSetLoader):


 class ConllLoader:
    def __init__(self, headers, indexs=None):
    def __init__(self, headers, indexs=None, dropna=True):
        self.headers = headers
        self.dropna = dropna
        if indexs is None:
            self.indexs = list(range(len(self.headers)))
        else:
@@ -458,33 +166,10 @@ class ConllLoader:
            self.indexs = indexs

    def load(self, path):
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            start = next(f)
            if '-DOCSTART-' not in start:
                sample.append(start.split())
            for line in f:
                if line.startswith('\n'):
                    if len(sample):
                        datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split())
            if len(sample) > 0:
                datalist.append(sample)

        data = [self.get_one(sample) for sample in datalist]
        data = filter(lambda x: x is not None, data)

        ds = DataSet()
        for sample in data:
            ins = Instance()
            for name, idx in zip(self.headers, self.indexs):
                ins.add_field(field_name=name, field=sample[idx])
            ds.append(ins)
        for idx, data in read_conll(path, indexes=self.indexs, dropna=self.dropna):
            ins = {h:data[idx] for h, idx in zip(self.headers, self.indexs)}
            ds.append(Instance(**ins))
        return ds

    def get_one(self, sample):
@@ -499,9 +184,7 @@ class Conll2003Loader(ConllLoader):
    """Loader for conll2003 dataset
    
        More information about the given dataset cound be found on 
        https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data 

        Deprecated. Use ConllLoader for all types of conll-format files.
        https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
    """
    def __init__(self):
        headers = [
@@ -510,194 +193,6 @@ class Conll2003Loader(ConllLoader):
        super(Conll2003Loader, self).__init__(headers=headers)


 class SNLIDataSetReader(DataSetLoader):
    """A data set loader for SNLI data set.

    """
    def __init__(self):
        super(SNLIDataSetReader, self).__init__()

    def load(self, path_list):
        """

        :param list path_list: A list of file name, in the order of premise file, hypothesis file, and label file.
        :return: A DataSet object.
        """
        assert len(path_list) == 3
        line_set = []
        for file in path_list:
            if not os.path.exists(file):
                raise FileNotFoundError("file {} NOT found".format(file))

            with open(file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
                line_set.append(lines)

        premise_lines, hypothesis_lines, label_lines = line_set
        assert len(premise_lines) == len(hypothesis_lines) and len(premise_lines) == len(label_lines)

        data_set = []
        for premise, hypothesis, label in zip(premise_lines, hypothesis_lines, label_lines):
            p = premise.strip().split()
            h = hypothesis.strip().split()
            l = label.strip()
            data_set.append([p, h, l])

        return self.convert(data_set)

    def convert(self, data):
        """Convert a 3D list to a DataSet object.

        :param data: A 3D tensor.
            Example::
                [
                    [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
                    [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
                    ...
                ]

        :return: A DataSet object.
        """

        data_set = DataSet()

        for example in data:
            p, h, l = example
            # list, list, str
            instance = Instance()
            instance.add_field("premise", p)
            instance.add_field("hypothesis", h)
            instance.add_field("truth", l)
            data_set.append(instance)
        data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len")
        data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len")
        data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
        data_set.set_target("truth")
        return data_set


 class ConllCWSReader(object):
    """Deprecated. Use ConllLoader for all types of conll-format files."""
    def __init__(self):
        pass

    def load(self, path, cut_long_sent=False):
        """
        返回的DataSet只包含raw_sentence这个field，内容为str。
        假定了输入为conll的格式，以空行隔开两个句子，每行共7列，即
        ::

            1	编者按	编者按	NN	O	11	nmod:topic
            2	：	：	PU	O	11	punct
            3	7月	7月	NT	DATE	4	compound:nn
            4	12日	12日	NT	DATE	11	nmod:tmod
            5	，	，	PU	O	11	punct

            1	这	这	DT	O	3	det
            2	款	款	M	O	1	mark:clf
            3	飞行	飞行	NN	O	8	nsubj
            4	从	从	P	O	5	case
            5	外型	外型	NN	O	8	nmod:prep

        """
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.strip().split())
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet()
        for sample in datalist:
            # print(sample)
            res = self.get_char_lst(sample)
            if res is None:
                continue
            line = ' '.join(res)
            if cut_long_sent:
                sents = cut_long_sentence(line)
            else:
                sents = [line]
            for raw_sentence in sents:
                ds.append(Instance(raw_sentence=raw_sentence))
        return ds

    def get_char_lst(self, sample):
        if len(sample) == 0:
            return None
        text = []
        for w in sample:
            t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
            if t3 == '_':
                return None
            text.append(t1)
        return text


 class NaiveCWSReader(DataSetLoader):
    """
    这个reader假设了分词数据集为以下形式, 即已经用空格分割好内容了
    例如::

        这是 fastNLP , 一个 非常 good 的 包 .

    或者,即每个part后面还有一个pos tag
    例如::

        也/D  在/P  團員/Na  之中/Ng  ，/COMMACATEGORY

    """

    def __init__(self, in_word_splitter=None):
        super(NaiveCWSReader, self).__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        允许使用的情况有(默认以\t或空格作为seg)
            这是 fastNLP , 一个 非常 good 的 包 .
        和
            也/D  在/P  團員/Na  之中/Ng  ，/COMMACATEGORY
        如果splitter不为None则认为是第二种情况, 且我们会按splitter分割"也/D", 然后取第一部分. 例如"也/D".split('/')[0]

        :param filepath:
        :param in_word_splitter:
        :param cut_long_sent:
        :return:
        """
        if in_word_splitter == None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # 不能接受空行
                    continue

                if not in_word_splitter is None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                        line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)

        return dataset


 def cut_long_sentence(sent, max_sample_length=200):
    """
    将长于max_sample_length的sentence截成多段，只会在有空格的地方发生截断。所以截取的句子可能长于或者短于max_sample_length
@@ -727,103 +222,6 @@ def cut_long_sentence(sent, max_sample_length=200):
    return cutted_sentence


 class ZhConllPOSReader(object):
    """读取中文Conll格式。返回“字级别”的标签，使用BMES记号扩展原来的词级别标签。

        Deprecated. Use ConllLoader for all types of conll-format files.
    """
    def __init__(self):
        pass

    def load(self, path):
        """
        返回的DataSet, 包含以下的field
            words：list of str,
            tag: list of str, 被加入了BMES tag, 比如原来的序列为['VP', 'NN', 'NN', ..]，会被认为是["S-VP", "B-NN", "M-NN",..]
        假定了输入为conll的格式，以空行隔开两个句子，每行共7列，即
        ::

            1	编者按	编者按	NN	O	11	nmod:topic
            2	：	：	PU	O	11	punct
            3	7月	7月	NT	DATE	4	compound:nn
            4	12日	12日	NT	DATE	11	nmod:tmod
            5	，	，	PU	O	11	punct

            1	这	这	DT	O	3	det
            2	款	款	M	O	1	mark:clf
            3	飞行	飞行	NN	O	8	nsubj
            4	从	从	P	O	5	case
            5	外型	外型	NN	O	8	nmod:prep

        """
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet()
        for sample in datalist:
            # print(sample)
            res = self.get_one(sample)
            if res is None:
                continue
            char_seq = []
            pos_seq = []
            for word, tag in zip(res[0], res[1]):
                char_seq.extend(list(word))
                if len(word) == 1:
                    pos_seq.append('S-{}'.format(tag))
                elif len(word) > 1:
                    pos_seq.append('B-{}'.format(tag))
                    for _ in range(len(word) - 2):
                        pos_seq.append('M-{}'.format(tag))
                    pos_seq.append('E-{}'.format(tag))
                else:
                    raise ValueError("Zero length of word detected.")

            ds.append(Instance(words=char_seq,
                               tag=pos_seq))

        return ds

    def get_one(self, sample):
        if len(sample) == 0:
            return None
        text = []
        pos_tags = []
        for w in sample:
            t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
            if t3 == '_':
                return None
            text.append(t1)
            pos_tags.append(t2)
        return text, pos_tags


 class ConllxDataLoader(ConllLoader):
    """返回“词级别”的标签信息，包括词、词性、（句法）头依赖、（句法）边标签。跟``ZhConllPOSReader``完全不同。

        Deprecated. Use ConllLoader for all types of conll-format files.
    """
    def __init__(self):
        headers = [
            'words', 'pos_tags', 'heads', 'labels',
        ]
        indexs = [
            1, 3, 6, 7,
        ]
        super(ConllxDataLoader, self).__init__(headers=headers, indexs=indexs)


 class SSTLoader(DataSetLoader):
    """load SST data in PTB tree format
        data source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
@@ -842,10 +240,7 @@ class SSTLoader(DataSetLoader):
        """

        :param path: str，存储数据的路径
        :return: DataSet。内含field有'words', 'pos_tags', 'heads', 'labels'(parser的label)
            类似于拥有以下结构, 一行为一个instance(sample)
            words           pos_tags        heads       labels
            ['some', ..]    ['NN', ...]     [2, 3...]   ['nn', 'nn'...]
        :return: DataSet。
        """
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
@@ -860,7 +255,6 @@ class SSTLoader(DataSetLoader):

    @staticmethod
    def get_one(data, subtree):
        from nltk.tree import Tree
        tree = Tree.fromstring(data)
        if subtree:
            return [(t.leaves(), t.label()) for t in tree.subtrees()]
@@ -872,26 +266,72 @@ class JsonLoader(DataSetLoader):
        every line contains a json obj, like a dict
        fields is the dict key that need to be load
    """
    def __init__(self, **fields):
    def __init__(self, dropna=False, fields=None):
        super(JsonLoader, self).__init__()
        self.fields = {}
        for k, v in fields.items():
            self.fields[k] = k if v is None else v
        self.dropna = dropna
        self.fields = None
        self.fields_list = None
        if fields:
            self.fields = {}
            for k, v in fields.items():
                self.fields[k] = k if v is None else v
            self.fields_list = list(self.fields.keys())

    def load(self, path):
        ds = DataSet()
        for idx, d in read_json(path, fields=self.fields_list, dropna=self.dropna):
            ins = {self.fields[k]:v for k,v in d.items()}
            ds.append(Instance(**ins))
        return ds


 class SNLILoader(JsonLoader):
    """
    data source: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
    """
    def __init__(self):
        fields = {
            'sentence1_parse': 'words1',
            'sentence2_parse': 'words2',
            'gold_label': 'target',
        }
        super(SNLILoader, self).__init__(fields=fields)

    def load(self, path):
        ds = super(SNLILoader, self).load(path)
        def parse_tree(x):
            t = Tree.fromstring(x)
            return t.leaves()
        ds.apply(lambda ins: parse_tree(ins['words1']), new_field_name='words1')
        ds.apply(lambda ins: parse_tree(ins['words2']), new_field_name='words2')
        ds.drop(lambda x: x['target'] == '-')
        return ds


 class CSVLoader(DataSetLoader):
    """Load data from a CSV file and return a DataSet object.

            :param str csv_path: path to the CSV file
            :param List[str] or Tuple[str] headers: headers of the CSV file
            :param str sep: delimiter in CSV file. Default: ","
            :param bool dropna: If True, drop rows that have less entries than headers.
            :return dataset: the read data set

    """
    def __init__(self, headers=None, sep=",", dropna=True):
        self.headers = headers
        self.sep = sep
        self.dropna = dropna

    def load(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            datas = [json.loads(l) for l in f]
        ds = DataSet()
        for d in datas:
            ins = Instance()
            for k, v in d.items():
                if k in self.fields:
                    ins.add_field(self.fields[k], v)
            ds.append(ins)
        for idx, data in read_csv(path, headers=self.headers,
                                  sep=self.sep, dropna=self.dropna):
            ds.append(Instance(**data))
        return ds


 def add_seg_tag(data):
 def _add_seg_tag(data):
    """

    :param data: list of ([word], [pos], [heads], [head_tags])
--- a/fastNLP/io/file_reader.py
+++ b/fastNLP/io/file_reader.py
@@ -0,0 +1,112 @@
 import json


 def read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
    """
    Construct a generator to read csv items
    :param path: file path
    :param encoding: file's encoding, default: utf-8
    :param headers: file's headers, if None, make file's first line as headers. default: None
    :param sep: separator for each column. default: ','
    :param dropna: weather to ignore and drop invalid data,
            if False, raise ValueError when reading invalid data. default: True
    :return: generator, every time yield (line number, csv item)
    """
    with open(path, 'r', encoding=encoding) as f:
        start_idx = 0
        if headers is None:
            headers = f.readline().rstrip('\r\n')
            headers = headers.split(sep)
            start_idx += 1
        elif not isinstance(headers, (list, tuple)):
                raise TypeError("headers should be list or tuple, not {}." \
                        .format(type(headers)))
        for line_idx, line in enumerate(f, start_idx):
            contents = line.rstrip('\r\n').split(sep)
            if len(contents) != len(headers):
                if dropna:
                    continue
                else:
                    raise ValueError("Line {} has {} parts, while header has {} parts." \
                                     .format(line_idx, len(contents), len(headers)))
            _dict = {}
            for header, content in zip(headers, contents):
                _dict[header] = content
            yield line_idx, _dict


 def read_json(path, encoding='utf-8', fields=None, dropna=True):
    """
    Construct a generator to read json items
    :param path: file path
    :param encoding: file's encoding, default: utf-8
    :param fields: json object's fields that needed, if None, all fields are needed. default: None
    :param dropna: weather to ignore and drop invalid data,
            if False, raise ValueError when reading invalid data. default: True
    :return: generator, every time yield (line number, json item)
    """
    if fields:
        fields = set(fields)
    with open(path, 'r', encoding=encoding) as f:
        for line_idx, line in enumerate(f):
            data = json.loads(line)
            if fields is None:
                yield line_idx, data
                continue
            _res = {}
            for k, v in data.items():
                if k in fields:
                    _res[k] = v
            if len(_res) < len(fields):
                if dropna:
                    continue
                else:
                    raise ValueError('invalid instance at line: {}'.format(line_idx))
            yield line_idx, _res


 def read_conll(path, encoding='utf-8', indexes=None, dropna=True):
    """
    Construct a generator to read conll items
    :param path: file path
    :param encoding: file's encoding, default: utf-8
    :param indexes: conll object's column indexes that needed, if None, all columns are needed. default: None
    :param dropna: weather to ignore and drop invalid data,
            if False, raise ValueError when reading invalid data. default: True
    :return: generator, every time yield (line number, conll item)
    """
    def parse_conll(sample):
        sample = list(map(list, zip(*sample)))
        sample = [sample[i] for i in indexes]
        for f in sample:
            if len(f) <= 0:
                raise ValueError('empty field')
        return sample
    with open(path, 'r', encoding=encoding) as f:
        sample = []
        start = next(f)
        if '-DOCSTART-' not in start:
            sample.append(start.split())
        for line_idx, line in enumerate(f, 1):
            if line.startswith('\n'):
                if len(sample):
                    try:
                        res = parse_conll(sample)
                        sample = []
                        yield line_idx, res
                    except Exception as e:
                        if dropna:
                            continue
                        raise ValueError('invalid instance at line: {}'.format(line_idx))
            elif line.startswith('#'):
                continue
            else:
                sample.append(line.split())
        if len(sample) > 0:
            try:
                res = parse_conll(sample)
                yield line_idx, res
            except Exception as e:
                if dropna:
                    return
                raise ValueError('invalid instance at line: {}'.format(line_idx))
--- a/fastNLP/modules/encoder/lstm.py
+++ b/fastNLP/modules/encoder/lstm.py
@@ -1,4 +1,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.utils.rnn as rnn

 from fastNLP.modules.utils import initial_parameter

@@ -19,21 +21,44 @@ class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
                 bidirectional=False, bias=True, initial_method=None, get_hidden=False):
        super(LSTM, self).__init__()
        self.batch_first = batch_first
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
                            dropout=dropout, bidirectional=bidirectional)
        self.get_hidden = get_hidden
        initial_parameter(self, initial_method)

    def forward(self, x, h0=None, c0=None):
    def forward(self, x, seq_lens=None, h0=None, c0=None):
        if h0 is not None and c0 is not None:
            x, (ht, ct) = self.lstm(x, (h0, c0))
            hx = (h0, c0)
        else:
            x, (ht, ct) = self.lstm(x)
        if self.get_hidden:
            return x, (ht, ct)
            hx = None
        if seq_lens is not None and not isinstance(x, rnn.PackedSequence):
            print('padding')
            sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
            if self.batch_first:
                x = x[sort_idx]
            else:
                x = x[:, sort_idx]
            x = rnn.pack_padded_sequence(x, sort_lens, batch_first=self.batch_first)
            output, hx = self.lstm(x, hx)  # -> [N,L,C]
            output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first)
            _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
            if self.batch_first:
                output = output[unsort_idx]
            else:
                output = output[:, unsort_idx]
        else:
            return x
            output, hx = self.lstm(x, hx)
        if self.get_hidden:
            return output, hx
        return output


 if __name__ == "__main__":
    lstm = LSTM(10)
    lstm = LSTM(input_size=2, hidden_size=2, get_hidden=False)
    x = torch.randn((3, 5, 2))
    seq_lens = torch.tensor([5,1,2])
    y = lstm(x, seq_lens)
    print(x)
    print(y)
    print(x.size(), y.size(), )
--- a/test/core/test_dataset.py
+++ b/test/core/test_dataset.py
@@ -202,25 +202,11 @@ class TestDataSetMethods(unittest.TestCase):
        self.assertTrue(isinstance(ans, FieldArray))
        self.assertEqual(ans.content, [[5, 6]] * 10)

    def test_reader(self):
        # 跑通即可
        ds = DataSet().read_naive("test/data_for_tests/tutorial_sample_dataset.csv")
        self.assertTrue(isinstance(ds, DataSet))
        self.assertTrue(len(ds) > 0)

        ds = DataSet().read_rawdata("test/data_for_tests/people_daily_raw.txt")
        self.assertTrue(isinstance(ds, DataSet))
        self.assertTrue(len(ds) > 0)

        ds = DataSet().read_pos("test/data_for_tests/people.txt")
        self.assertTrue(isinstance(ds, DataSet))
        self.assertTrue(len(ds) > 0)

    def test_add_null(self):
        # TODO test failed because 'fastNLP\core\fieldarray.py:143: RuntimeError'
        ds = DataSet()
        ds.add_field('test', [])
        ds.set_target('test')
    # def test_add_null(self):
    #     # TODO test failed because 'fastNLP\core\fieldarray.py:143: RuntimeError'
    #     ds = DataSet()
    #     ds.add_field('test', [])
    #     ds.set_target('test')


 class TestDataSetIter(unittest.TestCase):
--- a/test/data_for_tests/sample_snli.jsonl
+++ b/test/data_for_tests/sample_snli.jsonl
@@ -0,0 +1,3 @@
 {"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"}
 {"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"}
 {"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"}
--- a/test/io/test_dataset_loader.py
+++ b/test/io/test_dataset_loader.py
@@ -1,8 +1,7 @@
 import unittest

 from fastNLP.io.dataset_loader import Conll2003Loader, PeopleDailyCorpusLoader, ConllCWSReader, \
    ZhConllPOSReader, ConllxDataLoader

 from fastNLP.io.dataset_loader import Conll2003Loader, PeopleDailyCorpusLoader, \
    CSVLoader, SNLILoader

 class TestDatasetLoader(unittest.TestCase):

@@ -17,11 +16,11 @@ class TestDatasetLoader(unittest.TestCase):
    def test_PeopleDailyCorpusLoader(self):
        data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt")

    def test_ConllCWSReader(self):
        dataset = ConllCWSReader().load("test/data_for_tests/conll_example.txt")

    def test_ZhConllPOSReader(self):
        dataset = ZhConllPOSReader().load("test/data_for_tests/zh_sample.conllx")
    def test_CSVLoader(self):
        ds = CSVLoader(sep='\t', headers=['words', 'label'])\
            .load('test/data_for_tests/tutorial_sample_dataset.csv')
        assert len(ds) > 0

    def test_ConllxDataLoader(self):
        dataset = ConllxDataLoader().load("test/data_for_tests/zh_sample.conllx")
    def test_SNLILoader(self):
        ds = SNLILoader().load('test/data_for_tests/sample_snli.jsonl')
        assert len(ds) == 3