
Merge branch 'dataset' of https://github.com/yhcc/fastNLP into dataset

tags/v0.2.0
yh 5 years ago
parent commit 89ce85b6ed
12 changed files with 185 additions and 276 deletions
  1. fastNLP/api/pos_tagger.py (+44, -0)
  2. fastNLP/core/batch.py (+2, -2)
  3. fastNLP/core/dataset.py (+49, -80)
  4. fastNLP/core/field.py (+9, -74)
  5. fastNLP/core/fieldarray.py (+39, -0)
  6. fastNLP/core/instance.py (+0, -52)
  7. fastNLP/core/predictor.py (+4, -37)
  8. fastNLP/loader/model_loader.py (+9, -2)
  9. fastNLP/models/sequence_modeling.py (+2, -1)
  10. fastNLP/saver/model_saver.py (+6, -2)
  11. reproduction/pos_tag_model/train_pos_tag.py (+20, -25)
  12. requirements.txt (+1, -1)

fastNLP/api/pos_tagger.py (+44, -0)

@@ -0,0 +1,44 @@
import pickle

import numpy as np

from fastNLP.core.dataset import DataSet
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.predictor import Predictor


class POS_tagger:
    def __init__(self):
        pass

    def predict(self, query):
        """
        :param query: List[str]
        :return answer: List[str]

        """
        # TODO: build a DataSet from the query
        pos_dataset = DataSet()
        pos_dataset["text_field"] = np.array(query)

        # load the pipeline and the model
        pipeline = self.load_pipeline("./xxxx")

        # run the pipeline on the DataSet
        pos_dataset = pipeline(pos_dataset)

        # load the model
        model = ModelLoader().load_pytorch("./xxx")

        # call the predictor
        predictor = Predictor()
        output = predictor.predict(model, pos_dataset)

        # TODO: convert to the final output
        return None

    @staticmethod
    def load_pipeline(path):
        # pickled objects must be read in binary mode
        with open(path, "rb") as fp:
            pipeline = pickle.load(fp)
        return pipeline
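For orientation, a minimal sketch of how this skeleton is meant to be called once the TODOs are filled in; the saved-artifact paths and the returned tag format are assumptions, not part of this commit:

from fastNLP.api.pos_tagger import POS_tagger

tagger = POS_tagger()
tags = tagger.predict(["我", "爱", "北京"])  # intended: one POS tag per token (currently returns None)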

fastNLP/core/batch.py (+2, -2)

@@ -56,8 +56,8 @@ class Batch(object):
         indices = self.idx_list[self.curidx:endidx]

         for field_name, field in self.dataset.get_fields():
-            batch = field.get(indices)
-            if not field.tensorable:  # TODO: revise
+            batch = torch.from_numpy(field.get(indices))
+            if not field.need_tensor:  # TODO: revise
                 pass
             elif field.is_target:
                 batch_y[field_name] = batch
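Read the new routing as: fields with need_tensor=False are skipped, is_target=True fields land in batch_y, and the remaining fields land in batch_x. A self-contained sketch of that dispatch with a stand-in field object (the real one is the FieldArray added below):

import numpy as np
import torch

class StubField:  # stand-in for fastNLP.core.fieldarray.FieldArray
    def __init__(self, rows, is_target, need_tensor=True):
        self.rows, self.is_target, self.need_tensor = rows, is_target, need_tensor
    def get(self, indices):
        return np.array([self.rows[i] for i in indices])

fields = {"word_seq": StubField([[1, 2], [3, 4]], is_target=False),
          "label_seq": StubField([[0], [1]], is_target=True)}
batch_x, batch_y = {}, {}
for name, field in fields.items():
    batch = torch.from_numpy(field.get([0, 1]))
    if not field.need_tensor:
        pass
    elif field.is_target:
        batch_y[name] = batch
    else:
        batch_x[name] = batch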


fastNLP/core/dataset.py (+49, -80)

@@ -2,10 +2,12 @@ import random
 import sys
 from collections import defaultdict
 from copy import deepcopy
+import numpy as np

 from fastNLP.core.field import TextField, LabelField
 from fastNLP.core.instance import Instance
 from fastNLP.core.vocabulary import Vocabulary
+from fastNLP.core.fieldarray import FieldArray

 _READERS = {}

@@ -14,43 +16,36 @@ class DataSet(object):

     """

-    def __init__(self, fields=None):
-        """
-
-        """
-        pass
-
-    def index_all(self, vocab):
-        for ins in self:
-            ins.index_all(vocab)
-        return self
+    def __init__(self, instance=None):
+        self.field_arrays = {}  # must exist before _convert_ins() calls append()
+        if instance is not None:
+            self._convert_ins(instance)

-    def index_field(self, field_name, vocab):
-        if isinstance(field_name, str):
-            field_list = [field_name]
-            vocab_list = [vocab]
+    def _convert_ins(self, ins_list):
+        if isinstance(ins_list, list):
+            for ins in ins_list:
+                self.append(ins)
         else:
-            classes = (list, tuple)
-            assert isinstance(field_name, classes) and isinstance(vocab, classes) and len(field_name) == len(vocab)
-            field_list = field_name
-            vocab_list = vocab
-
-        for name, vocabs in zip(field_list, vocab_list):
-            for ins in self:
-                ins.index_field(name, vocabs)
-        return self
+            self.append(ins_list)

-    def to_tensor(self, idx: int, padding_length: dict):
-        """Convert an instance in a dataset to tensor.
+    def append(self, ins):
+        # no fields yet: create one FieldArray per field of the first instance
+        if len(self.field_arrays) == 0:
+            for name, field in ins.fields.items():  # Instance stores its field dict as `fields`
+                self.field_arrays[name] = FieldArray(name, [field])
+        else:
+            assert len(self.field_arrays) == len(ins.fields)
+            for name, field in ins.fields.items():
+                assert name in self.field_arrays
+                self.field_arrays[name].append(field)

-        :param idx: int, the index of the instance in the dataset.
-        :param padding_length: int
-        :return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
-                tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
+    def get_fields(self):
+        return self.field_arrays

-        """
-        ins = self[idx]
-        return ins.to_tensor(padding_length, self.origin_len)
+    def __len__(self):
+        # dict views are not indexable in Python 3
+        field = next(iter(self.field_arrays.values()))
+        return len(field)

     def get_length(self):
         """Fetch lengths of all fields in all instances in a dataset.
@@ -59,15 +54,10 @@ class DataSet(object):
         The list contains lengths of this field in all instances.

         """
-        lengths = defaultdict(list)
-        for ins in self:
-            for field_name, field_length in ins.get_length().items():
-                lengths[field_name].append(field_length)
-        return lengths
+        pass

     def shuffle(self):
-        random.shuffle(self)
-        return self
+        pass

     def split(self, ratio, shuffle=True):
         """Train/dev splitting
@@ -78,58 +68,37 @@ class DataSet(object):
             dev_set: a DataSet object, representing the validation set

         """
-        assert 0 < ratio < 1
-        if shuffle:
-            self.shuffle()
-        split_idx = int(len(self) * ratio)
-        dev_set = deepcopy(self)
-        train_set = deepcopy(self)
-        del train_set[:split_idx]
-        del dev_set[split_idx:]
-        return train_set, dev_set
+        pass

     def rename_field(self, old_name, new_name):
         """rename a field
         """
-        for ins in self:
-            ins.rename_field(old_name, new_name)
+        if old_name in self.field_arrays:
+            self.field_arrays[new_name] = self.field_arrays.pop(old_name)
+        else:
+            raise KeyError
         return self

-    def set_target(self, **fields):
+    def set_is_target(self, **fields):
         """Change the flag of `is_target` for all instances. For fields not set here, leave their `is_target` unchanged.

-        :param key-value pairs for field-name and `is_target` value(True, False or None).
+        :param key-value pairs for field-name and `is_target` value(True, False).
         """
-        for ins in self:
-            ins.set_target(**fields)
-        return self
-
-    def update_vocab(self, **name_vocab):
-        """using certain field data to update vocabulary.
-
-        e.g. ::
-
-            # update word vocab and label vocab separately
-            dataset.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
-        """
-        for field_name, vocab in name_vocab.items():
-            for ins in self:
-                vocab.update(ins[field_name].contents())
+        for name, val in fields.items():
+            if name in self.field_arrays:
+                assert isinstance(val, bool)
+                self.field_arrays[name].is_target = val
+            else:
+                raise KeyError
         return self

-    def set_origin_len(self, origin_field, origin_len_name=None):
-        """make dataset tensor output contain origin_len field.
-
-        e.g. ::
-
-            # output "word_seq_origin_len", lengths based on "word_seq" field
-            dataset.set_origin_len("word_seq")
-        """
-        if origin_field is None:
-            self.origin_len = None
-        else:
-            self.origin_len = (origin_field + "_origin_len", origin_field) \
-                if origin_len_name is None else (origin_len_name, origin_field)
+    def set_need_tensor(self, **kwargs):
+        for name, val in kwargs.items():
+            if name in self.field_arrays:
+                assert isinstance(val, bool)
+                self.field_arrays[name].need_tensor = val
+            else:
+                raise KeyError
         return self

     def __getattribute__(self, name):
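Taken together, DataSet is now a column store: one FieldArray per field name, grown row by row through append(). A minimal sketch of the intended usage (relying on the small fixes noted above):

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

ds = DataSet([Instance(word_seq=[1, 2, 3], label_seq=[0, 0, 1]),
              Instance(word_seq=[4, 5], label_seq=[1, 0])])
ds.set_is_target(label_seq=True)    # route "label_seq" into batch_y
ds.set_need_tensor(word_seq=True)   # keep "word_seq" as a tensor input
print(len(ds))                      # 2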


fastNLP/core/field.py (+9, -74)

@@ -7,10 +7,9 @@ class Field(object):

     """

-    def __init__(self, name, is_target: bool):
-        self.name = name
+    def __init__(self, content, is_target: bool):
         self.is_target = is_target
-        self.content = None
+        self.content = content

     def index(self, vocab):
         """create index field
@@ -29,23 +28,15 @@ class Field(object):
         raise NotImplementedError

     def __repr__(self):
-        return self.contents().__repr__()
-
-    def new(self, *args, **kwargs):
-        return self.__class__(*args, **kwargs, is_target=self.is_target)
+        return self.content.__repr__()


 class TextField(Field):
-    def __init__(self, name, text, is_target):
+    def __init__(self, text, is_target):
         """
         :param text: list of strings
         :param is_target: bool
         """
-        super(TextField, self).__init__(name, is_target)
-        self.content = text
-
-    def index(self, vocab):
-        idx_field = IndexField(self.name+'_idx', self.content, vocab, self.is_target)
-        return idx_field
+        super(TextField, self).__init__(text, is_target)

class IndexField(Field):
@@ -82,75 +73,19 @@ class LabelField(Field):

     """
     def __init__(self, label, is_target=True):
-        super(LabelField, self).__init__(is_target)
-        self.label = label
-        self._index = None
-
-    def get_length(self):
-        """Fetch the length of the label field.
-
-        :return length: int, the length of the label, always 1.
-        """
-        return 1
-
-    def index(self, vocab):
-        if self._index is None:
-            if isinstance(self.label, str):
-                self._index = vocab[self.label]
-        return self._index
-
-    def to_tensor(self, padding_length):
-        if self._index is None:
-            if isinstance(self.label, int):
-                return torch.tensor(self.label)
-            elif isinstance(self.label, str):
-                raise RuntimeError("Field {} not indexed. Call index method.".format(self.label))
-            else:
-                raise RuntimeError(
-                    "Not support type for LabelField. Expect str or int, got {}.".format(type(self.label)))
-        else:
-            return torch.LongTensor([self._index])
-
-    def contents(self):
-        return [self.label]
+        super(LabelField, self).__init__(label, is_target)


 class SeqLabelField(Field):
     def __init__(self, label_seq, is_target=True):
-        super(SeqLabelField, self).__init__(is_target)
-        self.label_seq = label_seq
-        self._index = None
-
-    def get_length(self):
-        return len(self.label_seq)
-
-    def index(self, vocab):
-        if self._index is None:
-            self._index = [vocab[c] for c in self.label_seq]
-        return self._index
-
-    def to_tensor(self, padding_length):
-        pads = [0] * (padding_length - self.get_length())
-        if self._index is None:
-            if self.get_length() == 0:
-                return torch.LongTensor(pads)
-            elif isinstance(self.label_seq[0], int):
-                return torch.LongTensor(self.label_seq + pads)
-            elif isinstance(self.label_seq[0], str):
-                raise RuntimeError("Field {} not indexed. Call index method.".format(self.label))
-            else:
-                raise RuntimeError(
-                    "Not support type for SeqLabelField. Expect str or int, got {}.".format(type(self.label)))
-        else:
-            return torch.LongTensor(self._index + pads)
-
-    def contents(self):
-        return self.label_seq.copy()
+        super(SeqLabelField, self).__init__(label_seq, is_target)


 class CharTextField(Field):
     def __init__(self, text, max_word_len, is_target=False):
         super(CharTextField, self).__init__(is_target)
         self.text = text
+        # TODO
+        raise NotImplementedError
         self.max_word_len = max_word_len
         self._index = []


fastNLP/core/fieldarray.py (+39, -0)

@@ -0,0 +1,39 @@
import torch
import numpy as np


class FieldArray(object):
    def __init__(self, name, content, padding_val=0, is_target=True, need_tensor=True):
        self.name = name
        self.data = [self._convert_np(val) for val in content]
        self.padding_val = padding_val
        self.is_target = is_target
        self.need_tensor = need_tensor

    def _convert_np(self, val):
        # isinstance needs the type np.ndarray; np.array is a constructor function
        if not isinstance(val, np.ndarray):
            return np.array(val)
        else:
            return val

    def append(self, val):
        self.data.append(self._convert_np(val))

    def get(self, idxes):
        if isinstance(idxes, int):
            return self.data[idxes]
        elif isinstance(idxes, list):
            id_list = np.array(idxes)
            batch_size = len(id_list)
            len_list = [(i, self.data[i].shape[0]) for i in id_list]
            _, max_len = max(len_list, key=lambda x: x[1])
            array = np.full((batch_size, max_len), self.padding_val, dtype=np.int32)

            for i, (idx, length) in enumerate(len_list):
                if length == max_len:
                    array[i] = self.data[idx]
                else:
                    array[i][:length] = self.data[idx]
            return array

    def __len__(self):
        return len(self.data)
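A quick sketch of get(): an int index returns one row, while a list of indices returns a right-padded int32 matrix whose width is the longest selected sequence (padding_val fills the tail):

from fastNLP.core.fieldarray import FieldArray

fa = FieldArray("word_seq", [[1, 2, 3], [4, 5]], padding_val=0)
print(fa.get(1))       # [4 5]
print(fa.get([0, 1]))  # [[1 2 3]
                       #  [4 5 0]]  <- second row padded to max_len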

fastNLP/core/instance.py (+0, -52)

@@ -7,8 +7,6 @@ class Instance(object):

     def __init__(self, **fields):
         self.fields = fields
-        self.has_index = False
-        self.indexes = {}

     def add_field(self, field_name, field):
         self.fields[field_name] = field
@@ -17,8 +15,6 @@ class Instance(object):
     def rename_field(self, old_name, new_name):
         if old_name in self.fields:
             self.fields[new_name] = self.fields.pop(old_name)
-            if old_name in self.indexes:
-                self.indexes[new_name] = self.indexes.pop(old_name)
         else:
             raise KeyError("error, no such field: {}".format(old_name))
         return self
@@ -38,53 +34,5 @@ class Instance(object):
     def __setitem__(self, name, field):
         return self.add_field(name, field)

-    def get_length(self):
-        """Fetch the length of all fields in the instance.
-
-        :return length: dict of (str: int), which means (field name: field length).
-
-        """
-        length = {name: field.get_length() for name, field in self.fields.items()}
-        return length
-
-    def index_field(self, field_name, vocab):
-        """use `vocab` to index certain field
-        """
-        self.indexes[field_name] = self.fields[field_name].index(vocab)
-        return self
-
-    def index_all(self, vocab):
-        """use `vocab` to index all fields
-        """
-        if self.has_index:
-            print("error")
-            return self.indexes
-        indexes = {name: field.index(vocab) for name, field in self.fields.items()}
-        self.indexes = indexes
-        return indexes
-
-    def to_tensor(self, padding_length: dict, origin_len=None):
-        """Convert instance to tensor.
-
-        :param padding_length: dict of (str: int), which means (field name: padding_length of this field)
-        :return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
-                tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
-                If is_target is False for all fields, tensor_y would be an empty dict.
-        """
-        tensor_x = {}
-        tensor_y = {}
-        for name, field in self.fields.items():
-            if field.is_target is True:
-                tensor_y[name] = field.to_tensor(padding_length[name])
-            elif field.is_target is False:
-                tensor_x[name] = field.to_tensor(padding_length[name])
-            else:
-                # is_target is None
-                continue
-        if origin_len is not None:
-            name, field_name = origin_len
-            tensor_x[name] = torch.LongTensor([self.fields[field_name].get_length()])
-        return tensor_x, tensor_y
-
     def __repr__(self):
         return self.fields.__repr__()
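After these deletions, Instance is reduced to a thin named-field container; indexing and tensor conversion have moved to FieldArray and Batch. Minimal usage:

from fastNLP.core.instance import Instance

ins = Instance(word_seq=[1, 2, 3], label=0)
ins.rename_field("label", "tag")
print(ins)  # __repr__ delegates to the underlying fields dict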

fastNLP/core/predictor.py (+4, -37)

@@ -2,9 +2,7 @@ import numpy as np
 import torch

 from fastNLP.core.batch import Batch
-from fastNLP.core.preprocess import load_pickle
 from fastNLP.core.sampler import SequentialSampler
-from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq2tag_dataset, convert_seq_dataset


 class Predictor(object):
@@ -16,19 +14,9 @@ class Predictor(object):
     Currently, Predictor does not support GPU.
     """

-    def __init__(self, pickle_path, post_processor):
-        """
-
-        :param pickle_path: str, the path to the pickle files.
-        :param post_processor: a function or callable object, that takes list of batch outputs as input
-
-        """
+    def __init__(self):
         self.batch_size = 1
         self.batch_output = []
-        self.pickle_path = pickle_path
-        self._post_processor = post_processor
-        self.label_vocab = load_pickle(self.pickle_path, "label2id.pkl")
-        self.word_vocab = load_pickle(self.pickle_path, "word2id.pkl")

     def predict(self, network, data):
         """Perform inference using the trained model.
@@ -37,9 +25,6 @@ class Predictor(object):
         :param data: a DataSet object.
         :return: list of list of strings, [num_examples, tag_seq_length]
         """
-        # transform strings into DataSet object
-        # data = self.prepare_input(data)
-
         # turn on the testing mode; clean up the history
         self.mode(network, test=True)
         batch_output = []
@@ -51,7 +36,7 @@ class Predictor(object):
             prediction = self.data_forward(network, batch_x)
             batch_output.append(prediction)

-        return self._post_processor(batch_output, self.label_vocab)
+        return batch_output

     def mode(self, network, test=True):
         if test:
@@ -64,37 +49,19 @@ class Predictor(object):
         y = network(**x)
         return y

-    def prepare_input(self, data):
-        """Transform two-level list of strings into an DataSet object.
-        In the training pipeline, this is done by Preprocessor. But in inference time, we do not call Preprocessor.
-
-        :param data: list of list of strings.
-            ::
-                [
-                    [word_11, word_12, ...],
-                    [word_21, word_22, ...],
-                    ...
-                ]
-
-        :return data_set: a DataSet instance.
-        """
-        assert isinstance(data, list)
-        data = convert_seq_dataset(data)
-        data.index_field("word_seq", self.word_vocab)
-

 class SeqLabelInfer(Predictor):
     def __init__(self, pickle_path):
         print(
             "[FastNLP Warning] SeqLabelInfer will be deprecated. Please use Predictor directly.")
-        super(SeqLabelInfer, self).__init__(pickle_path, seq_label_post_processor)
+        super(SeqLabelInfer, self).__init__()


 class ClassificationInfer(Predictor):
     def __init__(self, pickle_path):
         print(
             "[FastNLP Warning] ClassificationInfer will be deprecated. Please use Predictor directly.")
-        super(ClassificationInfer, self).__init__(pickle_path, text_classify_post_processor)
+        super(ClassificationInfer, self).__init__()


 def seq_label_post_processor(batch_outputs, label_vocab):
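With the pickle plumbing and post-processors removed, Predictor is stateless: construct it bare and hand it a model plus a DataSet; it now returns the raw per-batch outputs. A sketch (model and dataset assumed built elsewhere):

from fastNLP.core.predictor import Predictor

predictor = Predictor()                            # no pickle_path or post_processor
batch_outputs = predictor.predict(model, dataset)  # list of raw model outputs, one per batch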


fastNLP/loader/model_loader.py (+9, -2)

@@ -8,8 +8,8 @@ class ModelLoader(BaseLoader):
     Loader for models.
     """

-    def __init__(self, data_path):
-        super(ModelLoader, self).__init__(data_path)
+    def __init__(self):
+        super(ModelLoader, self).__init__()

     @staticmethod
     def load_pytorch(empty_model, model_path):
@@ -19,3 +19,10 @@ class ModelLoader(BaseLoader):
         :param model_path: str, the path to the saved model.
         """
         empty_model.load_state_dict(torch.load(model_path))
+
+    @staticmethod
+    def load_pytorch(model_path):  # NOTE: redefining the name shadows the two-argument load_pytorch above
+        """Load the entire model.
+
+        """
+        return torch.load(model_path)
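Intended usage of the new variant, assuming the model file was saved whole (param_only=False, see ModelSaver below); loading parameters into an existing skeleton would need the two-argument form that this definition shadows:

from fastNLP.loader.model_loader import ModelLoader

model = ModelLoader.load_pytorch("./save/saved_model.pkl")  # returns the entire pickled model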

fastNLP/models/sequence_modeling.py (+2, -1)

@@ -127,7 +127,8 @@ class AdvSeqLabel(SeqLabeling):
         :param word_seq: LongTensor, [batch_size, max_len]
         :param word_seq_origin_len: list of int.
         :param truth: LongTensor, [batch_size, max_len]
-        :return y:
+        :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
+            If truth is not None, return loss, a scalar. Used in training.
         """
         self.mask = self.make_mask(word_seq, word_seq_origin_len)
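The documented contract implies two call shapes at the call site (names are illustrative; shapes follow the docstring):

loss = model(word_seq, word_seq_origin_len, truth)  # training: scalar loss
paths = model(word_seq, word_seq_origin_len)        # testing/predicting: list of decode paths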



fastNLP/saver/model_saver.py (+6, -2)

@@ -15,10 +15,14 @@ class ModelSaver(object):
         """
         self.save_path = save_path

-    def save_pytorch(self, model):
+    def save_pytorch(self, model, param_only=True):
         """Save a pytorch model into .pkl file.

         :param model: a PyTorch model
+        :param param_only: bool, whether to save only the model parameters or the entire model.

         """
-        torch.save(model.state_dict(), self.save_path)
+        if param_only is True:
+            torch.save(model.state_dict(), self.save_path)
+        else:
+            torch.save(model, self.save_path)
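A sketch of the round trip these two flags enable, pairing with ModelLoader above (paths illustrative):

from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader

ModelSaver("./save/saved_model.pkl").save_pytorch(model, param_only=False)  # save the whole model
model = ModelLoader.load_pytorch("./save/saved_model.pkl")                  # restore it without a skeleton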

reproduction/pos_tag_model/train_pos_tag.py (+20, -25)

@@ -59,42 +59,37 @@ def infer():
     print("Inference finished!")


 def train():
-    # Config Loader
-    train_args = ConfigSection()
-    test_args = ConfigSection()
-    ConfigLoader("good_name").load_config(cfgfile, {"train": train_args, "test": test_args})
+    # load config
+    trainer_args = ConfigSection()
+    model_args = ConfigSection()
+    ConfigLoader().load_config(cfgfile, {"train": trainer_args, "test": model_args})

     # Data Loader
     loader = PeopleDailyCorpusLoader()
     train_data, _ = loader.load()

-    # Preprocessor
-    preprocessor = SeqLabelPreprocess()
-    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
-    train_args["vocab_size"] = preprocessor.vocab_size
-    train_args["num_classes"] = preprocessor.num_classes
+    # TODO: define processors
+
+    # define pipeline
+    pp = Pipeline()
+    # TODO: pp.add_processor()

-    # Trainer
-    trainer = SeqLabelTrainer(**train_args.data)
+    # run the pipeline, get data_set
+    train_data = pp(train_data)

-    # Model
-    model = AdvSeqLabel(train_args)
+    # define a model
+    model = AdvSeqLabel(trainer_args)
     try:
         ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
         print('model parameter loaded!')
     except Exception as e:
         print("No saved model. Continue.")
         pass

-    # Start training
+    # call trainer to train
+    trainer = SeqLabelTrainer(trainer_args)
     trainer.train(model, data_train, data_dev)
     print("Training finished!")

-    # Saver
-    saver = ModelSaver("./save/saved_model.pkl")
-    saver.save_pytorch(model)
-    print("Model saved!")
+    # save model
+    ModelSaver("./saved_model.pkl").save_pytorch(model, param_only=False)
+
+    # TODO: save pipeline


 def test():


requirements.txt (+1, -1)

@@ -1,4 +1,4 @@
 numpy>=1.14.2
-torch==0.4.0
+torch>=0.4.0
 torchvision>=0.1.8
 tensorboardX
