
changes to batch methods

- [action] add k-means bucketing: partition sequences into buckets of nearly equal length
- [trainer] print train loss every 10 steps
- [loader] CWS pku loader: split sequences longer than max_seq_len into several shorter sequences
tags/v0.1.0
FengZiYjun, 7 years ago
commit 8e3e6d4579
5 changed files with 28 additions and 16 deletions:

  1. fastNLP/core/action.py           (+10 -9)
  2. fastNLP/core/tester.py           (+1 -1)
  3. fastNLP/core/trainer.py          (+3 -0)
  4. fastNLP/loader/dataset_loader.py (+13 -6)
  5. fastNLP/loader/preprocess.py     (+1 -0)

fastNLP/core/action.py  (+10 -9)

@@ -14,7 +14,7 @@ class Action(object):

 def k_means_1d(x, k, max_iter=100):
     """
+    Perform k-means on 1-D data.
     :param x: list of int, representing points in 1-D.
     :param k: the number of clusters required.
     :param max_iter: maximum iteration
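
The body of k_means_1d sits outside this hunk, so only the docstring is visible here. As a rough, hypothetical sketch of what 1-D k-means over sequence lengths can look like (an illustration only, not the actual fastNLP implementation):

    import numpy as np

    def k_means_1d_sketch(x, k, max_iter=100):
        """Cluster 1-D points (e.g. sequence lengths) into k groups."""
        x = np.asarray(x, dtype=float)
        centroids = np.random.choice(x, size=k, replace=False)  # init centroids from the data
        assignments = np.zeros(len(x), dtype=int)
        for _ in range(max_iter):
            # assign each point to its nearest centroid
            assignments = np.argmin(np.abs(x[:, None] - centroids[None, :]), axis=1)
            # move each centroid to the mean of its points; keep it if the cluster is empty
            new_centroids = np.array([x[assignments == c].mean() if np.any(assignments == c)
                                      else centroids[c] for c in range(k)])
            if np.allclose(new_centroids, centroids):
                break
            centroids = new_centroids
        return centroids, assignments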
@@ -117,12 +117,12 @@ class BucketSampler(BaseSampler):

     def __init__(self, data_set):
         super(BucketSampler, self).__init__(data_set)
-        BUCKETS = ([None] * 10)
+        BUCKETS = ([None] * 20)
         self.length_freq = dict(Counter([len(example) for example in data_set]))
         self.buckets = k_means_bucketing(data_set, BUCKETS)

     def __iter__(self):
-        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets) + 1)]
+        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))]
         np.random.shuffle(bucket_samples)
         return iter(bucket_samples)
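
k_means_bucketing itself is not part of this diff, only its call site. A hedged sketch of the idea, reusing the k_means_1d_sketch above (the real helper's signature and return value may differ):

    def k_means_bucketing_sketch(data_set, buckets):
        """Group example indices into len(buckets) buckets of roughly equal sequence length."""
        k = len(buckets)                        # e.g. [None] * 20 -> 20 buckets
        lengths = [len(example) for example in data_set]
        _, assignments = k_means_1d_sketch(lengths, k)
        grouped = [[] for _ in range(k)]
        for idx, cluster in enumerate(assignments):
            grouped[cluster].append(idx)        # each bucket holds indices of similar-length examples
        return grouped

With buckets built this way, BucketSampler.__iter__ picks one bucket at random and shuffles it, so the indices it yields all point at sequences of similar length and the resulting batches need little padding. The randint fix also matters: randint(0, len(self.buckets) + 1) could return an out-of-range index, while randint(0, len(self.buckets)) cannot.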


@@ -140,10 +140,11 @@ class Batchifier(object):

     def __iter__(self):
         batch = []
-        for idx in self.sampler:
-            batch.append(idx)
-            if len(batch) == self.batch_size:
+        while True:
+            for idx in self.sampler:
+                batch.append(idx)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+            if 0 < len(batch) < self.batch_size and self.drop_last is False:
                 yield batch
-                batch = []
-        if 0 < len(batch) < self.batch_size and self.drop_last is False:
-            yield batch
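
The new while True turns the batch generator into an endless stream: after the sampler is exhausted it simply starts over, so a trainer can keep calling next() across epochs without recreating the Batchifier. A usage sketch under assumed constructor arguments (the diff only shows that self.sampler, self.batch_size and self.drop_last exist):

    # hypothetical wiring; Batchifier(sampler, batch_size, drop_last) is an assumed signature
    batchifier = Batchifier(BucketSampler(train_set), batch_size=32, drop_last=True)
    batch_stream = iter(batchifier)
    for step in range(steps_per_epoch):
        batch_indices = next(batch_stream)   # never raises StopIteration because of the while True
        # fetch the examples for these indices and run one training step

If the sampler is the BucketSampler above, each pass through the inner for loop re-invokes its __iter__ and therefore draws indices from a freshly chosen bucket.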

fastNLP/core/tester.py  (+1 -1)

@@ -174,7 +174,7 @@ class POSTester(BaseTester):
             truth = torch.Tensor(truth)
             if torch.cuda.is_available() and self.use_cuda:
                 truth = truth.cuda()
-            loss = self.model.loss(predict, truth, self.seq_len)
+            loss = self.model.loss(predict, truth, self.seq_len) / self.batch_size
             prediction = self.model.prediction(predict, self.seq_len)
             results = torch.Tensor(prediction).view(-1,)
             if torch.cuda.is_available() and self.use_cuda:
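
Dividing by self.batch_size turns a loss summed over the batch into a per-example average, so validation losses stay comparable when the batch size changes. A toy illustration with made-up numbers:

    # the same per-example loss reported from two different batch sizes
    summed_loss_bs4, summed_loss_bs8 = 9.2, 18.4
    print(summed_loss_bs4 / 4, summed_loss_bs8 / 8)   # 2.3 2.3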


fastNLP/core/trainer.py  (+3 -0)

@@ -101,6 +101,9 @@ class BaseTrainer(Action):
             self.grad_backward(loss)
             self.update()

+            if step % 10 == 0:
+                print("[epoch {} step {}] train loss={:.2f}".format(epoch, step, loss.data))
+
         if self.validate:
             if data_dev is None:
                 raise RuntimeError("No validation data provided.")
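
A small caveat on the added print: loss.data is still a tensor, and loss.item() is the more robust way to hand a plain Python scalar to the {:.2f} format. A hedged variant of the same logging:

    if step % 10 == 0:
        print("[epoch {} step {}] train loss={:.2f}".format(epoch, step, loss.item()))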


fastNLP/loader/dataset_loader.py  (+13 -6)

@@ -30,6 +30,7 @@ class POSDatasetLoader(DatasetLoader):
     and "Hello world !". Each word has its own label from label1
     to label5.
     """
+
     def __init__(self, data_name, data_path):
         super(POSDatasetLoader, self).__init__(data_name, data_path)


@@ -86,7 +87,7 @@ class TokenizeDatasetLoader(DatasetLoader):
     def __init__(self, data_name, data_path):
         super(TokenizeDatasetLoader, self).__init__(data_name, data_path)

-    def load_pku(self, max_seq_len=64):
+    def load_pku(self, max_seq_len=32):
         """
         load pku dataset for Chinese word segmentation
         CWS (Chinese Word Segmentation) pku training dataset format:
@@ -107,12 +108,10 @@ class TokenizeDatasetLoader(DatasetLoader):
            sentences = f.readlines()
        data = []
        for sent in sentences:
-            tokens = sent.strip().split()
            words = []
            labels = []
-            for start in range(len(tokens) // max_seq_len):
-            for token in token_seq:
+            tokens = sent.strip().split()
+            for token in tokens:
                if len(token) == 1:
                    words.append(token)
                    labels.append("S")
@@ -124,7 +123,15 @@ class TokenizeDatasetLoader(DatasetLoader):
                        labels.append("M")
                    words.append(token[-1])
                    labels.append("E")
-            data.append([words, labels])
+            num_samples = len(words) // max_seq_len
+            if len(words) % max_seq_len != 0:
+                num_samples += 1
+            for sample_idx in range(num_samples):
+                start = sample_idx * max_seq_len
+                end = (sample_idx + 1) * max_seq_len
+                seq_words = words[start:end]
+                seq_labels = labels[start:end]
+                data.append([seq_words, seq_labels])
        return data
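
The new tail of load_pku is a hand-rolled ceiling division followed by aligned slicing of words and labels. A quick worked example with made-up sizes (not data from the pku corpus):

    max_seq_len = 32
    words = ["w%d" % i for i in range(70)]     # pretend this sentence has 70 characters
    labels = ["S"] * 70

    num_samples = len(words) // max_seq_len    # 70 // 32 = 2
    if len(words) % max_seq_len != 0:
        num_samples += 1                       # remainder 6 -> 3 samples in total

    chunks = [(words[i * max_seq_len:(i + 1) * max_seq_len],
               labels[i * max_seq_len:(i + 1) * max_seq_len]) for i in range(num_samples)]
    print([len(w) for w, _ in chunks])         # [32, 32, 6]

Unlike the removed range(len(tokens) // max_seq_len) loop, this version keeps the trailing partial chunk instead of silently dropping it.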






fastNLP/loader/preprocess.py  (+1 -0)

@@ -23,6 +23,7 @@ def save_pickle(obj, pickle_path, file_name):
 def load_pickle(pickle_path, file_name):
     with open(os.path.join(pickle_path, file_name), "rb") as f:
         obj = _pickle.load(f)
+    print("{} loaded. ".format(file_name))
     return obj
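
For reference, a hypothetical call (directory and file name are made up), showing the effect of the added print:

    word2id = load_pickle("./save/", "word2id.pkl")   # now also prints "word2id.pkl loaded. "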





