
changes to batch methods

- [action] add k-means bucketing: partition sequences into buckets of nearly equal length
- [trainer] print the training loss every 10 steps
- [loader] make the CWS pku loader split sequences longer than max_seq_len into several shorter sequences
tags/v0.1.0
FengZiYjun 7 years ago
commit 8e3e6d4579
5 changed files with 28 additions and 16 deletions
  1. fastNLP/core/action.py (+10, -9)
  2. fastNLP/core/tester.py (+1, -1)
  3. fastNLP/core/trainer.py (+3, -0)
  4. fastNLP/loader/dataset_loader.py (+13, -6)
  5. fastNLP/loader/preprocess.py (+1, -0)

fastNLP/core/action.py (+10, -9)

@@ -14,7 +14,7 @@ class Action(object):

 def k_means_1d(x, k, max_iter=100):
     """
     Perform k-means on 1-D data.
     :param x: list of int, representing points in 1-D.
     :param k: the number of clusters required.
     :param max_iter: maximum iteration
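The hunk shows only the docstring of k_means_1d, and the k_means_bucketing helper that the new BucketSampler below calls is not visible in this diff at all. As a rough sketch of the technique (the helper body and the assignments-plus-centroids return convention are my assumptions, not necessarily the committed code):

    import numpy as np

    def k_means_1d(x, k, max_iter=100):
        x = np.asarray(x, dtype=float)
        centroids = np.random.choice(x, size=k, replace=False)  # assumes len(x) >= k
        for _ in range(max_iter):
            # assign each point to its nearest centroid
            assign = np.argmin(np.abs(x[:, None] - centroids[None, :]), axis=1)
            new_centroids = np.array([
                x[assign == c].mean() if np.any(assign == c) else centroids[c]
                for c in range(k)
            ])
            if np.allclose(new_centroids, centroids):
                break
            centroids = new_centroids
        return assign, centroids

    def k_means_bucketing(data_set, buckets):
        # group example indices by the cluster of their sequence length
        assign, _ = k_means_1d([len(example) for example in data_set], len(buckets))
        grouped = [[] for _ in buckets]
        for idx, cluster in enumerate(assign):
            grouped[cluster].append(idx)
        return grouped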
@@ -117,12 +117,12 @@ class BucketSampler(BaseSampler):

     def __init__(self, data_set):
         super(BucketSampler, self).__init__(data_set)
-        BUCKETS = ([None] * 10)
+        BUCKETS = ([None] * 20)
         self.length_freq = dict(Counter([len(example) for example in data_set]))
         self.buckets = k_means_bucketing(data_set, BUCKETS)

     def __iter__(self):
-        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets) + 1)]
+        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))]
         np.random.shuffle(bucket_samples)
         return iter(bucket_samples)
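The __iter__ change is an off-by-one fix: np.random.randint(low, high) samples from the half-open interval [low, high), so the old upper bound of len(self.buckets) + 1 could return an index one past the end and raise IndexError. A quick self-contained check (illustrative only):

    import numpy as np

    buckets = [[0, 1], [2, 3], [4, 5]]
    # with high=len(buckets), the draw is always a valid bucket index
    for _ in range(1000):
        idx = np.random.randint(0, len(buckets))
        assert 0 <= idx < len(buckets)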

@@ -140,10 +140,11 @@ class Batchifier(object):

     def __iter__(self):
         batch = []
-        for idx in self.sampler:
-            batch.append(idx)
-            if len(batch) == self.batch_size:
-                yield batch
-                batch = []
-        if 0 < len(batch) < self.batch_size and self.drop_last is False:
-            yield batch
+        while True:
+            for idx in self.sampler:
+                batch.append(idx)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+            if 0 < len(batch) < self.batch_size and self.drop_last is False:
+                yield batch
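With the new while True, Batchifier.__iter__ becomes an endless stream of batches that re-walks the sampler on every pass, so the consumer has to bound iteration itself. A minimal self-contained stand-in for the patched logic (names are mine; I also reset batch after the partial yield so that with drop_last=False indices cannot accumulate across passes):

    import itertools

    def endless_batches(sampler, batch_size, drop_last=False):
        batch = []
        while True:
            for idx in sampler:
                batch.append(idx)
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            if 0 < len(batch) < batch_size and drop_last is False:
                yield batch
                batch = []  # reset so the partial batch is not re-yielded

    # the stream never terminates, so bound it explicitly:
    for step, batch in enumerate(itertools.islice(endless_batches(range(10), 4), 6)):
        print(step, batch)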

fastNLP/core/tester.py (+1, -1)

@@ -174,7 +174,7 @@ class POSTester(BaseTester):
         truth = torch.Tensor(truth)
         if torch.cuda.is_available() and self.use_cuda:
             truth = truth.cuda()
-        loss = self.model.loss(predict, truth, self.seq_len)
+        loss = self.model.loss(predict, truth, self.seq_len) / self.batch_size
         prediction = self.model.prediction(predict, self.seq_len)
         results = torch.Tensor(prediction).view(-1,)
         if torch.cuda.is_available() and self.use_cuda:
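Dividing by self.batch_size suggests model.loss returns a sum over the batch; the division turns it into a per-example average that stays comparable across batch sizes. A hedged illustration with a generic summed loss (not the actual model):

    import torch
    import torch.nn.functional as F

    predict = torch.randn(8, 5)             # stand-in scores: (batch, num_tags)
    truth = torch.randint(0, 5, (8,))
    summed = F.cross_entropy(predict, truth, reduction="sum")
    per_example = summed / predict.size(0)  # same normalization as the patch
    assert torch.allclose(per_example, F.cross_entropy(predict, truth))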


fastNLP/core/trainer.py (+3, -0)

@@ -101,6 +101,9 @@ class BaseTrainer(Action):
             self.grad_backward(loss)
             self.update()

+            if step % 10 == 0:
+                print("[epoch {} step {}] train loss={:.2f}".format(epoch, step, loss.data))
+
         if self.validate:
             if data_dev is None:
                 raise RuntimeError("No validation data provided.")
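One caveat about the new print: whether a tensor like loss.data accepts the {:.2f} format spec depends on the PyTorch version; extracting the Python scalar first is the portable form. A sketch of the same logging with that adjustment (my variation, not part of the commit):

    import torch

    loss = torch.tensor(0.4325)  # stand-in for the 0-dim training loss
    epoch, step = 1, 10
    if step % 10 == 0:
        # .item() returns a plain Python float, which always formats cleanly
        print("[epoch {} step {}] train loss={:.2f}".format(epoch, step, loss.item()))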


fastNLP/loader/dataset_loader.py (+13, -6)

@@ -30,6 +30,7 @@ class POSDatasetLoader(DatasetLoader):
     and "Hello world !". Each word has its own label from label1
     to label5.
     """
+
     def __init__(self, data_name, data_path):
         super(POSDatasetLoader, self).__init__(data_name, data_path)
@@ -86,7 +87,7 @@ class TokenizeDatasetLoader(DatasetLoader):

     def __init__(self, data_name, data_path):
         super(TokenizeDatasetLoader, self).__init__(data_name, data_path)

-    def load_pku(self, max_seq_len=64):
+    def load_pku(self, max_seq_len=32):
         """
         load pku dataset for Chinese word segmentation
         CWS (Chinese Word Segmentation) pku training dataset format:
@@ -107,12 +108,10 @@ class TokenizeDatasetLoader(DatasetLoader):
         sentences = f.readlines()
         data = []
         for sent in sentences:
-            tokens = sent.strip().split()
             words = []
             labels = []
-            for start in range(len(tokens) // max_seq_len):
-                for token in token_seq:
+            tokens = sent.strip().split()
+            for token in tokens:
                 if len(token) == 1:
                     words.append(token)
                     labels.append("S")
@@ -124,7 +123,15 @@ class TokenizeDatasetLoader(DatasetLoader):
labels.append("M")
words.append(token[-1])
labels.append("E")
data.append([words, labels])
num_samples = len(words) // max_seq_len
if len(words) % max_seq_len != 0:
num_samples += 1
for sample_idx in range(num_samples):
start = sample_idx * max_seq_len
end = (sample_idx + 1) * max_seq_len
seq_words = words[start:end]
seq_labels = labels[start:end]
data.append([seq_words, seq_labels])
return data
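The added block is plain ceil division: ceil(len(words) / max_seq_len) slices of at most max_seq_len characters each. A compact equivalent for intuition (illustrative helper, not the committed code):

    def chunk(seq, max_seq_len):
        # range(0, len(seq), max_seq_len) yields the same ceil-division count
        return [seq[i:i + max_seq_len] for i in range(0, len(seq), max_seq_len)]

    print(chunk(list("一二三四五六七"), 3))
    # [['一', '二', '三'], ['四', '五', '六'], ['七']]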

fastNLP/loader/preprocess.py (+1, -0)

@@ -23,6 +23,7 @@ def save_pickle(obj, pickle_path, file_name):

 def load_pickle(pickle_path, file_name):
     with open(os.path.join(pickle_path, file_name), "rb") as f:
         obj = _pickle.load(f)
+    print("{} loaded. ".format(file_name))
     return obj
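For context, a round-trip of the two helpers (save_pickle appears only in the hunk header; its body here is my assumption, mirroring load_pickle):

    import _pickle
    import os

    def save_pickle(obj, pickle_path, file_name):
        # assumed mirror of load_pickle; the diff shows only its signature
        with open(os.path.join(pickle_path, file_name), "wb") as f:
            _pickle.dump(obj, f)

    def load_pickle(pickle_path, file_name):
        with open(os.path.join(pickle_path, file_name), "rb") as f:
            obj = _pickle.load(f)
        print("{} loaded. ".format(file_name))
        return obj

    save_pickle({"vocab_size": 100}, ".", "demo.pkl")
    assert load_pickle(".", "demo.pkl") == {"vocab_size": 100}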
