@@ -30,6 +30,7 @@ class POSDatasetLoader(DatasetLoader):
and "Hello world !". Each word has its own label from label1
to label5.
"""
def __init__(self, data_name, data_path):
    """Construct a POS dataset loader by delegating to the DatasetLoader base."""
    # Explicit two-argument super() keeps the call Python-2 compatible,
    # matching the style used elsewhere in this file.
    super(POSDatasetLoader, self).__init__(data_name, data_path)
@@ -86,7 +87,7 @@ class TokenizeDatasetLoader(DatasetLoader):
def __init__(self, data_name, data_path):
    """Construct a tokenization dataset loader by delegating to the DatasetLoader base."""
    # Explicit two-argument super() keeps the call Python-2 compatible,
    # matching the style used elsewhere in this file.
    super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
def load_pku(self, max_seq_len=64 ):
def load_pku(self, max_seq_len=32 ):
"""
load pku dataset for Chinese word segmentation
CWS (Chinese Word Segmentation) pku training dataset format:
@@ -107,12 +108,10 @@ class TokenizeDatasetLoader(DatasetLoader):
sentences = f.readlines()
data = []
for sent in sentences:
tokens = sent.strip().split()
words = []
labels = []
tokens = sent.strip().split()
for start in range(len(tokens) // max_seq_len):
for token in token_seq:
for token in tokens:
if len(token) == 1:
words.append(token)
labels.append("S")
@@ -124,7 +123,15 @@ class TokenizeDatasetLoader(DatasetLoader):
labels.append("M")
words.append(token[-1])
labels.append("E")
data.append([words, labels])
num_samples = len(words) // max_seq_len
if len(words) % max_seq_len != 0:
num_samples += 1
for sample_idx in range(num_samples):
start = sample_idx * max_seq_len
end = (sample_idx + 1) * max_seq_len
seq_words = words[start:end]
seq_labels = labels[start:end]
data.append([seq_words, seq_labels])
return data