From 9aad8dff6e8aa31a004d23757126106264ce626f Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Mon, 30 Jul 2018 18:08:29 +0800
Subject: [PATCH] update

---
 fastNLP/core/action.py           | 78 ++++++++++++++++++++++++++++++++
 fastNLP/core/trainer.py          |  4 +-
 fastNLP/loader/dataset_loader.py |  9 +++-
 3 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/fastNLP/core/action.py b/fastNLP/core/action.py
index 2bc08b75..ca2a3ba4 100644
--- a/fastNLP/core/action.py
+++ b/fastNLP/core/action.py
@@ -1,3 +1,5 @@
+from collections import Counter
+
 import numpy as np
 
 
@@ -10,6 +12,63 @@ class Action(object):
         super(Action, self).__init__()
 
 
+def k_means_1d(x, k, max_iter=100):
+    """Cluster 1-D points into k groups with plain k-means.
+
+    :param x: list of int, representing points in 1-D.
+    :param k: the number of clusters required.
+    :param max_iter: maximum number of iterations.
+    :return centroids: numpy array, centroids of the k clusters;
+        assignment: numpy array, 1-D, the bucket id assigned to each example.
+    """
+    sorted_x = sorted(list(set(x)))
+    if len(sorted_x) < k:
+        raise ValueError("too few buckets")
+    gap = len(sorted_x) / k
+
+    centroids = np.array([sorted_x[int(x * gap)] for x in range(k)])
+    assign = None
+
+    for i in range(max_iter):
+        # Cluster Assignment step
+        assign = np.array([np.argmin([np.absolute(x_i - x) for x in centroids]) for x_i in x])
+        # Move centroids step
+        new_centroids = np.array([x[assign == k].mean() for k in range(k)])
+        if (new_centroids == centroids).all():
+            centroids = new_centroids
+            break
+        centroids = new_centroids
+    return np.array(centroids), assign
+
+
+def k_means_bucketing(all_inst, buckets):
+    """
+    :param all_inst: 3-level list
+        [
+            [[word_11, word_12, word_13], [label_11, label_12]],  # sample 1
+            [[word_21, word_22, word_23], [label_21, label_22]],  # sample 2
+            ...
+        ]
+    :param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length
+        threshold for that bucket (each threshold is usually None, meaning no limit).
+    :return data: 2-level list
+        [
+            [index_11, index_12, ...],  # bucket 1
+            [index_21, index_22, ...],  # bucket 2
+            ...
+        ]
+    """
+    bucket_data = [[] for _ in buckets]
+    num_buckets = len(buckets)
+    lengths = np.array([len(inst[0]) for inst in all_inst])
+    _, assignments = k_means_1d(lengths, num_buckets)
+
+    for idx, bucket_id in enumerate(assignments):
+        if buckets[bucket_id] is None or lengths[idx] <= buckets[bucket_id]:
+            bucket_data[bucket_id].append(idx)
+    return bucket_data
+
+
 class BaseSampler(object):
     """
         Base class for all samplers.
@@ -49,6 +108,25 @@ class RandomSampler(BaseSampler):
         return iter(np.random.permutation(self.data_set_length))
 
 
+class BucketSampler(BaseSampler):
+    """
+    Partition all samples into multiple buckets, each of which contains sentences of approximately the same length.
+    In sampling, first randomly choose a bucket, then sample data from it.
+    Bucket boundaries are found by 1-D k-means over sentence lengths; the number of buckets is currently fixed at 10.
+    """
+
+    def __init__(self, data_set):
+        super(BucketSampler, self).__init__(data_set)
+        BUCKETS = ([None] * 10)
+        self.length_freq = dict(Counter([len(example) for example in data_set]))
+        self.buckets = k_means_bucketing(data_set, BUCKETS)
+
+    def __iter__(self):
+        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))]
+        np.random.shuffle(bucket_samples)
+        return iter(bucket_samples)
+
+
 class Batchifier(object):
     """
         Wrap random or sequential sampler to generate a mini-batch.
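The new sampler works in three steps: k_means_1d clusters sentence lengths in one dimension, k_means_bucketing turns the cluster assignments into buckets of example indices, and BucketSampler shuffles and yields the indices of one randomly chosen bucket. The standalone sketch below reproduces that flow with plain NumPy so it can be tried outside fastNLP; names such as make_buckets and toy_lengths are illustrative only, and the vectorised k-means is an approximation of k_means_1d rather than a line-for-line copy.

import numpy as np

def make_buckets(lengths, num_buckets=10, max_iter=100):
    """Group example indices into num_buckets buckets via 1-D k-means over lengths."""
    lengths = np.asarray(lengths)
    uniq = np.array(sorted(set(lengths.tolist())))
    if len(uniq) < num_buckets:
        raise ValueError("fewer distinct lengths than buckets")
    # initialise centroids at evenly spaced distinct lengths, mirroring k_means_1d
    step = len(uniq) / num_buckets
    centroids = uniq[(np.arange(num_buckets) * step).astype(int)].astype(float)
    assign = np.zeros(len(lengths), dtype=int)
    for _ in range(max_iter):
        # assignment step: nearest centroid for every length
        assign = np.argmin(np.abs(lengths[:, None] - centroids[None, :]), axis=1)
        # update step: move each centroid to the mean of its members (empty clusters stay put)
        new_centroids = np.array([lengths[assign == c].mean() if np.any(assign == c)
                                  else centroids[c] for c in range(num_buckets)])
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return [np.where(assign == c)[0] for c in range(num_buckets)]

rng = np.random.RandomState(0)
toy_lengths = rng.randint(5, 60, size=200)      # stand-in for sentence lengths
buckets = make_buckets(toy_lengths, num_buckets=10)
bucket = buckets[rng.randint(0, len(buckets))]  # np.random.randint excludes the upper bound
batch = rng.permutation(bucket)[:16]            # one mini-batch of similar-length examples
print(toy_lengths[batch])

Because every index drawn from one bucket corresponds to a sentence of similar length, mini-batches built this way need much less padding than batches sampled uniformly from the whole dataset.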
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 996fe0e6..1e7171ad 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn as nn
 
 from fastNLP.core.action import Action
-from fastNLP.core.action import RandomSampler, Batchifier
+from fastNLP.core.action import RandomSampler, Batchifier, BucketSampler
 from fastNLP.core.tester import POSTester
 from fastNLP.saver.model_saver import ModelSaver
 
@@ -89,7 +89,7 @@ class BaseTrainer(Action):
 
             # turn on network training mode; define optimizer; prepare batch iterator
             self.mode(test=False)
-            self.iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=True))
+            self.iterator = iter(Batchifier(BucketSampler(data_train), self.batch_size, drop_last=True))
 
             # training iterations in one epoch
             for step in range(iterations):
diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index 88ff151d..99073c5a 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -86,7 +86,7 @@ class TokenizeDatasetLoader(DatasetLoader):
     def __init__(self, data_name, data_path):
         super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
 
-    def load_pku(self):
+    def load_pku(self, max_seq_len=64):
         """
         load pku dataset for Chinese word segmentation
         CWS (Chinese Word Segmentation) pku training dataset format:
@@ -98,8 +98,11 @@ class TokenizeDatasetLoader(DatasetLoader):
             E: ending of a word
             S: single character
 
+        :param max_seq_len: int, the maximum length of a sequence. If a sequence is longer than that, it is split into
+            several sequences.
         :return: three-level lists
         """
+        assert isinstance(max_seq_len, int) and max_seq_len > 0
         with open(self.data_path, "r", encoding="utf-8") as f:
             sentences = f.readlines()
         data = []
@@ -107,7 +110,9 @@ class TokenizeDatasetLoader(DatasetLoader):
             words = []
             labels = []
             tokens = sent.strip().split()
-            for token in tokens:
+            for start in range(len(tokens) // max_seq_len):
+                token_seq = tokens[start * max_seq_len:(start + 1) * max_seq_len]
+                for token in token_seq:
                 if len(token) == 1:
                     words.append(token)
                     labels.append("S")
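load_pku now caps training sequences at max_seq_len tokens, splitting long sentences into several examples before tagging characters with the B/M/E/S labels described in the docstring; the hunk above is cut off before the rewritten loop finishes. As a reference for the splitting idea only, here is a self-contained sketch. split_into_chunks, bmes_labels and the keep_tail flag are names and additions of this sketch, not fastNLP code; keep_tail exists to show that range(len(tokens) // max_seq_len), as used in the patch, skips a trailing partial chunk.

def split_into_chunks(tokens, max_seq_len=64, keep_tail=False):
    """Split a list of words into consecutive chunks of at most max_seq_len words."""
    assert isinstance(max_seq_len, int) and max_seq_len > 0
    num_full = len(tokens) // max_seq_len
    chunks = [tokens[i * max_seq_len:(i + 1) * max_seq_len] for i in range(num_full)]
    if keep_tail and len(tokens) % max_seq_len:
        # the remainder that range(len(tokens) // max_seq_len) alone would drop
        chunks.append(tokens[num_full * max_seq_len:])
    return chunks


def bmes_labels(word):
    """Per-character B/M/E/S labels for one space-separated word (CWS convention)."""
    if len(word) == 1:
        return ["S"]
    return ["B"] + ["M"] * (len(word) - 2) + ["E"]


if __name__ == "__main__":
    # illustrative PKU-style input: words are space-separated, labels are per character
    sent = "迈向 充满 希望 的 新 世纪".split()
    for chunk in split_into_chunks(sent, max_seq_len=4, keep_tail=True):
        chars = [ch for word in chunk for ch in word]
        labels = [lab for word in chunk for lab in bmes_labels(word)]
        print(chars, labels)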