diff --git a/fastNLP/core/action.py b/fastNLP/core/action.py
index ca2a3ba4..17680216 100644
--- a/fastNLP/core/action.py
+++ b/fastNLP/core/action.py
@@ -14,7 +14,7 @@ class Action(object):
 
 def k_means_1d(x, k, max_iter=100):
     """
-
+    Perform k-means on 1-D data.
     :param x: list of int, representing points in 1-D.
     :param k: the number of clusters required.
     :param max_iter: maximum iteration
@@ -117,12 +117,12 @@ class BucketSampler(BaseSampler):
 
     def __init__(self, data_set):
         super(BucketSampler, self).__init__(data_set)
-        BUCKETS = ([None] * 10)
+        BUCKETS = ([None] * 20)
         self.length_freq = dict(Counter([len(example) for example in data_set]))
         self.buckets = k_means_bucketing(data_set, BUCKETS)
 
     def __iter__(self):
-        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets) + 1)]
+        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))]
         np.random.shuffle(bucket_samples)
         return iter(bucket_samples)
 
@@ -140,10 +140,11 @@ class Batchifier(object):
 
     def __iter__(self):
         batch = []
-        for idx in self.sampler:
-            batch.append(idx)
-            if len(batch) == self.batch_size:
+        while True:
+            for idx in self.sampler:
+                batch.append(idx)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+            if 0 < len(batch) < self.batch_size and self.drop_last is False:
                 yield batch
-                batch = []
-        if 0 < len(batch) < self.batch_size and self.drop_last is False:
-            yield batch
diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py
index bbd98581..b22f32ef 100644
--- a/fastNLP/core/tester.py
+++ b/fastNLP/core/tester.py
@@ -174,7 +174,7 @@ class POSTester(BaseTester):
         truth = torch.Tensor(truth)
         if torch.cuda.is_available() and self.use_cuda:
             truth = truth.cuda()
-        loss = self.model.loss(predict, truth, self.seq_len)
+        loss = self.model.loss(predict, truth, self.seq_len) / self.batch_size
         prediction = self.model.prediction(predict, self.seq_len)
         results = torch.Tensor(prediction).view(-1,)
         if torch.cuda.is_available() and self.use_cuda:
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 1e7171ad..3da6b061 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -101,6 +101,9 @@ class BaseTrainer(Action):
                 self.grad_backward(loss)
                 self.update()
 
+                if step % 10 == 0:
+                    print("[epoch {} step {}] train loss={:.2f}".format(epoch, step, loss.data))
+
             if self.validate:
                 if data_dev is None:
                     raise RuntimeError("No validation data provided.")
diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index 99073c5a..13a96030 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -30,6 +30,7 @@ class POSDatasetLoader(DatasetLoader):
     and "Hello world !". Each word has its own label from label1 to label5.
     """
+
     def __init__(self, data_name, data_path):
         super(POSDatasetLoader, self).__init__(data_name, data_path)
 
@@ -86,7 +87,7 @@ class TokenizeDatasetLoader(DatasetLoader):
     def __init__(self, data_name, data_path):
         super(TokenizeDatasetLoader, self).__init__(data_name, data_path)
 
-    def load_pku(self, max_seq_len=64):
+    def load_pku(self, max_seq_len=32):
         """
         load pku dataset for Chinese word segmentation
         CWS (Chinese Word Segmentation) pku training dataset format:
@@ -107,12 +108,10 @@ class TokenizeDatasetLoader(DatasetLoader):
         sentences = f.readlines()
         data = []
         for sent in sentences:
+            tokens = sent.strip().split()
             words = []
             labels = []
-            tokens = sent.strip().split()
-            for start in range(len(tokens) // max_seq_len):
-
-            for token in token_seq:
+            for token in tokens:
                 if len(token) == 1:
                     words.append(token)
                     labels.append("S")
@@ -124,7 +123,15 @@ class TokenizeDatasetLoader(DatasetLoader):
                         labels.append("M")
                     words.append(token[-1])
                     labels.append("E")
-            data.append([words, labels])
+            num_samples = len(words) // max_seq_len
+            if len(words) % max_seq_len != 0:
+                num_samples += 1
+            for sample_idx in range(num_samples):
+                start = sample_idx * max_seq_len
+                end = (sample_idx + 1) * max_seq_len
+                seq_words = words[start:end]
+                seq_labels = labels[start:end]
+                data.append([seq_words, seq_labels])
         return data
diff --git a/fastNLP/loader/preprocess.py b/fastNLP/loader/preprocess.py
index 40003c73..fd378ba0 100644
--- a/fastNLP/loader/preprocess.py
+++ b/fastNLP/loader/preprocess.py
@@ -23,6 +23,7 @@ def save_pickle(obj, pickle_path, file_name):
 def load_pickle(pickle_path, file_name):
     with open(os.path.join(pickle_path, file_name), "rb") as f:
         obj = _pickle.load(f)
+    print("{} loaded. ".format(file_name))
     return obj