- [action] add k-means bucketing: partition sequences into buckets of nearly the same length
- [trainer] print train loss every 10 steps
- [loader] CWS pku loader: split sequences longer than max_seq_len into several shorter sequences
@@ -14,7 +14,7 @@ class Action(object):
 def k_means_1d(x, k, max_iter=100):
     """
+    Perform k-means on 1-D data.
     :param x: list of int, representing points in 1-D.
     :param k: the number of clusters required.
     :param max_iter: maximum iteration
@@ -117,12 +117,12 @@ class BucketSampler(BaseSampler):
     def __init__(self, data_set):
         super(BucketSampler, self).__init__(data_set)
-        BUCKETS = ([None] * 10)
+        BUCKETS = ([None] * 20)
         self.length_freq = dict(Counter([len(example) for example in data_set]))
         self.buckets = k_means_bucketing(data_set, BUCKETS)

     def __iter__(self):
-        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets) + 1)]
+        bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))]
         np.random.shuffle(bucket_samples)
         return iter(bucket_samples)
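`k_means_bucketing` itself is not shown in this diff either. A plausible sketch, assuming `BUCKETS` is a list whose length gives the number of buckets and whose entries are optional length caps (`None` meaning no cap), and reusing the `k_means_1d_sketch` helper above:

```python
def k_means_bucketing_sketch(data_set, buckets):
    """Group example indices into len(buckets) buckets of similar length.

    Sketch under the stated assumptions, not the repository's implementation.
    """
    bucket_data = [[] for _ in buckets]
    lengths = [len(example) for example in data_set]
    # Cluster the sequence lengths so each cluster holds near-equal lengths.
    _, assignments = k_means_1d_sketch(lengths, len(buckets))
    for idx, bucket_id in enumerate(assignments):
        cap = buckets[bucket_id]
        if cap is None or lengths[idx] <= cap:
            bucket_data[bucket_id].append(idx)
    return bucket_data
```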
@@ -140,10 +140,11 @@ class Batchifier(object):
     def __iter__(self):
         batch = []
-        for idx in self.sampler:
-            batch.append(idx)
-            if len(batch) == self.batch_size:
+        while True:
+            for idx in self.sampler:
+                batch.append(idx)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+            if 0 < len(batch) < self.batch_size and self.drop_last is False:
                 yield batch
-                batch = []
-        if 0 < len(batch) < self.batch_size and self.drop_last is False:
-            yield batch
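With the `while True` wrapper, iterating over a `Batchifier` never terminates on its own; the caller decides how many batches to draw. A hypothetical usage sketch follows, where the constructor arguments are assumptions inferred from the attribute names in the diff (they are not shown in it):

```python
from itertools import islice

# Hypothetical names and constructor arguments, for illustration only.
sampler = BucketSampler(train_set)
batchifier = Batchifier(sampler, batch_size=32, drop_last=False)

# The iterator is endless, so the caller bounds it, e.g. to 1000 training steps.
for step, batch_indices in enumerate(islice(iter(batchifier), 1000)):
    batch = [train_set[i] for i in batch_indices]
    # ... forward / backward pass on `batch` ...
```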
@@ -174,7 +174,7 @@ class POSTester(BaseTester):
         truth = torch.Tensor(truth)
         if torch.cuda.is_available() and self.use_cuda:
             truth = truth.cuda()
-        loss = self.model.loss(predict, truth, self.seq_len)
+        loss = self.model.loss(predict, truth, self.seq_len) / self.batch_size
         prediction = self.model.prediction(predict, self.seq_len)
         results = torch.Tensor(prediction).view(-1,)
         if torch.cuda.is_available() and self.use_cuda:
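Dividing by the batch size turns what is presumably a batch-summed loss into a per-sample average, so reported evaluation losses stay comparable when the batch size changes (an assumption about `model.loss`; its reduction is not visible in this diff). A toy illustration:

```python
import torch

per_sample = torch.tensor([2.0, 4.0, 6.0, 8.0])   # losses of 4 samples
batch_size = per_sample.numel()

summed = per_sample.sum()     # 20.0, scales with the batch size
mean = summed / batch_size    # 5.0, independent of the batch size
print(summed.item(), mean.item())
```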
@@ -101,6 +101,9 @@ class BaseTrainer(Action):
             self.grad_backward(loss)
             self.update()
+
+            if step % 10 == 0:
+                print("[epoch {} step {}] train loss={:.2f}".format(epoch, step, loss.data))

         if self.validate:
             if data_dev is None:
                 raise RuntimeError("No validation data provided.")
@@ -30,6 +30,7 @@ class POSDatasetLoader(DatasetLoader):
     and "Hello world !". Each word has its own label from label1
     to label5.
     """
+
     def __init__(self, data_name, data_path):
         super(POSDatasetLoader, self).__init__(data_name, data_path)
@@ -86,7 +87,7 @@ class TokenizeDatasetLoader(DatasetLoader):
     def __init__(self, data_name, data_path):
         super(TokenizeDatasetLoader, self).__init__(data_name, data_path)

-    def load_pku(self, max_seq_len=64):
+    def load_pku(self, max_seq_len=32):
         """
         load pku dataset for Chinese word segmentation
         CWS (Chinese Word Segmentation) pku training dataset format:
@@ -107,12 +108,10 @@ class TokenizeDatasetLoader(DatasetLoader):
             sentences = f.readlines()
         data = []
         for sent in sentences:
+            tokens = sent.strip().split()
             words = []
             labels = []
-            tokens = sent.strip().split()
-            for start in range(len(tokens) // max_seq_len):
-                for token in token_seq:
+            for token in tokens:
                 if len(token) == 1:
                     words.append(token)
                     labels.append("S")
@@ -124,7 +123,15 @@ class TokenizeDatasetLoader(DatasetLoader):
                         labels.append("M")
                     words.append(token[-1])
                     labels.append("E")
-            data.append([words, labels])
+            num_samples = len(words) // max_seq_len
+            if len(words) % max_seq_len != 0:
+                num_samples += 1
+            for sample_idx in range(num_samples):
+                start = sample_idx * max_seq_len
+                end = (sample_idx + 1) * max_seq_len
+                seq_words = words[start:end]
+                seq_labels = labels[start:end]
+                data.append([seq_words, seq_labels])
         return data
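A toy run of the splitting added above: a sentence whose character/label sequence is longer than max_seq_len is cut into consecutive chunks, the last of which may be shorter. The data here is made up for the example.

```python
words = list("abcdefg")                  # 7 characters
labels = ["B", "M", "E", "S", "B", "E", "S"]
max_seq_len = 3

num_samples = len(words) // max_seq_len
if len(words) % max_seq_len != 0:
    num_samples += 1                     # 7 // 3 = 2, remainder 1 -> 3 samples

data = []
for sample_idx in range(num_samples):
    start = sample_idx * max_seq_len
    end = (sample_idx + 1) * max_seq_len
    data.append([words[start:end], labels[start:end]])

# data == [[['a', 'b', 'c'], ['B', 'M', 'E']],
#          [['d', 'e', 'f'], ['S', 'B', 'E']],
#          [['g'], ['S']]]
```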
@@ -23,6 +23,7 @@ def save_pickle(obj, pickle_path, file_name):
 def load_pickle(pickle_path, file_name):
     with open(os.path.join(pickle_path, file_name), "rb") as f:
         obj = _pickle.load(f)
+    print("{} loaded. ".format(file_name))
     return obj
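A hypothetical round trip through these helpers, assuming `save_pickle` (whose signature appears in the hunk header) writes to `os.path.join(pickle_path, file_name)`; the directory, file name, and object below are made up for the example.

```python
import os

word2id = {"<pad>": 0, "<unk>": 1, "hello": 2}    # illustrative object
os.makedirs("./save/", exist_ok=True)

save_pickle(word2id, "./save/", "word2id.pkl")
restored = load_pickle("./save/", "word2id.pkl")  # prints "word2id.pkl loaded. "
assert restored == word2id
```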