diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py
index 483f6dc1..5dfd889b 100644
--- a/fastNLP/core/callback.py
+++ b/fastNLP/core/callback.py
@@ -548,7 +548,7 @@ class LRScheduler(Callback):
         else:
             raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.")
 
-    def on_epoch_begin(self):
+    def on_epoch_end(self):
         self.scheduler.step(self.epoch)
 
diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py
index 4cd1ad9c..b7df9dec 100644
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -801,17 +801,19 @@ class DataSet(object):
         else:
             return DataSet()
 
-    def split(self, ratio):
+    def split(self, ratio, shuffle=True):
         """
         Split the DataSet by the given ratio and return two DataSets.
 
         :param float ratio: 0
diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py
--- a/fastNLP/modules/encoder/lstm.py
+++ b/fastNLP/modules/encoder/lstm.py
             output, hx = self.lstm(x, hx)  # -> [N,L,C]
-            output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first)
+            output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first, total_length=max_len)
             _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
             if self.batch_first:
                 output = output[unsort_idx]
             else:
                 output = output[:, unsort_idx]
-            # Workaround for LSTM not working under DataParallel: https://github.com/pytorch/pytorch/issues/1591
-            if self.batch_first:
-                if output.size(1) < max_len:
-                    dummy_tensor = output.new_zeros(batch_size, max_len - output.size(1), output.size(-1))
-                    output = torch.cat([output, dummy_tensor], 1)
-            else:
-                if output.size(0) < max_len:
-                    dummy_tensor = output.new_zeros(max_len - output.size(1), batch_size, output.size(-1))
-                    output = torch.cat([output, dummy_tensor], 0)
         else:
             output, hx = self.lstm(x, hx)
         return output, hx
 
diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py
index c87f3a68..3c6a3d27 100644
--- a/fastNLP/modules/utils.py
+++ b/fastNLP/modules/utils.py
@@ -82,6 +82,8 @@ def get_embeddings(init_embed):
     if isinstance(init_embed, tuple):
         res = nn.Embedding(
             num_embeddings=init_embed[0], embedding_dim=init_embed[1])
+        nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)),
+                         b=np.sqrt(3/res.weight.data.size(1)))
     elif isinstance(init_embed, nn.Module):
         res = init_embed
     elif isinstance(init_embed, torch.Tensor):
 
diff --git a/setup.py b/setup.py
index 49646761..0dbef455 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ with open('requirements.txt', encoding='utf-8') as f:
 setup(
     name='FastNLP',
-    version='0.4.0',
+    version='dev0.5.0',
     description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
     long_description=readme,
     long_description_content_type='text/markdown',
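
Note on the callback.py hunk: calling scheduler.step(self.epoch) in on_epoch_begin decays the learning rate before that epoch has trained, so the first epoch never runs at the initial rate; stepping in on_epoch_end applies each decay only after an epoch completes. A minimal sketch of that ordering in plain PyTorch (not fastNLP's Trainer loop):

# Minimal sketch of the epoch-end ordering; StepLR here halves the
# learning rate after every completed epoch.
import torch
from torch.optim.lr_scheduler import StepLR

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

for epoch in range(3):
    # ... one epoch of training at the current rate ...
    print(epoch, optimizer.param_groups[0]["lr"])  # 0.1, 0.05, 0.025
    scheduler.step()  # step after the epoch, as on_epoch_end now does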
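
The dataset.py hunk is truncated after the signature change, so the new method body is not shown above. Purely as an illustration of what a shuffle-aware ratio split does, here is a hypothetical sketch (not fastNLP's implementation; the list-based `split` below is illustration only):

# Hypothetical sketch of a ratio split with optional shuffling.
import random

def split(items, ratio, shuffle=True):
    assert 0 < ratio < 1, "ratio must lie in (0, 1)"
    indices = list(range(len(items)))
    if shuffle:
        random.shuffle(indices)  # shuffle=False keeps the original order
    cut = int(len(items) * ratio)
    first = [items[i] for i in indices[:cut]]
    second = [items[i] for i in indices[cut:]]
    return first, second

dev, train = split(list(range(100)), ratio=0.1)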
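
The lstm.py hunk replaces the manual zero-padding workaround with pad_packed_sequence's total_length argument, which addresses the same DataParallel problem (pytorch/pytorch#1591) directly: each replica sees only its shard of the batch, so without a fixed total length the unpacked outputs can disagree in the time dimension and fail to gather. A small self-contained demonstration, assuming batch_first=True:

# The longest sequence in this shard is 7, but the padded batch is 10 wide,
# so unpacking must be pinned to max_len to keep replicas shape-compatible.
import torch
from torch.nn.utils import rnn

max_len = 10
x = torch.randn(4, max_len, 8)         # [N, L, C], e.g. one replica's shard
seq_len = torch.tensor([7, 5, 3, 2])   # real lengths, sorted descending

packed = rnn.pack_padded_sequence(x, seq_len, batch_first=True)
output, _ = torch.nn.LSTM(8, 16, batch_first=True)(packed)

trimmed, _ = rnn.pad_packed_sequence(output, batch_first=True)
full, _ = rnn.pad_packed_sequence(output, batch_first=True, total_length=max_len)
print(trimmed.shape)  # torch.Size([4, 7, 16])  -- trimmed to longest sequence
print(full.shape)     # torch.Size([4, 10, 16]) -- fixed, gatherable width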
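
On the utils.py hunk: a uniform distribution on [-a, a] has variance a^2/3, so the bound a = sqrt(3/embedding_dim) gives each component variance 1/dim, and each embedding vector an expected squared norm of about 1. A quick numerical check of that arithmetic:

# Verifies the variance behind the new init: with a = sqrt(3/dim),
# per-component variance is 1/dim and E[||vector||^2] is about 1.
import numpy as np
import torch

dim = 256
weight = torch.empty(50000, dim)
a = np.sqrt(3 / dim)
torch.nn.init.uniform_(weight, a=-a, b=a)

print(weight.var().item())                     # ~ 1/256 ≈ 0.0039
print(weight.pow(2).sum(dim=1).mean().item())  # ~ 1.0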