diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py
index 1003c824..dfe35f77 100644
--- a/fastNLP/core/__init__.py
+++ b/fastNLP/core/__init__.py
@@ -2,10 +2,10 @@ from .batch import Batch
 from .dataset import DataSet
 from .fieldarray import FieldArray
 from .instance import Instance
+from .losses import Loss
 from .metrics import Evaluator, ClassifyEvaluator, SNLIEvaluator, SeqLabelEvaluator
+from .optimizer import Optimizer
 from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler
 from .tester import Tester
 from .trainer import Trainer
 from .vocabulary import Vocabulary
-from .optimizer import Optimizer
-from .loss import Loss
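The `__init__.py` hunk reorders the package exports alphabetically and points `Loss` at the new `losses` module, so code importing from `fastNLP.core` keeps working unchanged. A minimal sanity check (a sketch, assuming this revision is installed):

```python
# Both names still resolve at package level; only their source modules moved.
from fastNLP.core import Loss, Optimizer

loss_func = Loss("cross_entropy")  # now backed by fastNLP/core/losses.py
```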
diff --git a/fastNLP/core/loss.py b/fastNLP/core/loss.py
deleted file mode 100644
index 093b3b96..00000000
--- a/fastNLP/core/loss.py
+++ /dev/null
@@ -1,196 +0,0 @@
-import torch
-
-def squash(predict , truth , **kwargs):
-    '''To reshape tensors in order to fit Loss functions in pytorch
-
-    :param predict : Tensor, model output
-    :param truth : Tensor, truth from dataset
-    :param **kwargs : extra arguments
-
-    :return predict , truth: predict & truth after processing
-    '''
-    return predict.view(-1 , predict.size()[-1]) , truth.view(-1,)
-
-def unpad(predict , truth , **kwargs):
-    '''To process padded sequence output to get true loss
-    Using pack_padded_sequence() method
-    This method contains squash()
-
-    :param predict : Tensor, [batch_size , max_len , tag_size]
-    :param truth : Tensor, [batch_size , max_len]
-    :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist
-        kwargs["lens"] : list or LongTensor, [batch_size]
-        the i-th element is true lengths of i-th sequence
-
-    :return predict , truth: predict & truth after processing
-    '''
-    if kwargs.get("lens") is None:
-        return predict , truth
-    lens = torch.LongTensor(kwargs["lens"])
-    lens , idx = torch.sort(lens , descending = True)
-    predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx] , lens , batch_first = True).data
-    truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx] , lens , batch_first = True).data
-    return predict , truth
-
-def unpad_mask(predict , truth , **kwargs):
-    '''To process padded sequence output to get true loss
-    Using mask() method
-    This method contains squash()
-
-    :param predict : Tensor, [batch_size , max_len , tag_size]
-    :param truth : Tensor, [batch_size , max_len]
-    :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist
-        kwargs["lens"] : list or LongTensor, [batch_size]
-        the i-th element is true lengths of i-th sequence
-
-    :return predict , truth: predict & truth after processing
-    '''
-    if kwargs.get("lens") is None:
-        return predict , truth
-    mas = make_mask(kwargs["lens"] , truth.size()[1])
-    return mask(predict , truth , mask = mas)
-
-def mask(predict , truth , **kwargs):
-    '''To select specific elements from Tensor
-    This method contains squash()
-
-    :param predict : Tensor, [batch_size , max_len , tag_size]
-    :param truth : Tensor, [batch_size , max_len]
-    :param **kwargs : extra arguments, kwargs["mask"] is expected to be exsist
-        kwargs["mask"] : ByteTensor, [batch_size , max_len]
-        the mask Tensor , the position that is 1 will be selected
-
-    :return predict , truth: predict & truth after processing
-    '''
-    if kwargs.get("mask") is None:
-        return predict , truth
-    mask = kwargs["mask"]
-
-    predict , truth = squash(predict , truth)
-    mask = mask.view(-1,)
-
-    predict = torch.masked_select(predict.permute(1,0) , mask).view(predict.size()[-1] , -1).permute(1,0)
-    truth = torch.masked_select(truth , mask)
-
-    return predict , truth
-
-def make_mask(lens , tar_len):
-    '''to generate a mask that select [:lens[i]] for i-th element
-    embezzle from fastNLP.models.sequence_modeling.seq_mask
-
-    :param lens : list or LongTensor, [batch_size]
-    :param tar_len : int
-
-    :return mask : ByteTensor
-    '''
-    lens = torch.LongTensor(lens)
-    mask = [torch.ge(lens, i + 1) for i in range(tar_len)]
-    mask = torch.stack(mask, 1)
-    return mask
-
-#map string to function. Just for more elegant using
-method_dict = {
-    "squash" : squash,
-    "unpad" : unpad,
-    "unpad_mask" : unpad_mask,
-    "mask" : mask,
-}
-
-loss_function_name = {
-    "L1Loss".lower() : torch.nn.L1Loss,
-    "BCELoss".lower() : torch.nn.BCELoss,
-    "MSELoss".lower() : torch.nn.MSELoss,
-    "NLLLoss".lower() : torch.nn.NLLLoss,
-    "KLDivLoss".lower() : torch.nn.KLDivLoss,
-    "NLLLoss2dLoss".lower() : torch.nn.NLLLoss2d, #every name should end with "loss"
-    "SmoothL1Loss".lower() : torch.nn.SmoothL1Loss,
-    "SoftMarginLoss".lower() : torch.nn.SoftMarginLoss,
-    "PoissonNLLLoss".lower() : torch.nn.PoissonNLLLoss,
-    "MultiMarginLoss".lower() : torch.nn.MultiMarginLoss,
-    "CrossEntropyLoss".lower() : torch.nn.CrossEntropyLoss,
-    "BCEWithLogitsLoss".lower() : torch.nn.BCEWithLogitsLoss,
-    "MarginRankingLoss".lower() : torch.nn.MarginRankingLoss,
-    "TripletMarginLoss".lower() : torch.nn.TripletMarginLoss,
-    "HingeEmbeddingLoss".lower() : torch.nn.HingeEmbeddingLoss,
-    "CosineEmbeddingLoss".lower() : torch.nn.CosineEmbeddingLoss,
-    "MultiLabelMarginLoss".lower() : torch.nn.MultiLabelMarginLoss,
-    "MultiLabelSoftMarginLoss".lower() : torch.nn.MultiLabelSoftMarginLoss,
-}
-
-class Loss(object):
-    '''a Loss object is a callable object represents loss functions
-    '''
-
-    def __init__(self , loss_name , pre_pro = [squash], **kwargs):
-        '''
-
-        :param loss_name: str or None , the name of loss function
-        :param pre_pro : list of function or str, methods to reform parameters before calculating loss
-            the strings will be auto translated to pre-defined functions
-        :param **kwargs: kwargs for torch loss function
-
-        pre_pro funcsions should have three arguments: predict, truth, **arg
-        predict and truth is the necessary parameters in loss function
-        kwargs is the extra parameters passed-in when calling loss function
-        pre_pro functions should return two objects, respectively predict and truth that after processed
-
-        '''
-
-        if loss_name is None:
-            # this is useful when Trainer.__init__ performs type check
-            self._loss = None
-        else:
-            if not isinstance(loss_name, str):
-                raise NotImplementedError
-            else:
-                self._loss = self._get_loss(loss_name , **kwargs)
-
-        self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro]
-
-    def add_pre_pro(self , func):
-        '''add a pre_pro function
-
-        :param func: a function or str, methods to reform parameters before calculating loss
-            the strings will be auto translated to pre-defined functions
-        '''
-        if not callable(func):
-            func = method_dict.get(func)
-            if func is None:
-                return
-        self.pre_pro.append(func)
-
-    @staticmethod
-    def _get_loss(loss_name , **kwargs):
-        '''Get loss function from torch
-
-        :param loss_name: str, the name of loss function
-        :param **kwargs: kwargs for torch loss function
-        :return: A callable loss function object
-        '''
-        loss_name = loss_name.strip().lower()
-        loss_name = "".join(loss_name.split("_"))
-
-        if len(loss_name) < 4 or loss_name[-4 : ] != "loss":
-            loss_name += "loss"
-        return loss_function_name[loss_name](**kwargs)
-
-    def get(self):
-        '''This method exists just for make some existing codes run error-freely
-        '''
-        return self
-
-    def __call__(self , predict , truth , **kwargs):
-        '''call a loss function
-        predict and truth will be processed by pre_pro methods in order of addition
-
-        :param predict : Tensor, model output
-        :param truth : Tensor, truth from dataset
-        :param **kwargs : extra arguments, pass to pre_pro functions
-            for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens
-        '''
-        for f in self.pre_pro:
-            if f is None:
-                continue
-            predict , truth = f(predict , truth , **kwargs)
-
-        return self._loss(predict , truth)
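The deleted helpers above reappear verbatim (reformatted to PEP 8) in `losses.py` below. The piece whose behavior is easiest to misread is `make_mask`, which builds the mask column by column with `torch.ge`. A small worked example of its semantics (a sketch derived from the code, not a new API):

```python
import torch

# make_mask(lens=[2, 3], tar_len=3): row i keeps positions [:lens[i]].
lens = torch.LongTensor([2, 3])
mask = torch.stack([torch.ge(lens, i + 1) for i in range(3)], 1)
print(mask)  # [[1, 1, 0], [1, 1, 1]] -- one row per sequence, 1 = keep
```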
diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py
new file mode 100644
index 00000000..1e5a4914
--- /dev/null
+++ b/fastNLP/core/losses.py
@@ -0,0 +1,219 @@
+import torch
+
+
+class LossBase(object):
+    def __init__(self):
+        self.param_map = {}
+
+    def get_loss(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def __call__(self, output_dict, predict_dict):
+        pass
+
+
+class Loss(LossBase):
+    def __init__(self):
+        pass
+
+
+def squash(predict, truth, **kwargs):
+    '''To reshape tensors in order to fit Loss functions in pytorch
+
+    :param predict : Tensor, model output
+    :param truth : Tensor, truth from dataset
+    :param **kwargs : extra arguments
+
+    :return predict , truth: predict & truth after processing
+    '''
+    return predict.view(-1, predict.size()[-1]), truth.view(-1, )
+
+
+def unpad(predict, truth, **kwargs):
+    '''To process padded sequence output to get true loss
+    Using pack_padded_sequence() method
+    This method contains squash()
+
+    :param predict : Tensor, [batch_size , max_len , tag_size]
+    :param truth : Tensor, [batch_size , max_len]
+    :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist
+        kwargs["lens"] : list or LongTensor, [batch_size]
+        the i-th element is true lengths of i-th sequence
+
+    :return predict , truth: predict & truth after processing
+    '''
+    if kwargs.get("lens") is None:
+        return predict, truth
+    lens = torch.LongTensor(kwargs["lens"])
+    lens, idx = torch.sort(lens, descending=True)
+    predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx], lens, batch_first=True).data
+    truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx], lens, batch_first=True).data
+    return predict, truth
+
+
+def unpad_mask(predict, truth, **kwargs):
+    '''To process padded sequence output to get true loss
+    Using mask() method
+    This method contains squash()
+
+    :param predict : Tensor, [batch_size , max_len , tag_size]
+    :param truth : Tensor, [batch_size , max_len]
+    :param **kwargs : extra arguments, kwargs["lens"] is expected to be exsist
+        kwargs["lens"] : list or LongTensor, [batch_size]
+        the i-th element is true lengths of i-th sequence
+
+    :return predict , truth: predict & truth after processing
+    '''
+    if kwargs.get("lens") is None:
+        return predict, truth
+    mas = make_mask(kwargs["lens"], truth.size()[1])
+    return mask(predict, truth, mask=mas)
+
+
+def mask(predict, truth, **kwargs):
+    '''To select specific elements from Tensor
+    This method contains squash()
+
+    :param predict : Tensor, [batch_size , max_len , tag_size]
+    :param truth : Tensor, [batch_size , max_len]
+    :param **kwargs : extra arguments, kwargs["mask"] is expected to be exsist
+        kwargs["mask"] : ByteTensor, [batch_size , max_len]
+        the mask Tensor , the position that is 1 will be selected
+
+    :return predict , truth: predict & truth after processing
+    '''
+    if kwargs.get("mask") is None:
+        return predict, truth
+    mask = kwargs["mask"]
+
+    predict, truth = squash(predict, truth)
+    mask = mask.view(-1, )
+
+    predict = torch.masked_select(predict.permute(1, 0), mask).view(predict.size()[-1], -1).permute(1, 0)
+    truth = torch.masked_select(truth, mask)
+
+    return predict, truth
+
+
+def make_mask(lens, tar_len):
+    '''to generate a mask that select [:lens[i]] for i-th element
+    embezzle from fastNLP.models.sequence_modeling.seq_mask
+
+    :param lens : list or LongTensor, [batch_size]
+    :param tar_len : int
+
+    :return mask : ByteTensor
+    '''
+    lens = torch.LongTensor(lens)
+    mask = [torch.ge(lens, i + 1) for i in range(tar_len)]
+    mask = torch.stack(mask, 1)
+    return mask
+
+
+# map string to function. Just for more elegant using
+method_dict = {
+    "squash": squash,
+    "unpad": unpad,
+    "unpad_mask": unpad_mask,
+    "mask": mask,
+}
+
+loss_function_name = {
+    "L1Loss".lower(): torch.nn.L1Loss,
+    "BCELoss".lower(): torch.nn.BCELoss,
+    "MSELoss".lower(): torch.nn.MSELoss,
+    "NLLLoss".lower(): torch.nn.NLLLoss,
+    "KLDivLoss".lower(): torch.nn.KLDivLoss,
+    "NLLLoss2dLoss".lower(): torch.nn.NLLLoss2d,  # every name should end with "loss"
+    "SmoothL1Loss".lower(): torch.nn.SmoothL1Loss,
+    "SoftMarginLoss".lower(): torch.nn.SoftMarginLoss,
+    "PoissonNLLLoss".lower(): torch.nn.PoissonNLLLoss,
+    "MultiMarginLoss".lower(): torch.nn.MultiMarginLoss,
+    "CrossEntropyLoss".lower(): torch.nn.CrossEntropyLoss,
+    "BCEWithLogitsLoss".lower(): torch.nn.BCEWithLogitsLoss,
+    "MarginRankingLoss".lower(): torch.nn.MarginRankingLoss,
+    "TripletMarginLoss".lower(): torch.nn.TripletMarginLoss,
+    "HingeEmbeddingLoss".lower(): torch.nn.HingeEmbeddingLoss,
+    "CosineEmbeddingLoss".lower(): torch.nn.CosineEmbeddingLoss,
+    "MultiLabelMarginLoss".lower(): torch.nn.MultiLabelMarginLoss,
+    "MultiLabelSoftMarginLoss".lower(): torch.nn.MultiLabelSoftMarginLoss,
+}
+
+
+class Loss(object):
+    '''a Loss object is a callable object represents loss functions
+    '''
+
+    def __init__(self, loss_name, pre_pro=[squash], **kwargs):
+        '''
+
+        :param loss_name: str or None , the name of loss function
+        :param pre_pro : list of function or str, methods to reform parameters before calculating loss
+            the strings will be auto translated to pre-defined functions
+        :param **kwargs: kwargs for torch loss function
+
+        pre_pro funcsions should have three arguments: predict, truth, **arg
+        predict and truth is the necessary parameters in loss function
+        kwargs is the extra parameters passed-in when calling loss function
+        pre_pro functions should return two objects, respectively predict and truth that after processed
+
+        '''
+
+        if loss_name is None:
+            # this is useful when Trainer.__init__ performs type check
+            self._loss = None
+        else:
+            if not isinstance(loss_name, str):
+                raise NotImplementedError
+            else:
+                self._loss = self._get_loss(loss_name, **kwargs)
+
+        self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro]
+
+    def add_pre_pro(self, func):
+        '''add a pre_pro function
+
+        :param func: a function or str, methods to reform parameters before calculating loss
+            the strings will be auto translated to pre-defined functions
+        '''
+        if not callable(func):
+            func = method_dict.get(func)
+            if func is None:
+                return
+        self.pre_pro.append(func)
+
+    @staticmethod
+    def _get_loss(loss_name, **kwargs):
+        '''Get loss function from torch
+
+        :param loss_name: str, the name of loss function
+        :param **kwargs: kwargs for torch loss function
+        :return: A callable loss function object
+        '''
+        loss_name = loss_name.strip().lower()
+        loss_name = "".join(loss_name.split("_"))
+
+        if len(loss_name) < 4 or loss_name[-4:] != "loss":
+            loss_name += "loss"
+        return loss_function_name[loss_name](**kwargs)
+
+    def get(self):
+        '''This method exists just for make some existing codes run error-freely
+        '''
+        return self
+
+    def __call__(self, predict, truth, **kwargs):
+        '''call a loss function
+        predict and truth will be processed by pre_pro methods in order of addition
+
+        :param predict : Tensor, model output
+        :param truth : Tensor, truth from dataset
+        :param **kwargs : extra arguments, pass to pre_pro functions
+            for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens
+        '''
+        for f in self.pre_pro:
+            if f is None:
+                continue
+            predict, truth = f(predict, truth, **kwargs)
+
+        return self._loss(predict, truth)
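Note that `losses.py` defines `Loss` twice: the `Loss(LossBase)` stub near the top of the file is immediately shadowed by the ported `Loss(object)` further down, so callers still get the old behavior. A sketch of the ported `pre_pro` pipeline on padded sequences (shapes and values here are illustrative, not from the patch):

```python
import torch
import torch.nn.functional as F

from fastNLP.core.losses import Loss

# _get_loss normalizes "nll" -> "nllloss" -> torch.nn.NLLLoss.
loss_func = Loss("nll", pre_pro=["unpad"])

log_probs = F.log_softmax(torch.randn(2, 6, 5), dim=-1)  # [batch, max_len, tag_size]
target = torch.randint(0, 5, (2, 6))                     # [batch, max_len]

# Extra kwargs are forwarded to the pre_pro functions; unpad() reads "lens"
# and packs both tensors so padded positions never reach the loss.
print(loss_func(log_probs, target, lens=[6, 4]))
```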
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 6b0398b5..26362cb9 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -1,27 +1,25 @@
+import itertools
+import os
 import time
-from datetime import timedelta
-from datetime import datetime
 import warnings
 from collections import defaultdict
-import os
-import itertools
-import shutil
+from datetime import datetime
+from datetime import timedelta
 
-from tensorboardX import SummaryWriter
 import torch
+from tensorboardX import SummaryWriter
 
 from fastNLP.core.batch import Batch
-from fastNLP.core.loss import Loss
-from fastNLP.core.metrics import Evaluator
 from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.sampler import RandomSampler
 from fastNLP.core.sampler import SequentialSampler
 from fastNLP.core.tester import Tester
-from fastNLP.core.utils import _check_arg_dict_list
 from fastNLP.core.utils import _build_args
+from fastNLP.core.utils import _check_arg_dict_list
 from fastNLP.core.utils import _syn_model_data
 from fastNLP.core.utils import get_func_signature
 
+
 class Trainer(object):
     """Main Training Loop
diff --git a/test/core/test_loss.py b/test/core/test_loss.py
index d45d54e3..fdde4f0e 100644
--- a/test/core/test_loss.py
+++ b/test/core/test_loss.py
@@ -1,9 +1,10 @@
+import math
 import unittest
-import fastNLP.core.loss as loss
-import math
 import torch as tc
-import pdb
+
+import fastNLP.core.losses as loss
+
 
 class TestLoss(unittest.TestCase):
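The test module only swaps its import to the renamed `losses` module. A sketch of how a case against the new path might look (the class and tensors below are hypothetical, not part of the patch):

```python
import unittest

import torch as tc

import fastNLP.core.losses as loss  # was fastNLP.core.loss before this patch


class TestLossSketch(unittest.TestCase):
    def test_l1(self):
        # pre_pro=[] skips the default squash(), which would otherwise flatten
        # `truth` to 1-D and misalign an element-wise loss such as L1.
        loss_func = loss.Loss("l1", pre_pro=[])
        a = tc.randn(5, 3)
        b = tc.randn(5, 3)
        expected = tc.nn.functional.l1_loss(a, b)
        self.assertAlmostEqual(float(loss_func(a, b)), float(expected), places=5)
```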