From f87723e2eb7fda7e2da203b9346bd76c0709f7ed Mon Sep 17 00:00:00 2001
From: x54-729 <17307130121@fudan.edu.cn>
Date: Wed, 13 Apr 2022 08:40:26 +0000
Subject: [PATCH 1/5] Remove a leftover debug print from PaddleFleetDriver

---
 fastNLP/core/drivers/paddle_driver/fleet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py
index 582ce542..4c937217 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet.py
@@ -244,7 +244,6 @@ class PaddleFleetDriver(PaddleDriver):
         """
         if self.local_rank == 0:
             # rank 0 is responsible for spawning the other child processes
-            print("in launcher")
             launcher = FleetLauncher(self.parallel_device, self.output_from_new_proc)
             launcher.launch()
         # set up the parameters and initialize the distributed environment

From d2439fe443fbbac0bec5de7368c3e43998c3a79f Mon Sep 17 00:00:00 2001
From: x54-729 <17307130121@fudan.edu.cn>
Date: Wed, 13 Apr 2022 09:05:21 +0000
Subject: [PATCH 2/5] Fix a bug where _MetricsWrapper.update forwarded the
 batch instead of the model outputs

---
 fastNLP/core/controllers/evaluator.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fastNLP/core/controllers/evaluator.py b/fastNLP/core/controllers/evaluator.py
index 479686e1..2e3678d3 100644
--- a/fastNLP/core/controllers/evaluator.py
+++ b/fastNLP/core/controllers/evaluator.py
@@ -364,16 +364,16 @@ class _MetricsWrapper:
         else:
             args.append(batch)
         if not isinstance(outputs, dict):
-            raise RuntimeError(f"The output of your model is of type:`{type(batch)}`, please either directly"
+            raise RuntimeError(f"The output of your model is of type:`{type(outputs)}`, please either directly"
                                f" return a dict from your model or use `output_mapping` to convert it into dict type.")
         if isinstance(metric, Metric):
-            auto_param_call(metric.update, batch, *args)
+            auto_param_call(metric.update, outputs, *args)
         elif _is_torchmetrics_metric(metric):
-            auto_param_call(metric.update, batch, *args)
+            auto_param_call(metric.update, outputs, *args)
         elif _is_allennlp_metric(metric):
-            auto_param_call(metric.__call__, batch, *args)
+            auto_param_call(metric.__call__, outputs, *args)
         elif _is_paddle_metric(metric):
-            res = auto_param_call(metric.compute, batch, *args)
+            res = auto_param_call(metric.compute, outputs, *args)
             metric.update(res)

     def reset(self):
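Note on PATCH 2/5: `auto_param_call` fills a callable's parameters by name
from the dicts it is given, so a metric's `update` can only see keys present
in what gets forwarded. The sketch below is a simplified, hypothetical
stand-in for that dispatch (`auto_param_call_sketch` and `ToyAccuracy` are
not fastNLP's real implementations); it shows why the model `outputs`, not
the raw `batch`, must be passed:

    import inspect

    def auto_param_call_sketch(fn, *dicts):
        # Pick keyword arguments for ``fn`` by matching dict keys against
        # its parameter names.
        params = inspect.signature(fn).parameters
        kwargs = {}
        for d in dicts:
            kwargs.update({k: v for k, v in d.items() if k in params})
        return fn(**kwargs)

    class ToyAccuracy:
        def __init__(self):
            self.correct, self.total = 0, 0

        def update(self, pred, target):
            self.correct += sum(int(p == t) for p, t in zip(pred, target))
            self.total += len(target)

    metric = ToyAccuracy()
    outputs = {"pred": [1, 0, 1]}             # dict returned by the model
    batch = {"target": [1, 1, 1], "x": None}  # raw input batch
    # With the old bug, ``batch`` was forwarded where ``outputs`` belongs,
    # so ``pred`` was never available to ``update``.
    auto_param_call_sketch(metric.update, outputs, batch)
    assert (metric.correct, metric.total) == (2, 3)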
From 1452aa8f6c54e2ad313d93687baf491b9cb37559 Mon Sep 17 00:00:00 2001
From: YWMditto
Date: Thu, 14 Apr 2022 13:50:53 +0800
Subject: [PATCH 3/5] Fix the logic of set_dist_repro_dataloader when dist is
 None

---
 fastNLP/core/drivers/torch_driver/ddp.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fastNLP/core/drivers/torch_driver/ddp.py b/fastNLP/core/drivers/torch_driver/ddp.py
index 11a61dde..c673fe62 100644
--- a/fastNLP/core/drivers/torch_driver/ddp.py
+++ b/fastNLP/core/drivers/torch_driver/ddp.py
@@ -471,12 +471,11 @@ class TorchDDPDriver(TorchDriver):
                 raise RuntimeError("It is not allowed to use checkpoint retraining when you initialize ddp out of our "
                                    "control.")
             else:
-                if isinstance(dist, ReproducibleBatchSampler):
-                    dist = re_instantiate_sampler(dist)
-                    return replace_batch_sampler(dataloader, dist)
-                if isinstance(dist, ReproducibleSampler):
-                    dist = re_instantiate_sampler(dist)
-                    return replace_sampler(dataloader, dist)
+                args = self.get_dataloader_args(dataloader)
+                if isinstance(args.batch_sampler, ReproducibleBatchSampler):
+                    return replace_batch_sampler(dataloader, re_instantiate_sampler(args.batch_sampler))
+                if isinstance(args.sampler, ReproducibleSampler):
+                    return replace_sampler(dataloader, re_instantiate_sampler(args.sampler))
                 return dataloader
         # trainer
         elif dist == "dist":
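Note on PATCH 3/5: in this branch `dist` is always None, so the old
`isinstance(dist, ...)` checks were dead code; the fix inspects the sampler
already attached to the dataloader and re-instantiates it. A minimal sketch
of the re-instantiation idea (`SimpleReproSampler` and `re_instantiate` are
hypothetical stand-ins for fastNLP's ReproducibleSampler and
re_instantiate_sampler):

    class SimpleReproSampler:
        def __init__(self, dataset_len, shuffle=True):
            self.dataset_len = dataset_len
            self.shuffle = shuffle
            self.num_consumed_samples = 0  # mutated while iterating

    def re_instantiate(sampler):
        # Same class, same constructor arguments, fresh iteration state.
        return type(sampler)(sampler.dataset_len, sampler.shuffle)

    old = SimpleReproSampler(100)
    old.num_consumed_samples = 64   # pretend an earlier run stopped mid-epoch
    fresh = re_instantiate(old)
    assert fresh.dataset_len == 100 and fresh.num_consumed_samples == 0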
From 64fa182aeb02d7e06882eede6b27aa5b311c9658 Mon Sep 17 00:00:00 2001
From: x54-729 <17307130121@fudan.edu.cn>
Date: Thu, 14 Apr 2022 08:01:04 +0000
Subject: [PATCH 4/5] Adjust part of the checkpoint-retraining logic

---
 fastNLP/core/drivers/paddle_driver/fleet.py | 16 ++++++++--------
 .../drivers/paddle_driver/paddle_driver.py  | 19 ++++++++++++++++---
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py
index 4c937217..3f29e4dd 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet.py
@@ -325,7 +325,6 @@ class PaddleFleetDriver(PaddleDriver):
         assert dataloader.dataset_kind != _DatasetKind.ITER, \
             "FastNLP does not support `IteratorDataset` now."
         # if dist is a ReproducibleBatchSampler or ReproducibleSampler, this is driver.load being called during checkpoint retraining;
-        # note that dist_sampler.set_distributed does not need to be called here; if the user uses TorchDDPDriver, it has already been called when the Trainer was initialized;
         if isinstance(dist, ReproducibleBatchSampler):
             dist.set_distributed(
                 num_replicas=self.world_size,
@@ -345,15 +344,16 @@ class PaddleFleetDriver(PaddleDriver):
         # trainer, evaluator
         if dist is None:
             if reproducible:
-                raise RuntimeError("It is not allowed to use checkpoint retraining when you initialize ddp out of our "
+                raise RuntimeError("It is not allowed to use checkpoint retraining when you initialize fleet out of our "
                                    "control.")
             else:
-                if isinstance(dist, ReproducibleBatchSampler):
-                    dist = re_instantiate_sampler(dist)
-                    return replace_batch_sampler(dataloader, dist)
-                if isinstance(dist, ReproducibleSampler):
-                    dist = re_instantiate_sampler(dist)
-                    return replace_sampler(dataloader, dist)
+                args = self.get_dataloader_args(dataloader)
+                if isinstance(args.batch_sampler, ReproducibleBatchSampler):
+                    batch_sampler = re_instantiate_sampler(args.batch_sampler)
+                    return replace_batch_sampler(dataloader, batch_sampler)
+                if isinstance(args.sampler, ReproducibleSampler):
+                    sampler = re_instantiate_sampler(args.sampler)
+                    return replace_sampler(dataloader, sampler)
                 return dataloader
         # trainer
         elif dist == "dist":
diff --git a/fastNLP/core/drivers/paddle_driver/paddle_driver.py b/fastNLP/core/drivers/paddle_driver/paddle_driver.py
index 4362dcce..cc870536 100644
--- a/fastNLP/core/drivers/paddle_driver/paddle_driver.py
+++ b/fastNLP/core/drivers/paddle_driver/paddle_driver.py
@@ -66,8 +66,8 @@ class PaddleDriver(Driver):

         :param set_to_none: whether the gradients should be set directly to None; this parameter has no effect in Paddle.
         """
-        # if set_to_none:
-        #     log.warning("Parameter `set_to_none` does nothing in paddle since grad cannot be set directly.")
+        if set_to_none:
+            logger.warning_once("Parameter `set_to_none` does nothing in paddle since grad cannot be set directly.")
         for optimizer in self.optimizers:
             optimizer.clear_grad()

@@ -254,8 +254,21 @@ class PaddleDriver(Driver):
             else:
                 raise RuntimeError("This condition is not supposed to appear. Please report a bug to us.")

+            num_consumed_batches = states.pop('num_consumed_batches')
             if hasattr(sampler, 'state_dict') and callable(sampler.state_dict):
-                states['sampler_states'] = sampler.state_dict()
+                sampler_states = sampler.state_dict()
+                # if present, num_consumed_samples needs special handling: the DataLoader prefetches, so using the
+                # sampler's own num_consumed_samples directly would overcount what has actually been consumed.
+                num_consumed_samples_array = sampler_states.pop('num_consumed_samples_array', None)
+                if num_consumed_samples_array is not None:
+                    if isinstance(sampler, ReproducibleSampler):  # for a sample-level sampler, batch_size must be taken into account.
+                        try:
+                            num_consumed_batches = num_consumed_batches * dataloader_args.batch_size
+                        except:  # batch_size may be None; in that case we can only lose some precision
+                            num_consumed_batches = sampler_states['num_consumed_samples']
+                    sampler_states['num_consumed_samples'] = num_consumed_samples_array[num_consumed_batches]
+                    assert sampler_states['num_consumed_samples'] != -1, "This is a bug, please report."
+
             else:
                 raise RuntimeError(
                     'The sampler has no `state_dict()` method, it will fail to recover to the specific batch.')

From 7c6e8b20a8612a20d2dfe077896fe537ac7c2d1b Mon Sep 17 00:00:00 2001
From: x54-729 <17307130121@fudan.edu.cn>
Date: Thu, 14 Apr 2022 08:01:14 +0000
Subject: [PATCH 5/5] Fix the class name in RandomBatchSampler's
 set_distributed error message

---
 fastNLP/core/samplers/reproducible_batch_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastNLP/core/samplers/reproducible_batch_sampler.py b/fastNLP/core/samplers/reproducible_batch_sampler.py
index d4535bae..73621b5f 100644
--- a/fastNLP/core/samplers/reproducible_batch_sampler.py
+++ b/fastNLP/core/samplers/reproducible_batch_sampler.py
@@ -151,7 +151,7 @@ class RandomBatchSampler(ReproducibleBatchSampler):
         self.need_reinitialize = False

     def set_distributed(self, num_replicas, rank, pad=True):
-        raise RuntimeError(f"ReproduceBatchSampler does not support to change to distributed training.")
+        raise RuntimeError(f"RandomBatchSampler does not support to change to distributed training.")

     def set_epoch(self, epoch):
         if hasattr(self.batch_sampler, "sampler") and hasattr(self.batch_sampler.sampler, 'set_epoch') and callable(self.batch_sampler.sampler.set_epoch):
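Note on PATCH 4/5: because the DataLoader prefetches, the sampler's live
`num_consumed_samples` counter runs ahead of what the training loop has
actually consumed, so the saved state indexes `num_consumed_samples_array`
by the number of batches really finished instead. A toy illustration with
made-up values (the identity array stands in for the real one produced by
the sampler's `state_dict`):

    batch_size = 4
    num_consumed_batches = 3    # batches the training loop actually finished
    prefetched_counter = 28     # the sampler already handed out 7 batches

    # Cumulative consumed-sample count recorded at each position handed out;
    # identity mapping here purely for illustration.
    num_consumed_samples_array = list(range(33))

    # For a sample-level sampler, convert finished batches into samples
    # before indexing, as the patched code does when batch_size is known.
    true_consumed = num_consumed_samples_array[num_consumed_batches * batch_size]
    assert true_consumed == 12 and true_consumed != prefetched_counter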