
merge

tags/v1.0.0alpha
YWMditto, 3 years ago
commit 0376d3e78d

100 changed files with 996 additions and 1553 deletions
  1. docs/Makefile (+5 -2)
  2. docs/source/conf.py (+7 -4)
  3. docs/source/fastNLP.core.callbacks.rst (+2 -2)
  4. docs/source/fastNLP.core.callbacks.torch_callbacks.rst (+1 -1)
  5. docs/source/fastNLP.core.collators.padders.rst (+1 -1)
  6. docs/source/fastNLP.core.collators.rst (+2 -2)
  7. docs/source/fastNLP.core.controllers.loops.rst (+1 -1)
  8. docs/source/fastNLP.core.controllers.rst (+2 -2)
  9. docs/source/fastNLP.core.controllers.utils.rst (+1 -1)
  10. docs/source/fastNLP.core.dataloaders.jittor_dataloader.rst (+1 -1)
  11. docs/source/fastNLP.core.dataloaders.paddle_dataloader.rst (+1 -1)
  12. docs/source/fastNLP.core.dataloaders.prepare_dataloader.rst (+7 -0)
  13. docs/source/fastNLP.core.dataloaders.rst (+3 -2)
  14. docs/source/fastNLP.core.dataloaders.torch_dataloader.rst (+1 -1)
  15. docs/source/fastNLP.core.dataset.rst (+1 -1)
  16. docs/source/fastNLP.core.drivers.jittor_driver.rst (+1 -1)
  17. docs/source/fastNLP.core.drivers.paddle_driver.rst (+1 -1)
  18. docs/source/fastNLP.core.drivers.rst (+2 -2)
  19. docs/source/fastNLP.core.drivers.torch_driver.rst (+1 -1)
  20. docs/source/fastNLP.core.log.rst (+1 -1)
  21. docs/source/fastNLP.core.metrics.backend.jittor_backend.rst (+1 -1)
  22. docs/source/fastNLP.core.metrics.backend.paddle_backend.rst (+1 -1)
  23. docs/source/fastNLP.core.metrics.backend.rst (+2 -2)
  24. docs/source/fastNLP.core.metrics.backend.torch_backend.rst (+1 -1)
  25. docs/source/fastNLP.core.metrics.rst (+2 -2)
  26. docs/source/fastNLP.core.rst (+2 -2)
  27. docs/source/fastNLP.core.samplers.rst (+1 -1)
  28. docs/source/fastNLP.core.utils.rst (+1 -1)
  29. docs/source/fastNLP.envs.rst (+1 -1)
  30. docs/source/fastNLP.io.loader.rst (+1 -1)
  31. docs/source/fastNLP.io.pipe.rst (+1 -1)
  32. docs/source/fastNLP.io.rst (+2 -2)
  33. docs/source/fastNLP.rst (+1 -1)
  34. docs/source/modules.rst (+1 -1)
  35. fastNLP/core/__init__.py (+0 -4)
  36. fastNLP/core/callbacks/callback_manager.py (+13 -9)
  37. fastNLP/core/callbacks/checkpoint_callback.py (+7 -20)
  38. fastNLP/core/callbacks/load_best_model_callback.py (+2 -1)
  39. fastNLP/core/callbacks/topk_saver.py (+2 -3)
  40. fastNLP/core/callbacks/utils.py (+2 -2)
  41. fastNLP/core/collators/padders/get_padder.py (+1 -1)
  42. fastNLP/core/collators/padders/paddle_padder.py (+4 -1)
  43. fastNLP/core/collators/padders/torch_padder.py (+2 -2)
  44. fastNLP/core/controllers/evaluator.py (+6 -2)
  45. fastNLP/core/controllers/loops/evaluate_batch_loop.py (+1 -1)
  46. fastNLP/core/controllers/loops/train_batch_loop.py (+1 -1)
  47. fastNLP/core/controllers/trainer.py (+20 -11)
  48. fastNLP/core/dataloaders/paddle_dataloader/fdl.py (+2 -2)
  49. fastNLP/core/dataloaders/prepare_dataloader.py (+1 -1)
  50. fastNLP/core/dataloaders/torch_dataloader/fdl.py (+2 -2)
  51. fastNLP/core/dataset/dataset.py (+18 -11)
  52. fastNLP/core/drivers/__init__.py (+0 -2)
  53. fastNLP/core/drivers/choose_driver.py (+2 -2)
  54. fastNLP/core/drivers/jittor_driver/initialize_jittor_driver.py (+10 -6)
  55. fastNLP/core/drivers/jittor_driver/jittor_driver.py (+11 -1)
  56. fastNLP/core/drivers/jittor_driver/mpi.py (+8 -0)
  57. fastNLP/core/drivers/jittor_driver/single_device.py (+11 -7)
  58. fastNLP/core/drivers/jittor_driver/utils.py (+2 -1)
  59. fastNLP/core/drivers/paddle_driver/fleet.py (+14 -27)
  60. fastNLP/core/drivers/paddle_driver/fleet_launcher.py (+5 -2)
  61. fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py (+28 -18)
  62. fastNLP/core/drivers/paddle_driver/paddle_driver.py (+5 -2)
  63. fastNLP/core/drivers/paddle_driver/single_device.py (+6 -10)
  64. fastNLP/core/drivers/torch_driver/initialize_torch_driver.py (+7 -8)
  65. fastNLP/core/drivers/torch_driver/torch_driver.py (+6 -0)
  66. fastNLP/core/drivers/torch_paddle_driver/__init__.py (+0 -5)
  67. fastNLP/core/drivers/torch_paddle_driver/torch_paddle_driver.py (+0 -193)
  68. fastNLP/core/drivers/torch_paddle_driver/utils.py (+0 -4)
  69. fastNLP/core/log/print.py (+4 -2)
  70. fastNLP/core/metrics/backend/paddle_backend/backend.py (+4 -2)
  71. fastNLP/core/metrics/metric.py (+1 -1)
  72. fastNLP/core/samplers/reproducible_batch_sampler.py (+18 -11)
  73. fastNLP/core/samplers/reproducible_sampler.py (+18 -10)
  74. fastNLP/core/utils/__init__.py (+1 -4)
  75. fastNLP/core/utils/dummy_class.py (+1 -1)
  76. fastNLP/core/utils/jittor_utils.py (+8 -1)
  77. fastNLP/core/utils/paddle_utils.py (+75 -63)
  78. fastNLP/core/utils/rich_progress.py (+2 -3)
  79. fastNLP/core/utils/torch_paddle_utils.py (+0 -49)
  80. fastNLP/core/utils/torch_utils.py (+5 -5)
  81. fastNLP/core/utils/utils.py (+99 -84)
  82. fastNLP/envs/set_backend.py (+19 -4)
  83. fastNLP/envs/utils.py (+5 -3)
  84. fastNLP/modules/__init__.py (+2 -2)
  85. fastNLP/modules/mix_modules/__init__.py (+2 -2)
  86. fastNLP/modules/mix_modules/mix_module.py (+0 -310)
  87. fastNLP/modules/mix_modules/utils.py (+0 -233)
  88. tests/core/callbacks/test_checkpoint_callback_torch.py (+73 -0)
  89. tests/core/collators/test_collator.py (+2 -2)
  90. tests/core/controllers/_test_trainer_fleet.py (+13 -6)
  91. tests/core/controllers/_test_trainer_fleet_outside.py (+9 -6)
  92. tests/core/controllers/_test_trainer_jittor.py (+237 -0)
  93. tests/core/controllers/imdb.py (+110 -0)
  94. tests/core/controllers/test_trainer_jittor.py (+4 -0)
  95. tests/core/controllers/test_trainer_paddle.py (+5 -0)
  96. tests/core/drivers/torch_paddle_driver/__init__.py (+0 -0)
  97. tests/core/drivers/torch_paddle_driver/_test_torch_paddle_driver.py (+0 -122)
  98. tests/core/drivers/torch_paddle_driver/_test_utils.py (+0 -0)
  99. tests/core/utils/_test_torch_paddle_utils.py (+0 -204)
  100. tests/core/utils/test_paddle_utils.py (+20 -15)

+ 5 - 2   docs/Makefile

@@ -6,7 +6,7 @@ SPHINXOPTS =
 SPHINXAPIDOC  = sphinx-apidoc
 SPHINXBUILD   = sphinx-build
 SPHINXPROJ    = fastNLP
-SPHINXEXCLUDE = ../fastNLP/transformers/* ../fastNLP/modules/* ../fastNLP/core/drivers/torch_paddle_driver/* ../fastNLP/core/utils/torch_paddle_utils.py
+SPHINXEXCLUDE = ../fastNLP/transformers/*
 SOURCEDIR     = source
 BUILDDIR      = build
 PORT          = 9000
@@ -16,7 +16,7 @@ help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)

 apidoc:
-	$(SPHINXAPIDOC) -efM -d 6 -o source ../$(SPHINXPROJ) $(SPHINXEXCLUDE)
+	$(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) $(SPHINXEXCLUDE)

 server:
 	cd build/html && python -m http.server $(PORT)
@@ -24,6 +24,9 @@ server:
 delete:
 	rm -f source/$(SPHINXPROJ).* source/modules.rst && rm -rf build

+web:
+	make html && make server
+
 dev:
 	make delete && make apidoc && make html && make server




+ 7 - 4   docs/source/conf.py

@@ -42,7 +42,8 @@ extensions = [
     'sphinx.ext.viewcode',
     'sphinx.ext.autosummary',
     'sphinx.ext.mathjax',
-    'sphinx.ext.todo'
+    'sphinx.ext.todo',
+    'sphinx_autodoc_typehints'
 ]

 autodoc_default_options = {
@@ -53,8 +54,10 @@ autodoc_default_options = {

 add_module_names = False
 autosummary_ignore_module_all = False
-autodoc_typehints = "description"
+# autodoc_typehints = "description"
 autoclass_content = "class"
+typehints_fully_qualified = False
+typehints_defaults = "comma"

 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -168,8 +171,8 @@ texinfo_documents = [

 # -- Extension configuration -------------------------------------------------
 def maybe_skip_member(app, what, name, obj, skip, options):
-    # if obj.__doc__ is None:
-    #     return True
+    if obj.__doc__ is None:
+        return True
     if name == "__init__":
         return False
     if name.startswith("_"):


+ 2 - 2   docs/source/fastNLP.core.callbacks.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.callbacks.torch_callbacks

@@ -18,7 +18,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.callbacks.callback
    fastNLP.core.callbacks.callback_event


+ 1 - 1   docs/source/fastNLP.core.callbacks.torch_callbacks.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.callbacks.torch_callbacks.torch_grad_clip_callback
    fastNLP.core.callbacks.torch_callbacks.torch_lr_sched_callback


+ 1 - 1   docs/source/fastNLP.core.collators.padders.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.collators.padders.exceptions
    fastNLP.core.collators.padders.get_padder


+ 2 - 2   docs/source/fastNLP.core.collators.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.collators.padders

@@ -18,7 +18,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.collators.collator
    fastNLP.core.collators.packer_unpacker

+ 1 - 1   docs/source/fastNLP.core.controllers.loops.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.controllers.loops.evaluate_batch_loop
    fastNLP.core.controllers.loops.loop


+ 2 - 2   docs/source/fastNLP.core.controllers.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.controllers.loops
    fastNLP.core.controllers.utils
@@ -19,7 +19,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.controllers.evaluator
    fastNLP.core.controllers.trainer


+ 1 - 1   docs/source/fastNLP.core.controllers.utils.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.controllers.utils.state
    fastNLP.core.controllers.utils.utils


+ 1 - 1   docs/source/fastNLP.core.dataloaders.jittor_dataloader.rst

@@ -10,6 +10,6 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.dataloaders.jittor_dataloader.fdl


+ 1 - 1   docs/source/fastNLP.core.dataloaders.paddle_dataloader.rst

@@ -10,6 +10,6 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.dataloaders.paddle_dataloader.fdl

+ 7 - 0   docs/source/fastNLP.core.dataloaders.prepare_dataloader.rst

@@ -0,0 +1,7 @@
+fastNLP.core.dataloaders.prepare\_dataloader module
+===================================================
+
+.. automodule:: fastNLP.core.dataloaders.prepare_dataloader
+   :members:
+   :undoc-members:
+   :show-inheritance:


+ 3 - 2   docs/source/fastNLP.core.dataloaders.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.dataloaders.jittor_dataloader
    fastNLP.core.dataloaders.paddle_dataloader
@@ -20,7 +20,8 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.dataloaders.mix_dataloader
+   fastNLP.core.dataloaders.prepare_dataloader
    fastNLP.core.dataloaders.utils


+ 1 - 1   docs/source/fastNLP.core.dataloaders.torch_dataloader.rst

@@ -10,6 +10,6 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.dataloaders.torch_dataloader.fdl


+ 1 - 1   docs/source/fastNLP.core.dataset.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.dataset.dataset
    fastNLP.core.dataset.field


+ 1 - 1   docs/source/fastNLP.core.drivers.jittor_driver.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.drivers.jittor_driver.initialize_jittor_driver
    fastNLP.core.drivers.jittor_driver.jittor_driver


+ 1 - 1   docs/source/fastNLP.core.drivers.paddle_driver.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.drivers.paddle_driver.dist_utils
    fastNLP.core.drivers.paddle_driver.fleet


+ 2 - 2   docs/source/fastNLP.core.drivers.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.drivers.jittor_driver
    fastNLP.core.drivers.paddle_driver
@@ -20,7 +20,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.drivers.choose_driver
    fastNLP.core.drivers.driver


+ 1 - 1   docs/source/fastNLP.core.drivers.torch_driver.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.drivers.torch_driver.ddp
    fastNLP.core.drivers.torch_driver.dist_utils


+ 1 - 1   docs/source/fastNLP.core.log.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.log.handler
    fastNLP.core.log.highlighter


+ 1 - 1   docs/source/fastNLP.core.metrics.backend.jittor_backend.rst

@@ -10,6 +10,6 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.metrics.backend.jittor_backend.backend


+ 1 - 1   docs/source/fastNLP.core.metrics.backend.paddle_backend.rst

@@ -10,6 +10,6 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.metrics.backend.paddle_backend.backend


+ 2 - 2   docs/source/fastNLP.core.metrics.backend.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.metrics.backend.jittor_backend
    fastNLP.core.metrics.backend.paddle_backend
@@ -20,7 +20,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.metrics.backend.auto_backend
    fastNLP.core.metrics.backend.backend


+ 1 - 1   docs/source/fastNLP.core.metrics.backend.torch_backend.rst

@@ -10,6 +10,6 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.metrics.backend.torch_backend.backend


+ 2 - 2   docs/source/fastNLP.core.metrics.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.metrics.backend

@@ -18,7 +18,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.metrics.accuracy
    fastNLP.core.metrics.classify_f1_pre_rec_metric


+ 2 - 2   docs/source/fastNLP.core.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.callbacks
    fastNLP.core.collators
@@ -27,6 +27,6 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.vocabulary


+ 1 - 1   docs/source/fastNLP.core.samplers.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.samplers.conversion_utils
    fastNLP.core.samplers.mix_sampler


+ 1 - 1   docs/source/fastNLP.core.utils.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core.utils.cache_results
    fastNLP.core.utils.dummy_class


+ 1 - 1   docs/source/fastNLP.envs.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.envs.distributed
    fastNLP.envs.env


+ 1 - 1   docs/source/fastNLP.io.loader.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.io.loader.classification
    fastNLP.io.loader.conll


+ 1 - 1   docs/source/fastNLP.io.pipe.rst

@@ -10,7 +10,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.io.pipe.classification
    fastNLP.io.pipe.conll


+ 2 - 2   docs/source/fastNLP.io.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.io.loader
    fastNLP.io.pipe
@@ -19,7 +19,7 @@ Submodules
 ----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.io.data_bundle
    fastNLP.io.embed_loader


+ 1 - 1   docs/source/fastNLP.rst

@@ -10,7 +10,7 @@ Subpackages
 -----------

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP.core
    fastNLP.envs


+ 1 - 1   docs/source/modules.rst

@@ -2,6 +2,6 @@ fastNLP
 =======

 .. toctree::
-   :maxdepth: 6
+   :maxdepth: 4

    fastNLP

+ 0 - 4   fastNLP/core/__init__.py

@@ -3,9 +3,7 @@ __all__ = [
     'Callback',
     'Event',
     'Filter',
-    'CallbackManager',
     'CheckpointCallback',
-    'choose_progress_callback',
     'ProgressCallback',
     'RichCallback',
     "LRSchedCallback",
@@ -54,7 +52,6 @@ __all__ = [
     'DataSet',
     'FieldArray',
     'Instance',
-    'ApplyResultException',

     # drivers
     "TorchSingleDriver",
@@ -63,7 +60,6 @@ __all__ = [
     "PaddleFleetDriver",
     "JittorSingleDriver",
     "JittorMPIDriver",
-    "TorchPaddleDriver",

     # log
     "logger",


+ 13 - 9   fastNLP/core/callbacks/callback_manager.py

@@ -10,8 +10,8 @@ from .callback_event import Event
 from .callback import Callback
 from fastNLP.core.log import logger
 from .progress_callback import ProgressCallback, choose_progress_callback
-from fastNLP.envs import rank_zero_call
-from fastNLP.core.utils.utils import _get_fun_msg
+from ..utils.exceptions import EarlyStopException
+from ..utils.utils import _get_fun_msg


 def _transfer(func):
@@ -25,6 +25,8 @@ def _transfer(func):
         for callback_fn in manager.callback_fns[func.__name__]:
             try:
                 callback_fn(*arg, **kwargs)
+            except EarlyStopException as e:
+                raise e
             except BaseException as e:
                 logger.error(f"The following callback_fn raise exception:{_get_fun_msg(callback_fn)}.")
                 raise e
@@ -178,14 +180,16 @@ class CallbackManager:
                 states[each_callback.callback_name]["states"] = each_callback.on_save_checkpoint(trainer)

         if len(_duplicated_callbacks) > 0:
-            logger.warning(f"Notice these callbacks' `callback_name` are duplicated: {_duplicated_callbacks}, "
-                           f"and we will only save the first callback's state we meet.")
+            logger.warning(f"Notice these callback_name: {_duplicated_callbacks} are duplicated, "
+                           f"fastNLP will only save the first callback's state.")

         # 2. The filter state of each concrete callback function;
         _record_duplicated_callback_names = set()
         for each_callback_filters in self._callback_filters:
             if each_callback_filters[0] not in _record_duplicated_callback_names:
                 _record_duplicated_callback_names.add(each_callback_filters[0])
+                if 'filter_states' not in states[each_callback_filters[0]]:
+                    states[each_callback_filters[0]]["filter_states"] = {}
                 states[each_callback_filters[0]]["filter_states"][each_callback_filters[1]] = each_callback_filters[2].state_dict()

         # 3. Save the callback_counter;
@@ -212,13 +216,15 @@ class CallbackManager:
                 if each_callback_filters[0] in states:
                     if each_callback_filters[0] not in _already_loaded_callback_names:
                         _already_loaded_callback_names.add(each_callback_filters[0])
-                        each_callback_filters[2].load_state_dict(states[each_callback_filters[0]]["filter_states"][each_callback_filters[1]])
+                        if 'filter_states' in states[each_callback_filters[0]] and \
+                                each_callback_filters[1] in states[each_callback_filters[0]]['filter_states']:
+                            each_callback_filters[2].load_state_dict(states[each_callback_filters[0]]['filter_states'][each_callback_filters[1]])
                     else:
                         _duplicated_callback_names.add(each_callback_filters[0])

         if len(_duplicated_callback_names) > 0:
-            logger.warning(f"Notice these callbacks' `callback_name` are duplicated: {_duplicated_callback_names}, "
-                           f"and we will only load the first callback's state we meet.")
+            logger.rank_zero_warning(f"Notice these callback_name: {_duplicated_callback_names} are duplicated, "
+                                     f"fastNLP will only load the first callback's state.")

         # 2. Then restore each callback's individual state;
         # Every callback class we provide must override its specific `callback_name` method, to guarantee that if two callbacks share the same callback_name,
@@ -229,8 +235,6 @@ class CallbackManager:
                 _already_loaded_callback_names.add(each_callback.callback_name)
                 # Note that we have already made sure each callback's `on_load_checkpoint` receives exactly its own state;
                 each_callback.on_load_checkpoint(trainer, states[each_callback.callback_name]["states"])
-            else:
-                each_callback.on_load_checkpoint(trainer, None)

     @property
     def has_trainer_checkpoint(self) -> bool:

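For orientation, the checkpoint structure this manager reads and writes is a plain nested dict keyed by callback_name. A hypothetical illustration of its shape (names follow the diff above; the concrete values are made up):

    states = {
        'CheckpointCallback': {
            'states': ...,                    # whatever each_callback.on_save_checkpoint(trainer) returned
            'filter_states': {                # now created lazily by the added guard
                'on_train_epoch_end': {...},  # each_callback_filters[2].state_dict()
            },
        },
    }

The new 'filter_states' guards on both the save and load paths make each direction tolerant of checkpoints that carry no filter state, instead of failing with a KeyError.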

+ 7 - 20   fastNLP/core/callbacks/checkpoint_callback.py

@@ -19,7 +19,7 @@ class CheckpointCallback(Callback):
                  only_state_dict: bool = True, model_save_fn: Optional[Callable] = None, save_object: str = 'model',
                  save_evaluate_results=True, **kwargs):
         """
-        A callback that saves model checkpoints; the directory and file naming rules of what it saves are as follows::
+        A callback that saves checkpoints; the directory and file naming rules of what it saves are as follows::

         - folder/
             - YYYY-mm-dd-HH_MM_SS_fffff/  # created automatically from the launch time of the current script
@@ -29,8 +29,9 @@ class CheckpointCallback(Callback):
                 - {save_object}-epoch_{epoch_idx}-batch_{global_batch_idx}-exception_{exception_type}/  # saved on exception.
                 - {save_object}-epoch_{epoch_idx}-batch_{global_batch_idx}-{monitor}_{monitor_value}/  # file name when the topk condition is met

-        If model_save_fn is None, a fastnlp_model.pkl.tar file is generated in each of the folders above.
-        If model_save_fn is not None, fastNLP passes the absolute path of the folder to that function and saves no model in the folder itself.
+        If model_save_fn is None, a fastnlp_model.pkl.tar file is generated in each of the folders above. If model_save_fn is not
+        None, fastNLP passes the absolute path of the folder to that function and saves no model in the folder itself. By default
+        this checkpoint saves only the state of the model; to also save the state of the Trainer so that training can resume from
+        the breakpoint, use ``save_object='trainer'``.

         :param monitor: the metric value to monitor. If no exactly matching name is found in the evaluation results, the longest
             common substring algorithm is used to find the closest match as the monitor. If None, the monitor configured in the
             Trainer is tried. A function may also be passed in, which takes the evaluation
@@ -46,22 +47,14 @@ class CheckpointCallback(Callback):
         :param only_state_dict: whether to save only the state_dict of the model. Ignored when model_save_fn is not None.
         :param model_save_fn: a custom saving function invoked whenever a save is triggered; it should accept a folder as its only
             argument and return nothing. If model_save_fn is given, fastNLP performs no model saving of its own. With multiple
             cards, this function is run on rank 0 only.
-        :param save_object: one of ['trainer', 'model'], whether the saved object is trainer+model or just the model.
+        :param save_object: one of ['trainer', 'model'], whether the saved object is ``trainer+model`` or just ``model``. If the
+            ``trainer`` object is saved, the state of :class:~fastNLP.Trainer is saved with it, and training can resume from the
+            breakpoint via :meth:`Trainer.load`. If the ``Model`` object is saved, the model weights can be loaded via
+            :meth:`Trainer.load_model`.
         :param save_evaluate_results: whether to save the evaluate results. If True, an extra fastnlp_evaluate_results.json file
             recording the current results is saved in the folder of every topk model. Only useful when topk is set; True by default.
         :param kwargs:
         """
         super().__init__()
-        if folder is None:
-            logger.warning(
-                "Parameter `folder` is None, and we will use the current work directory to find and load your model.")
-            folder = Path.cwd()
-        folder = Path(folder)
-        if not folder.exists():
-            raise NotADirectoryError(f"Path '{folder.absolute()}' is not existed!")
-        elif folder.is_file():
-            raise ValueError("Parameter `folder` should be a directory instead of a file.")
-
         if every_n_epochs is not None:
             if not isinstance(every_n_epochs, int) or every_n_epochs < 1:
                 raise ValueError("Parameter `every_n_epochs` should be an int and greater than or equal to 1.")
@@ -74,12 +67,6 @@ class CheckpointCallback(Callback):
         else:
             every_n_batches = sys.maxsize  # so that no number divides it evenly

-        if topk is not None:
-            if not isinstance(topk, int):
-                raise ValueError("Parameter `topk` should be an int.")
-        else:
-            topk = 0
-
         if on_exceptions is not None:
             if not isinstance(on_exceptions, Sequence):
                 on_exceptions = [on_exceptions]

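A usage sketch for the revised callback (the model, dataloaders, metric, and monitor string are placeholders; save_object='trainer' follows the docstring above):

    from fastNLP import Trainer, CheckpointCallback

    callback = CheckpointCallback(
        folder='checkpoints',      # folder validation/creation now happens in the Saver, not here
        every_n_epochs=1,
        topk=2, monitor='acc#acc',
        save_object='trainer',     # also keep Trainer state, so the run can resume via Trainer.load
    )
    trainer = Trainer(model=model, driver='torch', device=0,
                      train_dataloader=train_dl, evaluate_dataloaders=eval_dl,
                      metrics={'acc': acc_metric}, callbacks=[callback])
    trainer.run()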

+ 2 - 1   fastNLP/core/callbacks/load_best_model_callback.py

@@ -19,7 +19,8 @@ class LoadBestModelCallback(HasMonitorCallback):
                  model_load_fn:Optional[Callable] = None,
                  delete_after_train:bool = True):
         """
-        Saves the model with the best monitor value and reloads it at the end of training. The best model can only be loaded when training finishes normally.
+        Saves the model with the best monitor value and reloads it at the end of training; by default the weight file is deleted
+        after loading. The best model can only be loaded when training finishes normally.

         :param str monitor: the metric value to monitor. If no exactly matching name is found in the evaluation results, the longest
             common substring algorithm is used to find the closest match as the monitor. If None, the monitor configured in the
             Trainer is tried. A function may also be passed in, which takes the evaluation

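A matching sketch for the reworded behavior (import path assumed from fastNLP's public API; the monitor name is a placeholder):

    from fastNLP import LoadBestModelCallback

    # Reloads the weights with the best 'acc#acc' once training ends normally;
    # per the default documented above (delete_after_train=True), the saved
    # weight file is removed after it has been loaded back.
    callback = LoadBestModelCallback(monitor='acc#acc', delete_after_train=True)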

+ 2 - 3   fastNLP/core/callbacks/topk_saver.py

@@ -33,9 +33,8 @@ class Saver:
         :param kwargs: additional parameters passed through to the Trainer.save() or Trainer.save_model() interfaces.
         """
         if folder is None:
-            logger.rank_zero_warning(
-                "Parameter `folder` is None, and we will use the current work directory to find and load your model.")
-            folder = Path.cwd()
+            folder = Path.cwd().absolute()
+            logger.info(f"Parameter `folder` is None, and we will use {folder} to save and load your model.")
         folder = Path(folder)
         if not folder.exists():
             folder.mkdir(parents=True, exist_ok=True)


+ 2 - 2   fastNLP/core/callbacks/utils.py

@@ -8,8 +8,8 @@ from fastNLP.core.utils.utils import _get_fun_msg

 def _get_monitor_value(monitor: Union[callable, str], real_monitor: Optional[str], res: dict) ->Tuple[str, float]:
     """
-    Looks up monitor in res and returns it. If monitor is not found, _real_monitor is tried; if _real_monitor is None, the value
-    of monitor is used for matching.
+    Looks up ``monitor`` in ``res`` and returns it. If ``monitor`` is not found, ``_real_monitor`` is tried; if ``_real_monitor``
+    is ``None``, the value of ``monitor`` is used for matching.

     :param monitor:
     :param real_monitor:

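The "longest common substring" fallback this docstring refers to can be illustrated with a small self-contained sketch; this is a toy version of the idea, not fastNLP's implementation:

    from difflib import SequenceMatcher

    def best_match(monitor: str, res: dict) -> str:
        # Choose the result key sharing the longest common substring with `monitor`.
        def overlap(key: str) -> int:
            m = SequenceMatcher(None, monitor, key)
            return m.find_longest_match(0, len(monitor), 0, len(key)).size
        return max(res, key=overlap)

    print(best_match('acc', {'acc#acc#dev': 0.9, 'loss#dev': 0.4}))  # -> 'acc#acc#dev'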

+ 1 - 1   fastNLP/core/collators/padders/get_padder.py

@@ -121,7 +121,7 @@ def get_padder(batch_field:Sequence[Any], pad_val, dtype, backend, field_name)->
             # ele_dtype is passed as None here to prevent a paddle tensor from being converted into a torch tensor
             return TorchTensorPadder(pad_val=pad_val, ele_dtype=None, dtype=dtype)
         elif backend == 'paddle':
-            return PaddleTensorPadder(pad_val=pad_val, ele_dtype=None, dtype=dtype)
+            return PaddleTensorPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype)
         elif backend == 'jittor':
             return JittorTensorPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype)
         else:


+ 4 - 1   fastNLP/core/collators/padders/paddle_padder.py

@@ -141,7 +141,10 @@ class PaddleTensorPadder(Padder):

         shapes = [field.shape for field in batch_field]
         max_shape = [len(batch_field)] + [max(*_) for _ in zip(*shapes)]
-        array = np.full(max_shape, fill_value=pad_val)
+        if isinstance(batch_field[0], paddle.Tensor):
+            array = paddle.full(max_shape, fill_value=pad_val, dtype=dtype)
+        else:
+            array = np.full(max_shape, fill_value=pad_val, dtype=batch_field[0].dtype)
         for i, field in enumerate(batch_field):
             slices = (i, ) + tuple(slice(0, s) for s in shapes[i])
             array[slices] = field

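The added branch keeps paddle tensors in paddle instead of round-tripping them through numpy. The underlying pad-by-copy scheme is easy to see in plain numpy — a self-contained sketch of the same logic, not the fastNLP code:

    import numpy as np

    batch_field = [np.array([1, 2]), np.array([3, 4, 5])]
    pad_val = 0
    shapes = [f.shape for f in batch_field]
    # Batch dimension first, then the elementwise maximum of each axis.
    max_shape = [len(batch_field)] + [max(*s) for s in zip(*shapes)]
    array = np.full(max_shape, fill_value=pad_val, dtype=batch_field[0].dtype)
    for i, field in enumerate(batch_field):
        slices = (i,) + tuple(slice(0, s) for s in shapes[i])
        array[slices] = field
    # array -> [[1, 2, 0], [3, 4, 5]]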

+ 2 - 2   fastNLP/core/collators/padders/torch_padder.py

@@ -118,8 +118,8 @@ class TorchTensorPadder(Padder):
                 batch_field = [torch.tensor(field.tolist(), dtype=dtype) for field in batch_field]
             else:
                 device = batch_field[0].device
-            if dtype is None:
-                dtype = batch_field[0].dtype
+                if dtype is None:
+                    dtype = batch_field[0].dtype
         except AttributeError:
             raise RuntimeError(f"If the field is not a torch.Tensor (it is {type(batch_field[0])}), "
                                f"it must have tolist() method.")


+ 6 - 2   fastNLP/core/controllers/evaluator.py

@@ -236,8 +236,7 @@ class Evaluator:
         """
         Calls the reset() method of all metrics to clear the accumulated state.

-        Returns:
-
+        :return:
         """
         self.metrics_wrapper.reset()

@@ -359,6 +358,11 @@ class _MetricsWrapper:
             metric.update(res)

     def reset(self):
+        """
+        Resets the state stored in the Metric.
+
+        :return:
+        """
         for metric in self._metrics:
             if _is_allennlp_metric(metric):
                 metric.get_metric(reset=True)


+ 1 - 1   fastNLP/core/controllers/loops/evaluate_batch_loop.py

@@ -34,7 +34,7 @@ class EvaluateBatchLoop(Loop):
             except BaseException as e:
                 if callable(getattr(dataloader, 'get_batch_indices', None)):
                     indices = dataloader.get_batch_indices()
-                    logger.debug(f"The following exception happens when running on samples: {indices}")
+                    logger.error(f"Exception happens when evaluating on samples: {indices}")
                 raise e

             self.batch_step_fn(evaluator, batch)


+ 1 - 1   fastNLP/core/controllers/loops/train_batch_loop.py

@@ -32,7 +32,7 @@ class TrainBatchLoop(Loop):
                     break
             except BaseException as e:
                 if indices and not isinstance(e, EarlyStopException):
-                    logger.debug(f"The following exception happens when running on samples: {indices}")
+                    logger.error(f"Exception happens when running on samples: {indices}")
                 raise e

             trainer.on_train_batch_begin(batch, indices)


+ 20 - 11   fastNLP/core/controllers/trainer.py

@@ -282,32 +282,41 @@ class Trainer(TrainerEventTrigger):

         :kwargs:
             * *torch_kwargs* -- parameters for configuring the concrete driver instance when ``driver`` is 'torch':
                 * ddp_kwargs -- parameters passed to ``DistributedDataParallel`` at initialization when ``TorchDDPDriver`` is used;
                   e.g. pass in {'find_unused_parameters': True} to resolve errors caused by parameters that take no part in the
                   forward computation;
                 * set_grad_to_none -- whether grad is set to None after every optimizer update during training;
                 * torch_non_blocking -- the non_blocking argument for the to method of pytorch tensors;
+            * *paddle_kwargs* -- parameters for configuring the concrete driver instance when ``driver`` is 'paddle':
+                * fleet_kwargs -- parameters passed to ``DataParallel`` and ``fleet`` at initialization when ``PaddleFleetDriver``
+                  is used, including:
+                    * is_collective -- whether to use paddle's collective distributed training; currently only True is supported;
+                    * role_maker -- the ``RoleMaker`` used to initialize the ``fleet`` distributed training API
+                    * other parameters used to initialize ``DataParallel``;
             * *data_device* -- a concrete driver instance has a ``model_device`` and a ``data_device``; the former is the device
              the model lives on, the latter the device data should be moved to when ``model_device`` is None;

                .. note::

                In the vast majority of cases you will not use this parameter!

                1. When the driver instance's ``model_device`` is not None, this parameter has no effect;
                2. For pytorch, the driver instance's ``model_device`` is None only when the user launches via
                   ``python -m torch.distributed.launch`` and initializes ``init_process_group`` on their own;
+               3. For paddle, this parameter has no effect;

            * *use_dist_sampler* -- whether a distributed ``sampler`` is used. With multiple cards, the distributed ``sampler``
              automatically decides which samples each card reads, so that within one epoch the samples of all cards add up to one
              full pass over the dataset. Set by default according to whether the driver is distributed.
            * *evaluate_use_dist_sampler* -- whether the dataloader's ``sampler`` is replaced with a distributed ``sampler`` in the
              ``Evaluator`` when running distributed; ``True`` by default;
            * *output_from_new_proc* -- a string describing how the output streams of the other processes of a multi-process driver
              should be handled; should be one of ["all", "ignore", "only_error"]; any other value is taken as the name of a folder:
              the output streams of the other ranks are redirected into log files, which are stored in the folder this value names;
              "only_error" by default;

                Note that this parameter only takes effect when a distributed ``driver`` such as ``TorchDDPDriver`` is used;
            * *progress_bar* -- how progress is displayed; currently one of [None, 'raw', 'rich', 'auto'] or a RichCallback /
              RawTextCallback object; 'auto' by default, meaning RichCallback is used if the current terminal is detected to be
              interactive and RawTextCallback otherwise. To customize the progress bar, e.g. its print frequency, pass in a
              RichCallback or RawTextCallback object.
            * *train_input_mapping* -- same as input_mapping, but used only in the ``Trainer``. Mutually exclusive with input_mapping.
            * *train_output_mapping* -- same as output_mapping, but used only in the ``Trainer``. Mutually exclusive with output_mapping.
            * *evaluate_input_mapping* -- same as input_mapping, but used only in the ``Evaluator``. Mutually exclusive with input_mapping.
@@ -558,7 +567,7 @@ class Trainer(TrainerEventTrigger):
         else:
             raise FileNotFoundError("You are using `resume_from`, but we can not find your specific file.")

-        if self.evaluator is not None and num_eval_sanity_batch > 0:
+        if self.evaluator is not None and num_eval_sanity_batch != 0:
             logger.info(f"Running evaluator sanity check for {num_eval_sanity_batch} batches.")
             self.on_sanity_check_begin()
             sanity_check_res = self.evaluator.run(num_eval_batch_per_dl=num_eval_sanity_batch)
@@ -951,7 +960,7 @@ class Trainer(TrainerEventTrigger):
         self.driver.save_model(folder, only_state_dict, **kwargs)
         self.driver.barrier()

-    def load_model(self, folder: Union[str, Path, BinaryIO, io.BytesIO], only_state_dict: bool = False,
+    def load_model(self, folder: Union[str, Path, BinaryIO, io.BytesIO], only_state_dict: bool = True,
                    model_load_fn: Optional[Callable] = None, **kwargs):
         """
         Loads the model

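A construction sketch for the kwargs documented above (model and dataloader are placeholders; the ddp_kwargs value is the docstring's own example):

    from fastNLP import Trainer

    trainer = Trainer(
        model=model, driver='torch', device=[0, 1],
        train_dataloader=train_dl,
        torch_kwargs={
            'ddp_kwargs': {'find_unused_parameters': True},  # forwarded to DistributedDataParallel
            'set_grad_to_none': True,                        # grads set to None after each optimizer update
        },
    )
    trainer.run()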

+ 2 - 2   fastNLP/core/dataloaders/paddle_dataloader/fdl.py

@@ -162,9 +162,9 @@ class PaddleDataLoader(DataLoader):

     def get_batch_indices(self) -> List[int]:
         """
-        Gets the idx of the current batch
+        Gets the index of each piece of data in the current ``batch``.

-        :return:
+        :return: the indices of the current ``batch`` of data;
         """
         return self.cur_batch_indices




+ 1 - 1   fastNLP/core/dataloaders/prepare_dataloader.py

@@ -10,7 +10,7 @@ from ..samplers import RandomBatchSampler, RandomSampler
 from .torch_dataloader import prepare_torch_dataloader
 from .paddle_dataloader import prepare_paddle_dataloader
 from .jittor_dataloader import prepare_jittor_dataloader
-from ...envs import FASTNLP_BACKEND, SUPPORT_BACKENDS, _module_available
+from ...envs import FASTNLP_BACKEND, SUPPORT_BACKENDS
 from ..log import logger






+ 2 - 2   fastNLP/core/dataloaders/torch_dataloader/fdl.py

@@ -170,9 +170,9 @@ class TorchDataLoader(DataLoader):

     def get_batch_indices(self) -> List[int]:
         """
-        Gets the idx of the current batch
+        Gets the index of each piece of data in the current ``batch``.

-        :return:
+        :return: the indices of the current ``batch`` of data;
         """
         return self.cur_batch_indices

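The clarified contract is typically used the way this commit's loop code uses it: to report which samples produced a failing batch. A hedged sketch (dataset and model are placeholders):

    from fastNLP import TorchDataLoader

    dl = TorchDataLoader(train_ds, batch_size=8, shuffle=True)
    for batch in dl:
        indices = dl.get_batch_indices()  # index of each sample in the current batch
        try:
            model(batch)
        except Exception:
            print(f"Exception happens when running on samples: {indices}")
            raise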



+ 18 - 11   fastNLP/core/dataset/dataset.py

@@ -400,15 +400,22 @@ class DataSet:
                     new_field_name: str = None, num_proc: int = 0,
                     progress_desc: str = None, show_progress_bar: bool = True):
         r"""
-        Passes the field named `field_name` of each instance in the DataSet to func and collects its return value.
-
-        :param field_name: which field is passed to func.
-        :param func: its input is the content of the field named `field_name` in the instance.
-        :param new_field_name: puts the content returned by func into the field `new_field_name`; if the name matches an
-            existing field, that field is overwritten. If None, no new field is created.
-        :param num_proc: the number of processes. Note that, due to the nature of the python language, that many processes
-            cause that many times the memory growth.
-        :param progress_desc: the value of progress_desc, 'Main' by default
-        :param show_progress_bar: whether to show a progress bar, shown by default
+        Passes the ``field`` named ``field_name`` of every ``instance`` in the :class:`~DataSet` to the function ``func`` and
+        writes the result into ``new_field_name``.
+
+        :param field_name: the name of the ``field`` passed to ``func``;
+        :param func: the function that processes the given ``field``; note that its input is the content of the ``field`` named
+            ``field_name`` in the ``instance``;
+        :param new_field_name: the name of the ``field`` the results are written into. The content returned by ``func`` is put
+            into the ``field`` corresponding to ``new_field_name``; note that a ``field`` of the same name is overwritten. If
+            ``None``, no ``field`` is overwritten or created;
+        :param num_proc: the number of processes to use.
+            .. note::
+                Due to the nature of the ``python`` language, setting this parameter causes a corresponding multiple of memory
+                growth, which may have a certain impact on the execution of your program.

+        :param progress_desc: the description string of the progress bar, ``Main`` by default;
+        :param show_progress_bar: whether a progress bar is shown during processing;
+        :return: the return values obtained from the function ``func``;
         """
         assert len(self) != 0, "Null DataSet cannot use apply_field()."
         if not self.has_field(field_name=field_name):
@@ -451,8 +458,8 @@ class DataSet:
         apply_out = self._apply_process(num_proc, func, progress_desc=progress_desc,
                                         show_progress_bar=show_progress_bar, _apply_field=field_name)
         # Only the first result is checked for being a dict; if it is, all return values are assumed to be dicts, otherwise an error is raised.
-        if not isinstance(apply_out[0], dict):
-            raise Exception("The result of func is not a dict")
+        if not isinstance(apply_out[0], Mapping):
+            raise Exception(f"The result of func is not a Mapping, but a {type(apply_out[0])}")

         for key, value in apply_out[0].items():
             results[key] = [value]

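A short example of the apply_field contract documented above (field names are placeholders):

    from fastNLP import DataSet

    ds = DataSet({'text': [['a', 'b', 'c'], ['d', 'e']]})
    # Write len(text) of every instance into a new 'seq_len' field. When func
    # returns a Mapping instead, its keys are spread across fields, which is
    # what the relaxed isinstance(apply_out[0], Mapping) check above validates.
    ds.apply_field(lambda text: len(text), field_name='text', new_field_name='seq_len')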

+ 0 - 2   fastNLP/core/drivers/__init__.py

@@ -9,7 +9,6 @@ __all__ = [
     "JittorDriver",
     "JittorSingleDriver",
     "JittorMPIDriver",
-    "TorchPaddleDriver",
     'torch_seed_everything',
     'paddle_seed_everything',
     'optimizer_state_to_device'
@@ -18,7 +17,6 @@ __all__ = [
 from .torch_driver import TorchDriver, TorchSingleDriver, TorchDDPDriver, torch_seed_everything, optimizer_state_to_device
 from .jittor_driver import JittorDriver, JittorMPIDriver, JittorSingleDriver
 from .paddle_driver import PaddleDriver, PaddleFleetDriver, PaddleSingleDriver, paddle_seed_everything
-from .torch_paddle_driver import TorchPaddleDriver
 from .driver import Driver






+ 2 - 2   fastNLP/core/drivers/choose_driver.py

@@ -23,9 +23,9 @@ def choose_driver(model, driver: Union[str, Driver], device: Optional[Union[int,
     elif driver in {"jittor"}:
         from fastNLP.core.drivers.jittor_driver.initialize_jittor_driver import initialize_jittor_driver
         return initialize_jittor_driver(driver, device, model, **kwargs)
-    elif driver in {"paddle", "fleet"}:
+    elif driver in {"paddle"}:
         from fastNLP.core.drivers.paddle_driver.initialize_paddle_driver import initialize_paddle_driver
         return initialize_paddle_driver(driver, device, model, **kwargs)
     else:
         raise ValueError("Parameter `driver` can only be one of these values: ['torch', 'fairscale', "
-                         "'jittor', 'paddle', 'fleet'].")
+                         "'jittor', 'paddle'].")

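Since 'fleet' is no longer an accepted driver string, multi-card paddle training is now selected through the device list alone. A sketch (model and dataloader are placeholders):

    from fastNLP import Trainer

    # Trainer(driver='fleet', ...) would now hit the ValueError above.
    # With driver='paddle' and several devices, initialize_paddle_driver
    # returns a PaddleFleetDriver; with a single device, a PaddleSingleDriver.
    trainer = Trainer(model=model, driver='paddle', device=[0, 1],
                      train_dataloader=train_dl)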
+ 10 - 6   fastNLP/core/drivers/jittor_driver/initialize_jittor_driver.py

@@ -7,18 +7,22 @@ from fastNLP.envs.imports import _NEED_IMPORT_JITTOR
 if _NEED_IMPORT_JITTOR:
     import jittor

+__all__ = []
+
 def initialize_jittor_driver(driver: str, device: Union[str, int, List[int]], model: jittor.Module, **kwargs) -> JittorDriver:
     r"""
-    Determines and initializes a concrete `Driver` instance from the parameters `driver` and `device` and returns it;
-    in this function we determine the mode of the JittorDriver from the device the user set.
+    Determines and initializes a concrete ``Driver`` instance from the parameter ``device`` and returns it.
+
+    .. todo::
+
+        Create the multi-card driver

-    :param driver: the value of this parameter should be one of: ["jittor"];
-    :param device: the device jittor runs on
+    :param driver: the value of this parameter should be one of ``["jittor"]``
+    :param device: the device ``jittor`` runs on
     :param model: the concrete model to train or evaluate;
     :param kwargs:

-    :return: a tuple whose first element is the concrete jittor-based `Driver` instance and whose second element is the name of
-        that driver (used to detect ordering problems between successive drivers in one script);
+    :return: a :class:`~fastNLP.core.JittorSingleDriver` or :class:`~fastNLP.core.JittorMPIDriver` instance;
     """

     if driver not in {"jittor"}:


+ 11 - 1   fastNLP/core/drivers/jittor_driver/jittor_driver.py

@@ -24,7 +24,17 @@ if _NEED_IMPORT_JITTOR:

 class JittorDriver(Driver):
     r"""
-    The Driver for the Jittor framework
+    The ``Driver`` for the ``Jittor`` framework

+    .. note::
+
+        This is a feature under development; stay tuned.
+
+    .. todo::
+
+        Implement the fp16 setting and support switching between cpu and gpu;
+        Implement the save and load functions used for resuming training from a breakpoint;
+
     """

     def __init__(self, model, fp16: bool = False, **kwargs):


+ 8 - 0   fastNLP/core/drivers/jittor_driver/mpi.py

@@ -13,6 +13,14 @@ __all__ = [
 ]

 class JittorMPIDriver(JittorDriver):
+    """
+    The ``Driver`` for distributed training under the ``Jittor`` framework.
+
+    .. note::
+
+        This is a feature under development; stay tuned.
+
+    """
     def __init__(
             self,
             model,


+ 11 - 7   fastNLP/core/drivers/jittor_driver/single_device.py

@@ -16,8 +16,17 @@ __all__ = [

 class JittorSingleDriver(JittorDriver):
     r"""
-    For cpu and single-card gpu computation
-    TODO: fp16 for jittor
+    The ``Driver`` for cpu and single-card gpu computation under the ``Jittor`` framework.
+
+    .. note::
+
+        This is a feature under development; stay tuned.
+
+    .. todo::
+
+        Support switching between cpu and gpu;
+        Implement the set_dist_repro_dataloader function that replaces the dataloader for resuming training from a breakpoint
+
     """

     def __init__(self, model, device=None, fp16: bool = False, **kwargs):
@@ -30,11 +39,6 @@ class JittorSingleDriver(JittorDriver):
         self.world_size = 1

     def step(self):
-        """
-        The step function of jittor optimizers can take the loss as an argument,
-        in which case zero_grad and backward are performed at the same time;
-        for uniformity, we do not use it that way here for now.
-        """
         for optimizer in self.optimizers:
             optimizer.step()




+ 2 - 1   fastNLP/core/drivers/jittor_driver/utils.py

@@ -5,10 +5,11 @@ from fastNLP.envs.imports import _NEED_IMPORT_JITTOR
 if _NEED_IMPORT_JITTOR:
     import jittor

+__all__ = []
+
 class DummyGradScaler:
     """
     A mock GradScaler object, to avoid writing a large number of repeated if checks
-
     """
     def __init__(self, *args, **kwargs):
         pass


+ 14
- 27
fastNLP/core/drivers/paddle_driver/fleet.py View File

@@ -1,8 +1,6 @@
import os import os
from typing import List, Union, Optional, Dict, Tuple, Callable from typing import List, Union, Optional, Dict, Tuple, Callable


from fastNLP.core.utils.paddle_utils import get_device_from_visible

from .paddle_driver import PaddleDriver from .paddle_driver import PaddleDriver
from .fleet_launcher import FleetLauncher from .fleet_launcher import FleetLauncher
from .utils import ( from .utils import (
@@ -19,7 +17,9 @@ from fastNLP.core.utils import (
check_user_specific_params, check_user_specific_params,
is_in_paddle_dist, is_in_paddle_dist,
is_in_paddle_dist, is_in_paddle_dist,
get_paddle_device_id,
) )
from fastNLP.core.utils.paddle_utils import _convert_data_device
from fastNLP.envs.distributed import rank_zero_rm from fastNLP.envs.distributed import rank_zero_rm
from fastNLP.core.samplers import ( from fastNLP.core.samplers import (
ReproduceBatchSampler, ReproduceBatchSampler,
@@ -31,7 +31,12 @@ from fastNLP.core.samplers import (
re_instantiate_sampler, re_instantiate_sampler,
conversion_between_reproducible_and_unrepeated_sampler, conversion_between_reproducible_and_unrepeated_sampler,
) )
from fastNLP.envs.env import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_GLOBAL_SEED, FASTNLP_NO_SYNC
from fastNLP.envs.env import (
FASTNLP_DISTRIBUTED_CHECK,
FASTNLP_GLOBAL_SEED,
FASTNLP_NO_SYNC,
USER_CUDA_VISIBLE_DEVICES,
)
from fastNLP.core.log import logger from fastNLP.core.log import logger


if _NEED_IMPORT_PADDLE: if _NEED_IMPORT_PADDLE:
@@ -51,7 +56,7 @@ class PaddleFleetDriver(PaddleDriver):
     def __init__(
         self,
         model,
-        parallel_device: Optional[Union[List[int], int]],
+        parallel_device: Optional[Union[List[str], str]],
         is_pull_by_paddle_run: bool = False,
         fp16: bool = False,
         **kwargs
@@ -185,6 +190,8 @@ class PaddleFleetDriver(PaddleDriver):
        In any case, at the end of its `setup` function `PaddleFleetDriver` records the pids of all processes, so that when one
        process hits an exception, the driver's on_exception function is called by the trainer and uses os.kill to kill the other processes;
        """
+        if USER_CUDA_VISIBLE_DEVICES not in os.environ:
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
         super(PaddleFleetDriver, self).__init__(model, fp16=fp16, **kwargs)

         # if the driver is not started via launch, the user is required to pass in parallel_device
@@ -213,25 +220,6 @@ class PaddleFleetDriver(PaddleDriver):
                                  "you initialize the paddle distributed process out of our control.")

             self.outside_fleet = True
-            # The user can only wrap the model with DataParallel after it has been moved to the target machine, so if the
-            # user initialized Fleet outside, PaddleFleetDriver simply sets model_device to None;
-            self._model_device = None
-
-            # When the parameter `device` is None and this parameter is not None, the data is moved to the specified machine;
-            self._data_device = kwargs.get("data_device", None)
-            if self._data_device is not None:
-                if isinstance(self._data_device, int):
-                    if self._data_device < 0:
-                        raise ValueError("Parameter `data_device` can not be smaller than 0.")
-                    _could_use_device_num = paddle.device.cuda.device_count()
-                    if self._data_device >= _could_use_device_num:
-                        raise ValueError("The gpu device that parameter `device` specifies does not exist.")
-                    self._data_device = f"gpu:{self._data_device}"
-                elif not isinstance(self._data_device, str):
-                    raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.")
-                if self.outside_fleet and paddle.device.get_device() != self._data_device:
-                    logger.warning("Parameter `data_device` is not equal to paddle.device.get_device(), "
-                                   "please keep them equal to avoid some potential bugs.")

         self.world_size = None
         self.global_rank = 0
@@ -304,7 +292,8 @@ class PaddleFleetDriver(PaddleDriver):
         else:
             # already set once; the parameters must stay identical
             pre_gpus = os.environ[FASTNLP_DISTRIBUTED_CHECK]
-            pre_gpus = [int (x) for x in pre_gpus.split(",")]
+            pre_gpus = [int(x) for x in pre_gpus.split(",")]
+            cur_gpus = [get_paddle_device_id(g) for g in self.parallel_device]
             if sorted(pre_gpus) != sorted(self.parallel_device):
                 raise RuntimeError("Notice you are using `PaddleFleetDriver` after one instantiated `PaddleFleetDriver`, it is not "
                                    "allowed that your second `PaddleFleetDriver` has a new setting of parameters `parallel_device`.")
@@ -410,8 +399,6 @@ class PaddleFleetDriver(PaddleDriver):

     @property
     def data_device(self):
-        if self.outside_fleet:
-            return self._data_device
         return self.model_device

     def model_call(self, batch, fn: Callable, signature_fn: Optional[Callable]) -> Dict:
@@ -565,7 +552,7 @@ class PaddleFleetDriver(PaddleDriver):

     def broadcast_object(self, obj, src:int=0, group=None, **kwargs):
         # since CUDA_VISIBLE_DEVICES has been reset, using data_device directly may cause errors
-        device = get_device_from_visible(self.data_device)
+        device = _convert_data_device(self.data_device)
         return fastnlp_paddle_broadcast_object(obj, src, device=device, group=group)

     def all_gather(self, obj, group=None) -> List:
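
To make the re-instantiation guard in the hunk above concrete: the first fleet driver records its device ids in an environment variable, and any later driver in the same script must use the same set. A minimal standalone sketch (illustrative only; the helper name `check_same_devices` is invented, and `FASTNLP_DISTRIBUTED_CHECK` stands in for fastNLP's env constant):

    import os

    FASTNLP_DISTRIBUTED_CHECK = "FASTNLP_DISTRIBUTED_CHECK"

    def check_same_devices(parallel_device_ids):
        # The first driver registers its devices; later drivers must match them exactly.
        if FASTNLP_DISTRIBUTED_CHECK not in os.environ:
            os.environ[FASTNLP_DISTRIBUTED_CHECK] = ",".join(map(str, parallel_device_ids))
            return
        pre_gpus = [int(x) for x in os.environ[FASTNLP_DISTRIBUTED_CHECK].split(",")]
        if sorted(pre_gpus) != sorted(parallel_device_ids):
            raise RuntimeError("A second `PaddleFleetDriver` may not change `parallel_device`.")

    check_same_devices([0, 2, 3])
    check_same_devices([3, 2, 0])   # fine: same set of devices
    # check_same_devices([0, 1])    # would raise RuntimeError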


+5 -2  fastNLP/core/drivers/paddle_driver/fleet_launcher.py

@@ -11,11 +11,14 @@ from fastNLP.envs.env import (
     FASTNLP_LOG_LEVEL,
     FASTNLP_GLOBAL_SEED,
 )
+from fastNLP.core.utils import get_paddle_device_id
 from .utils import (
     find_free_ports,
     reset_seed,
 )

+__all__ = []

 # records the information of each process
 class SubTrainer(object):
     """
@@ -34,11 +37,11 @@ class FleetLauncher:
     """
     def __init__(
         self,
-        devices: List[int],
+        devices: List[str],
         output_from_new_proc: str = "only_error"
     ):

-        self.devices = devices
+        self.devices = [get_paddle_device_id(g) for g in devices]
         self.output_from_new_proc = output_from_new_proc

         self.setup()


+28 -18  fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py

@@ -7,50 +7,58 @@ from .single_device import PaddleSingleDriver
 from .fleet import PaddleFleetDriver

 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
-from fastNLP.core.utils import is_in_paddle_launch_dist
+from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
+from fastNLP.core.utils import is_in_paddle_launch_dist, get_paddle_gpu_str
 from fastNLP.core.log import logger

 if _NEED_IMPORT_PADDLE:
     import paddle

+__all__ = []

 def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[int]]],
                              model: "paddle.nn.Layer", **kwargs) -> PaddleDriver:
     r"""
-    Determines and initializes a concrete `Driver` instance according to the parameters `driver` and `device`, then returns it;
-    1. If the current process is detected to have been pulled up by the user via `python -m paddle.distributed.launch xxx.py`,
-       the devices are automatically set to the ones the user specified (thanks to the special setup performed when fastNLP is
-       imported, they can be read from `CUDA_VISIBLE_DEVICES`)
-    2. If the input `driver` is `paddle` but `device` contains several devices, we give a warning and automatically return a multi-card Driver
-    3. If the input `driver` is `fleet` but `device` has only one device, we give a warning but still return a multi-card Driver
+    Determines and initializes a concrete ``Driver`` instance according to the parameter ``device``.
+
+    1. If the current process is detected to have been pulled up by the user via ``python -m paddle.distributed.launch xxx.py``,
+       the devices are automatically set to the ones the user specified (since we require that ``backend`` be set for
+       distributed training, they can be read from ``CUDA_VISIBLE_DEVICES``)
+    2. If ``device`` contains several devices, a :class:`~fastNLP.core.PaddleFleetDriver` instance is returned; otherwise a
+       single-card :class:`~fastNLP.core.PaddleSingleDriver` instance is returned

-    :param driver: the `driver` type to use; only `paddle` is supported in this function
-    :param device: this parameter follows the same format as the `device` parameter of `Trainer`;
+    :param driver: the ``driver`` type to use; only ``paddle`` is supported in this function
+    :param device: this parameter follows the same format as the ``device`` parameter of ``Trainer``;
     :param model: the concrete model to train or evaluate;

-    :return: the constructed `Driver` instance.
+    :return: a :class:`~fastNLP.core.PaddleSingleDriver` or :class:`~fastNLP.core.PaddleFleetDriver` instance;
     """
     if driver != "paddle":
         raise ValueError("When initialize PaddleDriver, parameter `driver` must be 'paddle'.")
+    user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
     if is_in_paddle_launch_dist():
+        if user_visible_devices is None:
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
         if device is not None:
             logger.warning_once("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull "
-                                "up your script. And we will directly get the local device via "
-                                "`os.environ['CUDA_VISIBLE_DEVICES']`.")
-        device = [int(g) for g in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
-        # TODO for now one process corresponds to exactly one card, so temporarily pass a single int
+                                "up your script. And we will directly get the local device via environment variables.")
+        _visible_list = user_visible_devices.split(",")
+        device = [f"gpu:{_visible_list.index(g)}" for g in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
+        # TODO for now one process corresponds to exactly one card, so temporarily pass a single device
         return PaddleFleetDriver(model, device[0], True, **kwargs)

-    user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
     if user_visible_devices is None:
-        raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
-                           "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
-    _could_use_device_num = len(user_visible_devices.split(","))
+        _could_use_device_num = paddle.device.cuda.device_count()
+    else:
+        _could_use_device_num = len(user_visible_devices.split(","))

     if isinstance(device, int):
         if device < 0 and device != -1:
             raise ValueError("Parameter `device` can only be '-1' when it is smaller than 0.")
         if device >= _could_use_device_num:
             raise ValueError("The gpu device that parameter `device` specifies does not exist.")
         if device == -1:
-            device = list(range(_could_use_device_num))
+            device = [get_paddle_gpu_str(g) for g in range(_could_use_device_num)]
     elif isinstance(device, Sequence) and not isinstance(device, str):
         device = list(set(device))
         for each in device:
@@ -61,8 +69,10 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
             elif each >= _could_use_device_num:
                 raise ValueError("When parameter `device` is 'Sequence' type, the value in it should not be bigger than"
                                  " the available gpu number.")
+        device = [get_paddle_gpu_str(g) for g in device]
     elif device is not None and not isinstance(device, str):
         raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.")
     if isinstance(device, List):
         return PaddleFleetDriver(model, device, **kwargs)
     else:
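
The normalization above turns the user-facing `device` argument into a list of `gpu:x` strings. A hedged sketch of that logic (the function name is invented; `available` is assumed to be the number of usable cards):

    def normalize_paddle_devices(device, available: int):
        # -1 means "use all available cards"; ints and int lists become "gpu:x" strings.
        if isinstance(device, int):
            if device == -1:
                return [f"gpu:{g}" for g in range(available)]
            if device < 0 or device >= available:
                raise ValueError(f"Invalid device {device}.")
            return [f"gpu:{device}"]
        if isinstance(device, (list, tuple)):
            return [f"gpu:{g}" for g in sorted(set(device))]
        return [device]

    print(normalize_paddle_devices(-1, 2))      # ['gpu:0', 'gpu:1']
    print(normalize_paddle_devices([2, 0], 4))  # ['gpu:0', 'gpu:2']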


+5 -2  fastNLP/core/drivers/paddle_driver/paddle_driver.py

@@ -7,10 +7,13 @@ from dataclasses import dataclass

 import numpy as np

+from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
+
 from .utils import _build_fp16_env, optimizer_state_to_device, DummyGradScaler
 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
 from fastNLP.core.drivers.driver import Driver
-from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device, get_device_from_visible
+from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device
+from fastNLP.core.utils.paddle_utils import _convert_data_device
 from fastNLP.envs import (
     FASTNLP_SEED_WORKERS,
     FASTNLP_MODEL_FILENAME,
@@ -369,7 +372,7 @@ class PaddleDriver(Driver):

         :return: the batch object moved to the specified machine;
         """
-        device = get_device_from_visible(self.data_device)
+        device = _convert_data_device(self.data_device)
         return paddle_move_data_to_device(batch, device)

     @staticmethod


+6 -10  fastNLP/core/drivers/paddle_driver/single_device.py

@@ -8,10 +8,10 @@ from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
 from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
 from fastNLP.core.utils import (
     auto_param_call,
-    get_device_from_visible,
     get_paddle_gpu_str,
     get_paddle_device_id,
 )
+from fastNLP.core.utils.paddle_utils import _convert_data_device
 from fastNLP.core.utils.utils import _get_fun_msg
 from fastNLP.core.samplers import (
     ReproducibleBatchSampler,
@@ -40,9 +40,6 @@ class PaddleSingleDriver(PaddleDriver):
             raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`")

         cuda_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
-        if cuda_visible_devices is None:
-            raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
-                               "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
         if cuda_visible_devices == "":
             device = "cpu"
             logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
@@ -54,11 +51,9 @@ class PaddleSingleDriver(PaddleDriver):
             raise ValueError("Parameter `device` can not be None in `PaddleSingleDriver`.")

         if device != "cpu":
-            if isinstance(device, int):
-                device_id = device
-            else:
-                device_id = get_paddle_device_id(device)
-            os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices.split(",")[device_id]
+            device_id = get_paddle_device_id(device)
+            if cuda_visible_devices is not None:
+                os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices.split(",")[device_id]
         self.model_device = get_paddle_gpu_str(device)

         self.local_rank = 0
@@ -69,7 +64,8 @@ class PaddleSingleDriver(PaddleDriver):
         r"""
         This function initializes the training environment: it sets the device for the current run and moves the model onto it.
         """
-        device = get_device_from_visible(self.model_device, output_type=str)
+        device = _convert_data_device(self.data_device)

         paddle.device.set_device(device)
         with contextlib.redirect_stdout(None):
             self.model.to(device)
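
The single-device path above narrows `CUDA_VISIBLE_DEVICES` down to the one card the driver uses, indexing into the devices the user originally made visible. A small standalone sketch of that remapping (the helper name is hypothetical):

    import os

    def narrow_to_single_device(device_id: int):
        # If fastNLP saved the user's original visible devices, make only the
        # chosen card visible to the current process.
        visible = os.environ.get("USER_CUDA_VISIBLE_DEVICES")
        if visible is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = visible.split(",")[device_id]

    os.environ["USER_CUDA_VISIBLE_DEVICES"] = "3,4,5,6"
    narrow_to_single_device(1)
    print(os.environ["CUDA_VISIBLE_DEVICES"])  # 4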


+7 -8  fastNLP/core/drivers/torch_driver/initialize_torch_driver.py

@@ -10,19 +10,18 @@ from .ddp import TorchDDPDriver
 from fastNLP.core.log import logger
 from fastNLP.envs import FASTNLP_BACKEND_LAUNCH

+__all__ = []

 def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.device", int, List[int]]],
                             model: "torch.nn.Module", **kwargs) -> TorchDriver:
     r"""
-    Determines and initializes a concrete `Driver` instance according to the parameters `driver` and `device`, then returns it;
-    note that an error is raised directly if the input `device` does not match the `driver`;
+    Determines and initializes a concrete ``Driver`` instance according to the parameters ``driver`` and ``device``, then returns it;

-    :param driver: the value should be one of: ["torch", "torch_ddp", "fairscale"];
-    :param device: this parameter follows the same format as the `device` parameter of `Trainer`;
+    :param driver: the value should be one of: ``["torch", "fairscale"]``
+    :param device: this parameter follows the same format as the ``device`` parameter of ``Trainer``;
     :param model: the concrete model to train or evaluate;

-    :return: a tuple whose first element is the concrete pytorch-based `Driver` instance, and whose second element is the name
-        of that driver (used to detect ordering problems between successive drivers in one script);
+    :return: a :class:`~fastNLP.core.TorchSingleDriver` or :class:`~fastNLP.core.TorchDDPDriver` instance;
     """
     # world_size and rank
     if FASTNLP_BACKEND_LAUNCH in os.environ:
@@ -55,8 +54,8 @@ def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.devi
             elif each < 0:
                 raise ValueError("When parameter `device` is 'Sequence' type, the value in it should be bigger than 0.")
             elif each >= _could_use_device_num:
-                raise ValueError("When parameter `device` is 'Sequence' type, the value in it should not be bigger than"
-                                 " the available gpu number.")
+                raise ValueError(f"When parameter `device` is 'Sequence' type, the value in it should not be bigger than"
+                                 f" the available gpu number:{_could_use_device_num}.")
         device = [torch.device(f"cuda:{w}") for w in device]
     elif device is not None and not isinstance(device, torch.device):
         raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.")


+6 -0  fastNLP/core/drivers/torch_driver/torch_driver.py

@@ -167,6 +167,12 @@ class TorchDriver(Driver):
         """
         model = self.unwrap_model()
         res = torch.load(filepath, map_location='cpu')
+        if isinstance(res, dict) and only_state_dict is False:
+            logger.rank_zero_warning(f"It seems that {filepath} only contains state, you may need to use "
+                                     f"`only_state_dict=True`")
+        elif not isinstance(res, dict) and only_state_dict is True:
+            logger.rank_zero_warning(f"It seems that {filepath} is not state, you may need to use "
+                                     f"`only_state_dict=False`")
         if only_state_dict:
             model.load_state_dict(res)
         else:
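
The two warnings added above implement a simple heuristic: a checkpoint that loads as a plain dict is almost certainly a state dict, while anything else is a pickled full model. A minimal sketch of the same idea using only the public torch API (`print` stands in for the library's rank-zero warning):

    import torch

    def load_with_hint(filepath: str, only_state_dict: bool):
        # state_dict() returns an OrderedDict, so isinstance(res, dict) separates
        # "state only" checkpoints from pickled nn.Module objects.
        res = torch.load(filepath, map_location="cpu")
        if isinstance(res, dict) and not only_state_dict:
            print(f"{filepath} looks like a state dict; consider only_state_dict=True")
        elif not isinstance(res, dict) and only_state_dict:
            print(f"{filepath} looks like a full model; consider only_state_dict=False")
        return res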


+0 -5  fastNLP/core/drivers/torch_paddle_driver/__init__.py

@@ -1,5 +0,0 @@
-__all__ = [
-    "TorchPaddleDriver",
-]
-
-from .torch_paddle_driver import TorchPaddleDriver

+0 -193  fastNLP/core/drivers/torch_paddle_driver/torch_paddle_driver.py

@@ -1,193 +0,0 @@
-from typing import Optional, Dict, Union, Callable, Tuple
-
-from fastNLP.envs.imports import _NEED_IMPORT_PADDLE, _NEED_IMPORT_TORCH
-from fastNLP.core.utils.utils import _get_fun_msg
-
-if _NEED_IMPORT_PADDLE:
-    import paddle
-    from paddle.io import DataLoader as PaddleDataLoader
-    from paddle.optimizer import Optimizer as PaddleOptimizer
-
-if _NEED_IMPORT_TORCH:
-    import torch
-    from torch.utils.data import DataLoader as TorchDataLoader
-    from torch.optim import Optimizer as TorchOptimizer
-
-from fastNLP.core.drivers.driver import Driver
-from fastNLP.envs.distributed import rank_zero_call
-from fastNLP.core.utils.utils import auto_param_call, apply_to_collection
-from fastNLP.core.log.logger import logger
-from fastNLP.modules.mix_modules.mix_module import MixModule
-
-
-__all__ = [
-    "TorchPaddleDriver",
-]
-
-class TorchPaddleDriver(Driver):
-    """
-    Driver for models that mix torch and paddle.
-    Since the two frameworks differ, multi-card support is inconvenient; only single-card CPU and GPU are implemented for now.
-    """
-    def __init__(self, model, device: Optional[str] = None, **kwargs):
-        super(TorchPaddleDriver, self).__init__(model)
-
-        self.model_device = device
-        self.torch_non_blocking = kwargs.get("torch_non_blocking", None)
-        self.paddle_blocking = kwargs.get("paddle_blocking", None)
-
-        self._data_device = kwargs.get("_data_device", None)
-        if isinstance(self._data_device, int):
-            # turn data_device into a string of the form cuda:x
-            if self._data_device < 0:
-                raise ValueError("Parameter `_data_device` can not be smaller than 0.")
-            _could_use_device_num = paddle.device.cuda.device_count()
-            if self._data_device >= _could_use_device_num:
-                raise ValueError("The gpu device that parameter `device` specifies does not exist.")
-            self._data_device = f"cuda:{self._data_device}"
-        elif self._data_device is not None:
-            raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.")
-
-    def setup(self):
-        if self.model_device is not None:
-            paddle.device.set_device(self.model_device.replace("cuda", "gpu"))
-            self.model.to(self.model_device)
-
-    @staticmethod
-    def check_dataloader_legality(dataloader, dataloader_name, is_train: bool = False):
-        if is_train:
-            if not isinstance(dataloader, (TorchDataLoader, PaddleDataLoader)):
-                raise ValueError(f"Parameter `{dataloader_name}` should be 'torch.util.data.DataLoader' or `paddle.io.dataloader` type, not {type(dataloader)}.")
-        else:
-            if not isinstance(dataloader, Dict):
-                raise ValueError(f"Parameter `{dataloader_name}` should be 'Dict' type, not {type(dataloader)}.")
-            else:
-                for each_dataloader in dataloader.values():
-                    if not isinstance(each_dataloader, (TorchDataLoader, PaddleDataLoader)):
-                        raise ValueError(f"Each dataloader of parameter `{dataloader_name}` should be "
-                                         f"'torch.util.data.DataLoader' or `paddle.io.dataloader` "
-                                         f"type, not {type(each_dataloader)}.")
-
-    @staticmethod
-    def _check_optimizer_legality(optimizers):
-        for each_optimizer in optimizers:
-            if not isinstance(each_optimizer, (TorchOptimizer, PaddleOptimizer)):
-                raise ValueError(f"Each optimizers of parameter `optimizers` should be "
-                                 f"'torch.optim.Optimizer' or 'paddle.optimizers.Optimizer' type, "
-                                 f"not {type(each_optimizer)}.")
-
-    def step(self):
-        for optimizer in self.optimizers:
-            optimizer.step()
-
-    def backward(self, loss):
-        loss.backward()
-
-    def zero_grad(self):
-        for optimizer in self.optimizers:
-            if isinstance(optimizer, TorchOptimizer):
-                optimizer.zero_grad()
-            elif isinstance(optimizer, PaddleOptimizer):
-                optimizer.clear_grad()
-            else:
-                raise ValueError("Unknown optimizers type.")
-
-    def model_call(self, batch, fn: Callable, signature_fn: Optional[Callable]) -> Dict:
-        if isinstance(batch, Dict) and not self.wo_auto_param_call:
-            return auto_param_call(fn, batch, signature_fn=signature_fn)
-        else:
-            return fn(batch)
-
-    def get_model_call_fn(self, fn: str) -> Tuple:
-        if hasattr(self.model, fn):
-            fn = getattr(self.model, fn)
-            if not callable(fn):
-                raise RuntimeError(f"The `{fn}` attribute is not `Callable`.")
-            logger.debug(f'Use {_get_fun_msg(fn, with_fp=False)}...')
-            return fn, None
-        elif fn in {"train_step", "evaluate_step"}:
-            logger.debug(f'Use {_get_fun_msg(self.model.forward, with_fp=False)}...')
-            return self.model, self.model.forward
-        else:
-            raise RuntimeError(f"There is no `{fn}` method in your {type(self.model)}.")
-
-    def predict_step(self, batch):
-        if isinstance(batch, Dict):
-            return auto_param_call(self._predict_step, batch)
-        else:
-            return self._predict_step(batch)
-
-    @rank_zero_call
-    def save_model(self, filepath: str, only_state_dict: bool = True, model_save_fn: Optional[Callable] = None):
-        r"""
-        Saving the whole model is not provided for now
-        """
-        if only_state_dict == False:
-            logger.warn("TorchPaddleModule only support saving state dicts now.")
-        if model_save_fn is not None:
-            model_save_fn(filepath)
-        else:
-            model = self.unwrap_model()
-            self.move_model_to_device(model, "cpu")
-            self.model.save(filepath)
-            self.move_model_to_device(model, self.model_device)
-
-    def load_model(self, filepath: str):
-        """
-        The loading function of the model;
-
-        :param filepath: the location of the saved file (including the file name);
-        :return:
-        """
-        return self.model.load(filepath)
-
-    def save(self):
-        ...
-
-    def load(self):
-        ...
-
-    @staticmethod
-    def move_model_to_device(model: MixModule, device: str):
-        if device is not None:
-            model.to(device)
-
-    def unwrap_model(self):
-        return self.model
-
-    @staticmethod
-    def tensor_to_numeric(tensor):
-        if tensor is None:
-            return None
-
-        def _translate(_data):
-            return _data.tolist()
-
-        return apply_to_collection(
-            data=tensor,
-            dtype=(paddle.Tensor, torch.Tensor),
-            function=_translate
-        )
-
-    def set_model_mode(self, mode: str):
-        assert mode in {"train", "eval"}
-        getattr(self.model, mode)()
-
-    def get_model_device(self):
-        return self.model_device
-
-    @property
-    def data_device(self):
-        if self.model_device is not None:
-            return self.model_device
-        else:
-            return self._data_device
-
-    def set_model_mode(self, mode: str):
-        assert mode in {"train", "eval"}
-        getattr(self.model, mode)()
-
-    def set_sampler_epoch(self, dataloader: Union['TorchDataLoader', 'PaddleDataLoader'], cur_epoch_idx):
-        # keeps shuffle=True correct in ddp training: the random seed of each process's sampler shuffle must be the same;
-        return dataloader

+0 -4  fastNLP/core/drivers/torch_paddle_driver/utils.py

@@ -1,4 +0,0 @@
-from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
-
-if _NEED_IMPORT_PADDLE:
-    pass

+4 -2  fastNLP/core/log/print.py

@@ -1,7 +1,7 @@
 __all__ = [
     'print'
 ]
+from logging import INFO
 from .logger import logger

@@ -22,4 +22,6 @@ def print(*args, sep=' ', end='\n', file=None, flush=False):
     :return:
     """
     line = sep.join(map(str, args))
-    logger.info(line)
+    if logger.isEnabledFor(INFO):
+        kwargs = logger._add_rank_info({})
+        logger._log(INFO, line, args, **kwargs)
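
The rewritten `print` above checks the INFO level before formatting and then calls two private logger helpers. A hedged sketch of the same behavior built purely on the standard `logging` module (it deliberately avoids the private `_log`/`_add_rank_info` calls, so it is not fastNLP's exact code):

    import logging

    logger = logging.getLogger("sketch")
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    def print_(*args, sep=" "):
        # Pre-join the arguments so they are not mistaken for %-style format parameters.
        if logger.isEnabledFor(logging.INFO):
            logger.info(sep.join(map(str, args)))

    print_("loss:", 0.25, "acc:", 0.9)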

+4 -2  fastNLP/core/metrics/backend/paddle_backend/backend.py

@@ -1,12 +1,14 @@
+import os
 from typing import List, Any

 import numpy as np

 from fastNLP.core.metrics.backend import Backend
-from fastNLP.core.utils.paddle_utils import paddle_to, get_device_from_visible
+from fastNLP.core.utils.paddle_utils import paddle_to, _convert_data_device
 from fastNLP.core.metrics.utils import AggregateMethodError
 from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather
 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
+from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES

 if _NEED_IMPORT_PADDLE:
     import paddle

@@ -79,7 +81,7 @@ class PaddleBackend(Backend):
             raise ValueError(f"tensor: {tensor} can not convert to ndarray!")

     def move_tensor_to_device(self, tensor, device):
-        device = get_device_from_visible(device)
+        device = _convert_data_device(device)
         return paddle_to(tensor, device)

     def all_gather_object(self, obj, group=None) -> List:


+1 -1  fastNLP/core/metrics/metric.py

@@ -84,7 +84,7 @@ class Metric:
     def _sync_get_metric(self, get_metric):
         @functools.wraps(get_metric)
         def _wrap_get_metric(*args, **kwargs):
-            assert self._updated, f"You have to call `{self.__class__.__name__}` update() function before calling " \
+            assert self._updated, f"You have to call `{self.__class__.__name__}`'s update() function before calling " \
                                   f"get_metric()."
             with self.sync(recover=True, aggregate=self.aggregate_when_get_metric):
                 results = get_metric(*args, **kwargs)


+18 -11  fastNLP/core/samplers/reproducible_batch_sampler.py

@@ -366,17 +366,22 @@ class BucketedBatchSampler(ReproducibleBatchSampler):
     def __init__(self, dataset, length: Union[List[int], str], batch_size:int = 32, num_batch_per_bucket:int = 10,
                  shuffle: bool = True, drop_last: bool = False, seed: int = 0, **kwargs):
         """
-        First sorts the samples by length, then treats batch_size*num_batch_per_bucket samples as one bucket; samples are
-        only combined within that bucket, so each batch needs relatively little padding (the lengths within a bucket are close).
+        First sorts the ``sample`` s by length, then treats batch_size*num_batch_per_bucket samples as one bucket;
+        ``sample`` s are only combined within that bucket, so each ``batch`` needs relatively little ``padding``
+        (the lengths within a bucket are close).

         :param dataset: a data container implementing the __len__ method.
-        :param length: if a List, it must have the same length as dataset and give the length of each element in dataset;
-            a str is only supported when the dataset passed in is a fastNLP DataSet, in which case the str is taken as the
-            name of a field of the dataset; if the elements of that field are int, the value is taken as the sample length,
-            otherwise len() is used to get the length of this field in every sample.
+        :param length: the length of each piece of data.
+
+            * when it is a ``List[int]``
+              it must have the same length as dataset and give the length of each element in dataset;
+            * when it is a ``str``
+              only allowed when the ``dataset`` passed in is a :class:`fastNLP.DataSet`; the `str` is taken as the name
+              of a ``field`` of the ``dataset``. If the elements of that field are ``int``, the value is taken as the
+              sample length; otherwise the ``len`` method is used to get the length of every element of the ``field``.
         :param batch_size: the size of each batch
-        :param num_batch_per_bucket: how many batches form one bucket; data is only shuffled within a bucket.
-        :param shuffle: if False, no shuffle is performed and the data is actually output from longest to shortest.
-        :param drop_last: whether to drop the last batch when its samples cannot be filled up to batch_size.
+        :param num_batch_per_bucket: how many ``batch`` es form one bucket; data is only ``shuffle`` d within a bucket
+        :param shuffle: if ``False``, no ``shuffle`` is performed and the data is actually output from longest to shortest.
+        :param drop_last: whether to drop the last ``batch`` when its ``sample`` s cannot be filled up to ``batch_size``.
         :param seed: the random seed to use
         :param kwargs: reserved for fastNLP
         """
@@ -386,10 +391,12 @@ class BucketedBatchSampler(ReproducibleBatchSampler):
             if not isinstance(length[0], int):
                 length = list(map(len, length))
         else:
-            assert len(length) == len(dataset), "When the dataset is not fastNLP.DataSet, " \
-                                                "the length parameter can only be List[int]"
+            types = set(map(type, length))
+            assert isinstance(length, list) and len(types)==1 and types.pop()==int, \
+                "When the dataset is not fastNLP.DataSet, the length parameter can only be List[int]"

-            assert len(length) == len(dataset), "The length of `data` and `length` should be equal."
+            assert len(length) == len(dataset), f"The length of `dataset`({len(dataset)}) and " \
+                                                f"`length`({len(length)}) should be equal."

         self.dataset = dataset
         self.length = np.array(length, dtype=int)  # indices arranged from long to short.
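
To illustrate the bucketing scheme the docstring describes, here is a self-contained sketch (not the sampler's actual implementation, and without its reproducibility machinery): sort by length, cut the sorted order into buckets of batch_size * num_batch_per_bucket samples, and shuffle only within each bucket so every batch stays length-homogeneous.

    import random

    def bucketed_batches(lengths, batch_size=2, num_batch_per_bucket=2, seed=0):
        rng = random.Random(seed)
        order = sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True)
        bucket = batch_size * num_batch_per_bucket
        batches = []
        for start in range(0, len(order), bucket):
            chunk = order[start:start + bucket]
            rng.shuffle(chunk)  # shuffle only inside the bucket
            batches.extend(chunk[i:i + batch_size] for i in range(0, len(chunk), batch_size))
        rng.shuffle(batches)  # randomize batch order while padding stays small
        return batches

    print(bucketed_batches([5, 3, 9, 2, 7, 8, 1, 6]))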


+18 -10  fastNLP/core/samplers/reproducible_sampler.py

@@ -55,6 +55,7 @@ class ReproducibleSampler:
 class RandomSampler(ReproducibleSampler):
     def __init__(self, dataset, shuffle: bool = True, seed: int = 0, **kwargs):
         """
+        A Sampler that iterates in random order.

         :param dataset: a data container implementing the __len__ method
         :param shuffle: whether to shuffle the order on every iteration.
@@ -169,9 +170,8 @@ class RandomSampler(ReproducibleSampler):
     def set_epoch(self, epoch: int) -> None:
         self.epoch = epoch

-    def set_distributed(self, num_replicas, rank, pad=True):
+    def set_distributed(self, num_replicas:int, rank:int, pad:bool=True):
         """
-        This method is essentially the unfinished part of initialization in the ddp case and should be called immediately after the sampler itself is initialized;

         :param num_replicas:
         :param rank:
@@ -215,7 +215,7 @@ class RandomSampler(ReproducibleSampler):
 class SequentialSampler(RandomSampler):
     def __init__(self, dataset, **kwargs):
         """
-        Reads the dataset in order. In the multi-card case the reads are interleaved: with two cards, card 0 takes [0,2,4,..] and card 1 takes [1,3,5...].
+        Reads the ``dataset`` in order. In the multi-card case the reads are interleaved: with two cards, card 0 takes ``[0,2,4,..]`` and card 1 takes ``[1,3,5...]``.

         :param dataset: a data container implementing the __len__ method.
         :param kwargs:
@@ -285,13 +285,20 @@ class SequentialSampler(RandomSampler):
 class SortedSampler(SequentialSampler):
     def __init__(self, dataset, length:Union[str, List], **kwargs):
         """
-        Iterates the data in dataset from longest to shortest according to length. In the multi-card case, because of padding, the last sample may be the longest one.
+        Iterates the data in ``dataset`` from longest to shortest according to ``length``. In the multi-card case,
+        because of ``padding``, the last ``sample`` may be the longest one.

         :param dataset: a data container implementing the __len__ method.
-        :param length: if a List, it must have the same length as dataset and give the length of each element in dataset;
-            a str is only supported when the dataset passed in is a fastNLP DataSet, in which case the str is taken as the
-            name of a field of the dataset.
-        :param seed: the random seed to use
-        :param kwargs: reserved for fastNLP
+        :param length: the length of each piece of data.
+
+            * when it is a ``List[int]``
+              it must have the same length as dataset and give the length of each element in dataset;
+            * when it is a ``str``
+              only allowed when the ``dataset`` passed in is a :class:`fastNLP.DataSet`; the `str` is taken as the name
+              of a ``field`` of the ``dataset``. If the elements of that field are ``int``, the value is taken as the
+              sample length; otherwise the ``len`` method is used to get the length of every element of the ``field``.
+        :param seed: the random seed to use.
+        :param kwargs: reserved for fastNLP.
         """
         super().__init__(dataset=dataset, **kwargs)
         if isinstance(dataset, DataSet) and isinstance(length, str):
@@ -299,8 +306,9 @@ class SortedSampler(SequentialSampler):
             if not isinstance(length[0], int):
                 length = list(map(len, length))
         else:
-            assert len(length) == len(dataset), "When the dataset is not fastNLP.DataSet, " \
-                                                "the length parameter can only be List[int]"
+            types = set(map(type, length))
+            assert isinstance(length, list) and len(types)==1 and types.pop()==int, \
+                "When the dataset is not fastNLP.DataSet, the length parameter can only be List[int]"

             assert len(length) == len(dataset), "The length of `data` and `length` should be equal."
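
The interleaved multi-card reads that `SequentialSampler` documents (card 0 takes [0, 2, 4, ...], card 1 takes [1, 3, 5, ...]) reduce to strided indexing. A tiny illustrative sketch:

    def strided_indices(n: int, num_replicas: int, rank: int):
        # Each replica reads every num_replicas-th sample, starting at its rank.
        return list(range(rank, n, num_replicas))

    print(strided_indices(7, num_replicas=2, rank=0))  # [0, 2, 4, 6]
    print(strided_indices(7, num_replicas=2, rank=1))  # [1, 3, 5]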




+1 -4  fastNLP/core/utils/__init__.py

@@ -2,7 +2,6 @@ __all__ = [
     'cache_results',
     'is_jittor_dataset',
     'jittor_collate_wraps',
-    'get_device_from_visible',
     'paddle_to',
     'paddle_move_data_to_device',
     'get_paddle_device_id',
@@ -11,7 +10,6 @@ __all__ = [
     'is_in_fnlp_paddle_dist',
     'is_in_paddle_launch_dist',
    'f_rich_progress',
-    'torch_paddle_move_data_to_device',
     'torch_move_data_to_device',
     'get_fn_arg_names',
     'auto_param_call',
@@ -29,10 +27,9 @@ __all__ = [

 from .cache_results import cache_results
 from .jittor_utils import is_jittor_dataset, jittor_collate_wraps
-from .paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
+from .paddle_utils import paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
     is_in_fnlp_paddle_dist, is_in_paddle_launch_dist
 from .rich_progress import f_rich_progress
-from .torch_paddle_utils import torch_paddle_move_data_to_device
 from .torch_utils import torch_move_data_to_device
 from .utils import *




+1 -1  fastNLP/core/utils/dummy_class.py

@@ -1,4 +1,4 @@
-import functools
+__all__ = []

 class DummyClass:
     def __init__(self, *args, **kwargs):


+8 -1  fastNLP/core/utils/jittor_utils.py

@@ -15,6 +15,12 @@ from fastNLP.core.dataset import Instance


 def is_jittor_dataset(dataset) -> bool:
+    """
+    Checks whether the given ``dataset`` is a :class:`jittor.dataset.Dataset`.
+
+    :param dataset: the dataset;
+    :return: whether the given ``dataset`` is of ``jittor``'s dataset type;
+    """
     try:
         if isinstance(dataset, jt.dataset.Dataset):
             return True
@@ -26,7 +32,8 @@ def is_jittor_dataset(dataset) -> bool:

 def jittor_collate_wraps(func, auto_collator: Callable):
     """
-    Wraps jittor's collate_fn: if the dataset is a mapping type, auto_collator is used; otherwise jittor's own collate_batch is kept.
+    Wraps ``jittor``'s ``collate_fn``: if the dataset is a ``mapping`` type, ``auto_collator`` is used; otherwise
+    ``jittor``'s own ``collate_batch`` is kept.

     :param func:
     :param auto_collator:


+75 -63  fastNLP/core/utils/paddle_utils.py

@@ -1,5 +1,4 @@
 __all__ = [
-    "get_device_from_visible",
     "paddle_to",
     "paddle_move_data_to_device",
     "get_paddle_gpu_str",
@@ -21,73 +20,90 @@ if _NEED_IMPORT_PADDLE:

 from .utils import apply_to_collection

-def get_device_from_visible(device: Union[str, int], output_type=int):
+def _convert_data_device(device: Union[str, int]) -> str:
     """
-    Gets the corresponding device when CUDA_VISIBLE_DEVICES is set.
-    E.g. with CUDA_VISIBLE_DEVICES=2,3 and device=3, 1 is returned.
-
-    :param device: the unconverted device name
-    :param output_type: the type of the return value
-    :return: the converted device id
-    """
-    if output_type not in [int, str]:
-        raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
-    if device == "cpu":
-        return device
-    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
-    user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
-    if user_visible_devices is None:
-        raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
-                           "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
-    idx = get_paddle_device_id(device)
-    # use USER_CUDA_VISIBLE_DEVICES to get the device the user expects
-    if user_visible_devices is None:
-        raise RuntimeError("This situation cannot happen, please report a bug to us.")
-    idx = user_visible_devices.split(",")[idx]
-
-    cuda_visible_devices_list = cuda_visible_devices.split(',')
-    if idx not in cuda_visible_devices_list:
-        raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}]. ")
-    res = cuda_visible_devices_list.index(idx)
-    if output_type == int:
-        return res
-    else:
-        return f"gpu:{res}"
+    Converts the ``data_device`` of a ``driver``. If the user has set ``FASTNLP_BACKEND=paddle``, ``fastNLP`` saves
+    the visible devices in ``USER_CUDA_VISIBLE_DEVICES`` and sets ``CUDA_VISIBLE_DEVICES`` to the first visible card;
+    this is done so that ``paddle``'s distributed training runs smoothly.
+    In that situation, simply using ``driver.data_device`` is invalid. For example, when the devices are set to
+    ``[0,2,3]`` in distributed training and the user has set ``CUDA_VISIBLE_DEVICES=3,4,5,6``, the process of ``rank1``
+    sees::
+
+        os.environ["CUDA_VISIBLE_DEVICES"] = "5"
+        os.environ["USER_CUDA_VISIBLE_DEVICES"] = "3,4,5,6"
+        driver.data_device = "gpu:2"  # kept as the device the user set rather than "gpu:5", to avoid ambiguity
+
+    This function then converts ``data_device`` to ``gpu:0``: index **2** selects device **5** in
+    ``USER_CUDA_VISIBLE_DEVICES``, and device **5** is found at index **0** of ``CUDA_VISIBLE_DEVICES``.
+
+    .. note::
+
+        In the distributed case, where each process holds exactly one card, this function is effectively equivalent to
+        returning ``gpu:0`` directly.
+
+    :param device: the unconverted device;
+    :return: the converted device, in the ``gpu:x`` format;
+    """
+    try:
+        user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
+        if device == "cpu" or user_visible_devices is None:
+            # the device is the CPU, or USER_CUDA_VISIBLE_DEVICES is not set;
+            # no conversion is needed in either case
+            return get_paddle_gpu_str(device)
+
+        idx = get_paddle_device_id(device)
+        idx = user_visible_devices.split(",")[idx]
+        # CUDA_VISIBLE_DEVICES is guaranteed not to be None at this point
+        cuda_visible_devices_list = os.getenv("CUDA_VISIBLE_DEVICES").split(',')
+        return f"gpu:{cuda_visible_devices_list.index(idx)}"
+    except Exception as e:
+        raise ValueError(f"Can't convert device {device} when USER_CUDA_VISIBLE_DEVICES={user_visible_devices} "
+                         "and CUDA_VISIBLE_DEVICES={cuda_visible_devices}. If this situation happens, please report this bug to us.")

-def paddle_to(data, device: Union[str, int]):
+def paddle_to(data: "paddle.Tensor", device: Union[str, int]) -> "paddle.Tensor":
     """
-    Moves `data` to the given `device`
+    Moves ``data`` to the given ``device``. ``paddle.Tensor`` has no ``to`` function like that of ``torch.Tensor``;
+    this function merely combines :func:`paddle.Tensor.cpu` and :func:`paddle.Tensor.cuda`.

-    :param data: the tensor to move
-    :param device: the target device, a `str` or an `int`
-    :return: the moved tensor
+    :param data: the tensor to move;
+    :param device: the target device, of type ``str`` or ``int``;
+    :return: the moved tensor;
     """

     if device == "cpu":
         return data.cpu()
     else:
-        # device = get_device_from_visible(device, output_type=int)
         return data.cuda(get_paddle_device_id(device))


-def get_paddle_gpu_str(device: Union[str, int]):
+def get_paddle_gpu_str(device: Union[str, int]) -> str:
     """
-    Gets a device name of the `gpu:x` form
+    Gets a device name in the ``gpu:x`` format::
+
+        >>> get_paddle_gpu_str(1)
+        'gpu:1'
+        >>> get_paddle_gpu_str("cuda:1")
+        'gpu:1'

-    :param device: the device number or the device name
-    :return: the corresponding device name in the `gpu:x` format
+    :param device: the device number or the device name;
+    :return: the corresponding device name in the ``gpu:x`` format;
     """
     if isinstance(device, str):
         return device.replace("cuda", "gpu")
     return f"gpu:{device}"


-def get_paddle_device_id(device: Union[str, int]):
+def get_paddle_device_id(device: Union[str, int]) -> int:
     """
-    Gets the gpu device id
+    Gets the device id of ``device``::
+
+        >>> get_paddle_device_id("gpu:1")
+        1
+        >>> get_paddle_device_id("gpu")
+        0
+
+    Note that ``cpu`` must not be passed to this function.

-    :param device: the device number or the device name
-    :return: the corresponding device number
+    :param device: the device number or the device name;
+    :return: the corresponding device number;
     """
     if isinstance(device, int):
         return device
@@ -109,21 +125,17 @@ def get_paddle_device_id(device: Union[str, int]):

     return device_id

-def paddle_move_data_to_device(batch: Any, device: Optional[str] = None,
-                               data_device: Optional[str] = None) -> Any:
+def paddle_move_data_to_device(batch: Any, device: Optional[Union[str, int]]) -> Any:
     r"""
-    Moves a collection of data to the given device. Only paddle.Tensor objects are moved; everything else is left unchanged
+    Moves a collection of ``paddle`` data to the given device. Only :class:`paddle.Tensor` objects are moved;
+    everything else is left unchanged

-    :param batch:
-    :param device: `cpu`, `gpu` or `gpu:x`
-    :param data_device:
-    :return: the same collection, but with all contained tensors residing on the new device
+    :param batch: the collection of data to move;
+    :param device: the target device. It may be a gpu device id, or a string in the ``cpu``, ``gpu`` or ``gpu:x``
+        format; when this parameter is ``None``, nothing is done.
+    :return: the collection of data moved to the new device
     """
     if device is None:
-        if data_device is not None:
-            device = data_device
-        else:
-            return batch
+        return batch

     def batch_to(data: Any) -> Any:
         return paddle_to(data, device)
@@ -131,22 +143,22 @@ def paddle_move_data_to_device(batch: Any, device: Optional[str] = None,
     return apply_to_collection(batch, dtype=paddle.Tensor, function=batch_to)


-def is_in_paddle_dist():
+def is_in_paddle_dist() -> bool:
     """
-    Checks whether this is a distributed process, judged by global_rank and selected_gpus
+    Checks whether this is a ``paddle`` distributed process, judged by ``PADDLE_RANK_IN_NODE`` and ``FLAGS_selected_gpus``.
     """
     return ('PADDLE_RANK_IN_NODE' in os.environ and 'FLAGS_selected_gpus' in os.environ)


-def is_in_fnlp_paddle_dist():
+def is_in_fnlp_paddle_dist() -> bool:
     """
-    Checks whether this is a distributed process pulled up by FastNLP
+    Checks whether this is a ``paddle`` distributed process pulled up by ``fastNLP``
     """
     return FASTNLP_DISTRIBUTED_CHECK in os.environ


-def is_in_paddle_launch_dist():
+def is_in_paddle_launch_dist() -> bool:
     """
-    Checks whether this is a distributed process started by launch
+    Checks whether this is a ``paddle`` distributed process started via ``python -m paddle.distributed.launch``
     """
     return FASTNLP_BACKEND_LAUNCH in os.environ
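
The `_convert_data_device` docstring above walks through a two-step lookup; the following standalone sketch reproduces exactly that example (illustrative only, skipping the cpu branch and the error handling of the real function):

    import os

    def convert_data_device(device: str) -> str:
        # Step 1: index into the devices the user originally made visible.
        # Step 2: locate that physical card inside the current CUDA_VISIBLE_DEVICES.
        user_visible = os.environ["USER_CUDA_VISIBLE_DEVICES"].split(",")
        cuda_visible = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
        physical = user_visible[int(device.split(":")[1])]
        return f"gpu:{cuda_visible.index(physical)}"

    os.environ["USER_CUDA_VISIBLE_DEVICES"] = "3,4,5,6"
    os.environ["CUDA_VISIBLE_DEVICES"] = "5"
    print(convert_data_device("gpu:2"))  # gpu:0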

+2 -3  fastNLP/core/utils/rich_progress.py

@@ -1,7 +1,6 @@
 """
-This file provides fastNLP with a unified progress bar manager: only by sharing one Task object can the progress bar of the
-trainer and the progress bar of evaluation avoid conflicting
-
+This file provides ``fastNLP`` with a unified ``progress bar`` manager: only by sharing one ``Task`` object can the
+``progress bar`` of :class:`~fastNLP.core.Trainer` and that of :class:`~fastNLP.core.Evaluator` avoid conflicting
 """
 import sys
 from typing import Any, Union, Optional


+0 -49  fastNLP/core/utils/torch_paddle_utils.py

@@ -1,49 +0,0 @@
-from typing import Any, Optional
-
-from fastNLP.envs.imports import _NEED_IMPORT_PADDLE, _NEED_IMPORT_TORCH
-
-if _NEED_IMPORT_PADDLE:
-    import paddle
-
-if _NEED_IMPORT_TORCH:
-    import torch
-
-__all__ = [
-    "torch_paddle_move_data_to_device",
-]
-
-from .utils import apply_to_collection
-from .paddle_utils import paddle_to
-
-
-def torch_paddle_move_data_to_device(batch: Any, device: Optional[str] = None, non_blocking: Optional[bool] = True,
-                                     data_device: Optional[str] = None) -> Any:
-    r"""
-    Moves a collection of data to the given device. Only paddle.Tensor and torch.Tensor objects are moved; everything else is left unchanged
-
-    :param batch:
-    :param device:
-    :param non_blocking:
-    :param data_device:
-    :return: the same collection, but with all contained tensors residing on the new device;
-    """
-
-    if device is None:
-        if data_device is not None:
-            device = data_device
-        else:
-            return batch
-
-    torch_device = device.replace("gpu", "cuda")
-    paddle_device = device.replace("cuda", "gpu")
-
-    def batch_to(data: Any) -> Any:
-        if isinstance(data, torch.Tensor):
-            data = data.to(torch_device, non_blocking=non_blocking)
-        elif isinstance(data, paddle.Tensor):
-            data = paddle_to(data, paddle_device)
-        return data
-
-    return apply_to_collection(batch, dtype=(paddle.Tensor, torch.Tensor), function=batch_to)

+5 -5  fastNLP/core/utils/torch_utils.py

@@ -44,12 +44,12 @@ class TorchTransferableDataType(ABC):
 def torch_move_data_to_device(batch: Any, device: Optional[Union[str, "torch.device"]] = None,
                               non_blocking: Optional[bool] = True) -> Any:
     r"""
-    Moves a collection of data to the given device. Any object defining a method "to(device)" is moved; all other objects in the collection are left unchanged;
+    In ``pytorch``, moves the data collection ``batch`` to the given device. Any object defining a method ``to(device)`` is moved; all other objects in the collection are left unchanged;

-    :param batch: the data to move;
-    :param device: the device the data should be moved to; when this parameter is None, moving the data is left to the user and we do nothing
-    :param non_blocking: parameter of pytorch's data-moving method `to`;
-    :return: the same collection, but with all contained tensors residing on the new device
+    :param batch: the data to move;
+    :param device: the device the data should be moved to; when this parameter is ``None``, nothing is done
+    :param non_blocking: parameter of ``pytorch``'s data-moving method ``to``;
+    :return: the collection of data moved to the new device
     """
     if device is None:
         return batch


+99 -84  fastNLP/core/utils/utils.py

@@ -10,10 +10,6 @@ from typing import Callable, List, Any, Dict, AnyStr, Union, Mapping, Sequence
 from typing import Tuple, Optional
 from time import sleep

-try:
-    from typing import Literal, Final
-except ImportError:
-    from typing_extensions import Literal, Final
 import os
 from contextlib import contextmanager
 from functools import wraps
@@ -22,7 +18,6 @@ import numpy as np
 from pathlib import Path

 from fastNLP.core.log import logger
-from ...envs import SUPPORT_BACKENDS


 __all__ = [
@@ -43,10 +38,16 @@

 def get_fn_arg_names(fn: Callable) -> List[str]:
     r"""
-    Returns the names of all parameters of a function;
+    This function returns the names of all parameters of a function::
+
+        >>> def function(a, b=1):
+        ...     return a
+        ...
+        >>> get_fn_arg_names(function)
+        ['a', 'b']

     :param fn: the function to inspect;
-    :return: a list whose elements are the string names of the function's parameters;
+    :return: a list containing the parameter names of ``fn``
     """
     return list(inspect.signature(fn).parameters)


@@ -54,24 +55,18 @@ def get_fn_arg_names(fn: Callable) -> List[str]:
 def auto_param_call(fn: Callable, *args, signature_fn: Optional[Callable] = None,
                     mapping: Optional[Dict[AnyStr, AnyStr]] = None) -> Any:
     r"""
-    This function looks up the values matching the parameter names of the input function in *args (whose elements must all
-    be dicts) and calls the function with them; if the input data does not match fn's parameters, it can be converted through
-    the mapping parameter. A (key, value) pair in mapping means: find the value under this key in *args and pass it to the
-    parameter named value.
-
-    1. This function lets the user realize automatic calls through string matching;
-    2. Note that mapping defaults to None; if you want to specify how inputs correspond to the function's parameters, you
-       should pass mapping in as such a dict; if mapping is not None, we always first use mapping to rewrite the keys of the
-       input dicts, so please check the correctness of mapping yourself;
-    3. If a parameter of the input function has a default value, the default is used when no value for it appears in the
-       inputs; otherwise the input value is used;
-    4. If the input function is a `partial` function, the situation is the same as in 3., i.e. the same as with default parameters;
-
-    :param fn: the function that does the actual computation; its parameters may have default values;
-    :param args: a series of positional arguments, which should all be dicts; the actual parameters needed by `fn` are extracted from them;
-    :param signature_fn: a function used to replace the signature of `fn`; if this parameter is not None, we first extract the
-        signature from this function, extract the parameter values through that signature, and then pass them to `fn` for the actual computation;
-    :param mapping: a dict used to rewrite the keys of the preceding dicts;
-
-    :return: the result of running `fn`;
+    This function looks up the values matching the parameter names of the input function in ``*args`` (whose elements are
+    all of type ``dict``) and calls the function with them; if the input data does not match the parameters of ``fn``, it
+    can be converted through the ``mapping`` parameter. A ``(key, value)`` pair in ``mapping`` means: find the value under
+    ``key`` in ``*args`` and pass it to the parameter named ``value``.
+
+    1. This function lets the user realize automatic calls through string matching;
+    2. Note that ``mapping`` defaults to ``None``; if you want to specify how inputs correspond to the function's
+       parameters, you should pass ``mapping`` in as a dict; if ``mapping`` is not ``None``, we always first use
+       ``mapping`` to rewrite the keys of the input dicts, so please check the correctness of ``mapping`` yourself;
+    3. If a parameter of the input function has a default value, the default is used when no value for it appears in the
+       inputs; otherwise the input value is used;
+    4. If the input function is a ``partial`` function, the situation is the same as in the third point, i.e. the same as with default parameters;

     Examples::

         >>> # 1
         >>> loss_fn = CrossEntropyLoss()  # if the parameters it needs are def CrossEntropyLoss(y, pred);
         >>> batch = {"x": 20, "y": 1}
@@ -84,6 +79,14 @@ def auto_param_call(fn: Callable, *args, signature_fn: Optional[Callable] = None
         >>> print(auto_param_call(test_fn, {"x": 10}, {"y": 20, "a": 30}))  # res: 70
         >>> print(auto_param_call(partial(test_fn, a=100), {"x": 10}, {"y": 20}))  # res: 140
         >>> print(auto_param_call(partial(test_fn, a=100), {"x": 10}, {"y": 20, "a": 200}))  # res: 240
+
+    :param fn: the function that does the actual computation; its parameters may have default values;
+    :param args: a series of positional arguments, which should all be dicts; the actual parameters needed by ``fn`` are extracted from them;
+    :param signature_fn: a function used to replace the signature of ``fn``; if this parameter is not ``None``, we first
+        extract the signature from this function, extract the parameter values through that signature, and then pass them to ``fn`` for the actual computation;
+    :param mapping: a dict used to rewrite the keys of the preceding dicts;
+
+    :return: the result of running ``fn``;
     """

     if signature_fn is not None:
@@ -164,13 +167,13 @@ def _get_keys(args:List[Dict]) -> List[List[str]]:

 def _get_fun_msg(fn, with_fp=True)->str:
     """
-    Gets basic information about a function, to help with error reporting
-    ex:
-        print(_get_fun_msg(_get_fun_msg))
-        # `_get_fun_msg(fn) -> str`(In file:/Users/hnyan/Desktop/projects/fastNLP/fastNLP/fastNLP/core/utils/utils.py)
+    Gets basic information about a function, to help with error reporting::
+
+        >>> print(_get_fun_msg(_get_fun_msg))
+        `_get_fun_msg(fn) -> str`(In file:/Users/hnyan/Desktop/projects/fastNLP/fastNLP/fastNLP/core/utils/utils.py)

     :param callable fn:
-    :param with_fp: whether to include the information of the file the function lives in
+    :param with_fp: whether to include the information of the file the function lives in;
     :return:
     """
     if isinstance(fn, functools.partial):
@@ -226,13 +229,13 @@ def _check_valid_parameters_number(fn, expected_params:List[str], fn_name=None):

 def check_user_specific_params(user_params: Dict, fn: Callable):
     """
-    This function uses the user's input to assign values to the parameters of the given function;
-    mainly used in situations where the user cannot call the function directly;
-    its main purpose is to help check whether the user's input for the parameters of function fn is correct;
+    This function uses the user's input to assign values to the parameters of the given function, mainly in situations
+    where the user cannot call the function directly;
+    its main purpose is to help check whether the user's input for the parameters of function ``fn`` is correct;

-    :param user_params: the values of the user-specified parameters; a dict whose keys are the parameter names and whose values are the intended values of the parameters;
-    :param fn: the function that will be called;
-    :return: a dict holding the parameter values that will actually be passed in when fn is called later;
+    :param user_params: the values of the user-specified parameters; should be a dict whose ``key`` s are parameter
+        names and whose ``value`` s are the values of the parameters;
+    :param fn: the function that will be called;
+    :return: a dict holding the parameter values that will actually be passed in when ``fn`` is called later;
     """

     fn_arg_names = get_fn_arg_names(fn)
@@ -243,6 +246,9 @@ def check_user_specific_params(user_params: Dict, fn: Callable):


 def dataclass_to_dict(data: "dataclasses.dataclass") -> Dict:
+    """
+    Converts the given ``dataclass`` instance into a dict.
+    """
     if not is_dataclass(data):
         raise TypeError(f"Parameter `data` can only be `dataclass` type instead of {type(data)}.")
     _dict = dict()
@@ -253,23 +259,33 @@ def dataclass_to_dict(data: "dataclasses.dataclass") -> Dict:

 def match_and_substitute_params(mapping: Optional[Union[Callable, Dict]] = None, data: Optional[Any] = None) -> Any:
     r"""
-    Replaces the keys of the input (batch) or the output (outputs) according to `mapping`;
-    this function is applied for `input_mapping` and `output_mapping`;
-    for `input_mapping`, it is called in `TrainBatchLoop` right after the data is fetched;
-    for `output_mapping`, it is called right after the results are obtained in `Trainer.train_step` and `Evaluator.train_step`;
+    Replaces the keys of the input ``batch`` or the output ``outputs`` according to ``mapping``;
+    this function is applied for ``input_mapping`` and ``output_mapping``;
+
+    * for ``input_mapping``, it is called in :class:`~fastNLP.core.controllers.TrainBatchLoop` right after the data is fetched;
+    * for ``output_mapping``, it is called right after the results are obtained in :meth:`~fastNLP.core.Trainer.train_step`
+      of :class:`~fastNLP.core.Trainer` and :meth:`~fastNLP.core.Evaluator.train_step` of :class:`~fastNLP.core.Evaluator`;

-    1. If `mapping` is a function, `mapping(data)` is returned directly;
-    2. If `mapping` is a `Dict`, then `data` may only be one of the following three types: [`Dict`, `dataclass`, `Sequence`];
-       if `data` is a `Dict`, the function replaces the keys of `data` with mapping[key];
-       if `data` is a `dataclass`, the function first converts it to a `Dict` via `dataclasses.asdict` and then converts it;
-       if `data` is a `Sequence`, the function first turns it into a corresponding `Dict`: {"_0": list[0], "_1": list[1], ...},
-       then converts this `Dict` with mapping; keys that do not match any key of mapping keep the "_number" form.
+    The conversion logic, in order of priority:
+
+    1. If ``mapping`` is a function, ``mapping(data)`` is returned directly;
+    2. If ``mapping`` is a ``Dict``, then ``data`` may only be one of the following three types: ``[Dict, dataclass, Sequence]``;
+
+       * if ``data`` is a ``Dict``, the function replaces each ``key`` of ``data`` with ``mapping[key]``;
+       * if ``data`` is a ``dataclass``, the function first converts it to a ``Dict`` via :func:`dataclasses.asdict` and then converts it;
+       * if ``data`` is a ``Sequence``, the function first turns it into a corresponding dict::
+
+             {
+                 "_0": list[0],
+                 "_1": list[1],
+                 ...
+             }
+
+         then converts this ``Dict`` with ``mapping``; keys that do not match any key of ``mapping`` keep the ``'_number'`` form.

-    :param mapping: the dict or function used for the conversion; when mapping is a function, its return value must be a dict.
+    :param mapping: the dict or function used for the conversion; when ``mapping`` is a function, its return value must be of type dict;
     :param data: the object to convert;
-    :return: the converted result;
+    :return: the result of the conversion;
     """
     if mapping is None:
         return data
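
A compact sketch of the substitution rules just listed (illustrative only; the real function also handles dataclasses and checks the return type of callable mappings):

    def substitute_keys(mapping, data):
        if callable(mapping):
            return mapping(data)
        if isinstance(data, dict):
            return {mapping.get(k, k): v for k, v in data.items()}
        if isinstance(data, (list, tuple)):
            # lift the sequence into {"_0": ..., "_1": ...} before renaming
            lifted = {f"_{i}": v for i, v in enumerate(data)}
            return {mapping.get(k, k): v for k, v in lifted.items()}
        raise TypeError(f"Unsupported data type: {type(data)}")

    print(substitute_keys({"pred": "prediction"}, {"pred": 1, "y": 2}))  # {'prediction': 1, 'y': 2}
    print(substitute_keys({"_0": "x"}, [10, 20]))                        # {'x': 10, '_1': 20}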
@@ -320,21 +336,20 @@
     include_none: bool = True,
     **kwargs: Any,
 ) -> Any:
-    """Recursively applies the function `function` to the elements in `data`, but only to elements of type `dtype`.
-
-    this function credit to: https://github.com/PyTorchLightning/pytorch-lightning
-    Args:
-        data: the collection to apply the function to
-        dtype: the given function will be applied to all elements of this dtype
-        function: the function to apply
-        *args: positional arguments (will be forwarded to calls of ``function``)
-        wrong_dtype: the given function won't be applied if this type is specified and the given collections
-            is of the ``wrong_dtype`` even if it is of type ``dtype``
-        include_none: Whether to include an element if the output of ``function`` is ``None``.
-        **kwargs: keyword arguments (will be forwarded to calls of ``function``)
-
-    Returns:
-        The resulting collection
+    """
+    Recursively applies the function ``function`` to the elements in ``data``, but only to elements of type ``dtype``.
+
+    This function is adapted from the implementation of `pytorch-lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_
+
+    :param data: the data collection or data to process;
+    :param dtype: the data type; the function ``function`` is only applied to elements of ``data`` whose type is ``dtype``;
+    :param function: the function used to process the data;
+    :param args: other parameters needed by ``function``;
+    :param wrong_dtype: the data type on which ``function`` must not take effect. Data that is of type ``wrong_dtype``
+        as well as of type ``dtype`` is not processed either;
+    :param include_none: whether to include elements for which the result of ``function`` is ``None``; defaults to ``True``;
+    :param kwargs: other parameters needed by ``function``;
+    :return: the data collection processed by ``function``;
     """
     # Breaking condition
     if isinstance(data, dtype) and (wrong_dtype is None or not isinstance(data, wrong_dtype)):
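
As a usage-level sketch of the recursion documented above (a simplified stand-in without the `wrong_dtype` and `include_none` handling of the real function):

    def apply_to_collection_sketch(data, dtype, function):
        # Rebuild dicts, lists and tuples, applying `function` only to elements of `dtype`.
        if isinstance(data, dtype):
            return function(data)
        if isinstance(data, dict):
            return {k: apply_to_collection_sketch(v, dtype, function) for k, v in data.items()}
        if isinstance(data, (list, tuple)):
            return type(data)(apply_to_collection_sketch(v, dtype, function) for v in data)
        return data

    print(apply_to_collection_sketch({"a": 1, "b": [2, "x"]}, int, lambda v: v * 10))
    # {'a': 10, 'b': [20, 'x']}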
@@ -402,18 +417,20 @@
 @contextmanager
 def nullcontext():
     r"""
-    Implements a dummy context environment that does nothing;
+    Implements a context environment that does nothing.
     """
     yield




 def sub_column(string: str, c: int, c_size: int, title: str) -> str:
     r"""
-    :param string: the string to truncate
-    :param c: the number of columns in the terminal
-    :param c_size: the number of fields of the instance or dataset
-    :param title: the column name
-    :return: the result of truncating an over-long column
+    Truncates the given string so that it can be displayed in the terminal.
+
+    :param string: the string to truncate;
+    :param c: the number of columns in the terminal;
+    :param c_size: the number of ``field`` s of the :class:`~fastNLP.core.Instance` or :class:`fastNLP.core.DataSet`;
+    :param title: the column name;
+    :return: the result of truncating an over-long column;
     """
     avg = max(int(c / c_size / 2), len(title))
     string = str(string)
@@ -442,18 +459,17 @@ def _is_iterable(value):

 def pretty_table_printer(dataset_or_ins) -> PrettyTable:
     r"""
-    :param dataset_or_ins: a DataSet or an Instance to pass in
-
-    .. code-block::
-
-        ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2], field_3=["a", "b", "c"])
+    A function used in ``fastNLP`` to display data::
+
+        >>> ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2], field_3=["a", "b", "c"])
         +-----------+-----------+-----------------+
         |  field_1  |  field_2  |     field_3     |
         +-----------+-----------+-----------------+
         | [1, 1, 1] | [2, 2, 2] | ['a', 'b', 'c'] |
         +-----------+-----------+-----------------+

-    :return: returns, as a pretty table, the data automatically truncated according to the terminal size
+    :param dataset_or_ins: the :class:`~fastNLP.core.DataSet` or :class:`~fastNLP.core.Instance` instance to display;
+    :return: a data table automatically truncated according to the terminal size;
     """
     x = PrettyTable()
     try:
@@ -486,7 +502,7 @@ def pretty_table_printer(dataset_or_ins) -> PrettyTable:


 class Option(dict):
-    r"""a dict can treat keys as attributes"""
+    r"""A dict type that turns keys into attributes."""

     def __getattr__(self, item):
         try:
@@ -516,11 +532,10 @@ _emitted_deprecation_warnings = set()


 def deprecated(help_message: Optional[str] = None):
-    """Decorator to mark a function as deprecated.
+    """
+    A decorator that marks the current functionality as deprecated.

-    Args:
-        help_message (`Optional[str]`): An optional message to guide the user on how to
-            switch to non-deprecated usage of the library.
+    :param help_message: a guidance message telling the user how to switch the code to the usage recommended by the current version;
     """

     def decorator(deprecated_function: Callable):
@@ -549,11 +564,10 @@ def deprecated(help_message: Optional[str] = None):
     return decorator


-def seq_len_to_mask(seq_len, max_len=None):
+def seq_len_to_mask(seq_len, max_len: Optional[int] = None):
     r"""

-    Converts a one-dimensional array representing sequence lengths into a two-dimensional mask, with excluded positions set to 0.
-    Turns a 1-d seq_len into a 2-d mask.
+    Converts a one-dimensional array representing ``sequence length`` s into a two-dimensional ``mask``, with excluded positions set to **0**.

     .. code-block::

@@ -570,10 +584,11 @@ def seq_len_to_mask(seq_len, max_len=None):
         >>> print(mask.size())
         torch.Size([14, 100])

-    :param np.ndarray,torch.LongTensor seq_len: the shape will be (B,)
-    :param int max_len: pad the lengths to this length. By default (None) the longest length in seq_len is used; but in
-        nn.DataParallel scenarios the seq_len of different cards may differ, so a max_len must be passed in so that the
-        mask is padded to that length.
-    :return: np.ndarray or torch.Tensor of shape (B, max_length); the elements are of type bool or torch.uint8
+    :param seq_len: the length sequence, of size ``(B,)``;
+    :param int max_len: pad or truncate the lengths to ``max_len``. By default (``None``) the longest length in
+        ``seq_len`` is used; but in distributed scenarios such as :class:`torch.nn.DataParallel` the ``seq_len`` of
+        different cards may differ, so a ``max_len`` must be passed in so that the ``mask`` is padded or truncated to that length.
+    :return: the ``mask`` of size ``(B, max_len)``; the elements are of type ``bool`` or ``uint8``
     """
     if isinstance(seq_len, np.ndarray):
         assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}."
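
The mask construction described above is a single broadcast comparison. A hedged numpy-only sketch (the real function also accepts torch and paddle tensors):

    import numpy as np

    def seq_len_to_mask_sketch(seq_len: np.ndarray, max_len: int = None) -> np.ndarray:
        # Position j of row i is True exactly when j < seq_len[i].
        max_len = int(max_len or seq_len.max())
        return np.arange(max_len)[None, :] < seq_len[:, None]

    print(seq_len_to_mask_sketch(np.array([1, 3]), max_len=4).astype(int))
    # [[1 0 0 0]
    #  [1 1 1 0]]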


+ 19
- 4
fastNLP/envs/set_backend.py View File

@@ -51,23 +51,33 @@ def _set_backend():
assert _module_available(backend), f"You must have {backend} available to use {backend} backend."
assert 'paddle' not in sys.modules, "You have to use `set_backend()` before `import paddle`."
user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
if 'PADDLE_RANK_IN_NODE' in os.environ and 'FLAGS_selected_gpus' in os.environ:
# In a distributed subprocess, derive the devices this process actually owns from USER_CUDA_VISIBLE_DEVICES
selected_gpus = os.environ['FLAGS_selected_gpus'].split(',')
if user_visible_devices is not None:
# The user launched distributed training through fastNLP
# After set_backend, the user's setting is stored in USER_CUDA_VISIBLE_DEVICES
# We need to map the logical indices back to the device numbers the user actually uses
user_visible_devices = user_visible_devices.split(",")
selected_gpus = [user_visible_devices[int(i)] for i in selected_gpus]
# USER_CUDA_VISIBLE_DEVICES was not found, meaning the user launched directly with launch
elif cuda_visible_devices:
# The user restricted the visible devices, so the indices need converting,
# e.g. CUDA_VISIBLE_DEVICES=0,2,3 --gpus=0,2,3:
# in rank 1, selected_gpus = ['1'], which must be mapped to device 2
os.environ[USER_CUDA_VISIBLE_DEVICES] = cuda_visible_devices
cuda_visible_devices = cuda_visible_devices.split(",")
selected_gpus = [cuda_visible_devices[int(i)] for i in selected_gpus]
else:
# The user did not restrict the visible devices, so assign all devices
os.environ[USER_CUDA_VISIBLE_DEVICES] = ",".join(map(str, list(
range(get_gpu_count())
)))
os.environ['CUDA_VISIBLE_DEVICES'] = ",".join(selected_gpus)
os.environ['FLAGS_selected_gpus'] = ",".join([str(g) for g in range(len(selected_gpus))])
os.environ['FLAGS_selected_accelerators'] = ",".join([str(g) for g in range(len(selected_gpus))])
elif 'CUDA_VISIBLE_DEVICES' in os.environ:
# In the main process, the user set CUDA_VISIBLE_DEVICES
# Hack around the user-set CUDA_VISIBLE_DEVICES
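The index translation above boils down to composing two visibility lists; a standalone sketch of the mapping (the function name is illustrative, not fastNLP API)::

    def remap_selected_gpus(selected_gpus, visible_devices):
        # launch hands out logical indices relative to the visible-device list;
        # map them back to physical device ids
        visible = visible_devices.split(",")
        return [visible[int(i)] for i in selected_gpus]

    # CUDA_VISIBLE_DEVICES=0,2,3 with --gpus=0,2,3: rank 1 sees logical '1' -> physical '2'
    assert remap_selected_gpus(["1"], "0,2,3") == ["2"]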
@@ -91,6 +101,11 @@ def _set_backend():
elif backend == 'torch':
assert _module_available(backend), f"You must have {backend} available to use {backend} backend."


if 'PADDLE_RANK_IN_NODE' in os.environ and 'FLAGS_selected_gpus' in os.environ \
and "USER_CUDA_VISIBLE_DEVICES" not in os.environ:
# When the user did not set a backend but launched multi-GPU training with launch, remind them to set it
raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")



def set_env(global_seed=None):
"""


+ 5
- 3
fastNLP/envs/utils.py View File

@@ -6,6 +6,7 @@ from packaging.version import Version
import subprocess
import pkg_resources


__all__ = []


def _module_available(module_path: str) -> bool:
"""Check if a path is available in your environment.
@@ -48,10 +49,11 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version:
pkg_version = Version(pkg_version.base_version)
return op(pkg_version, Version(version))


def get_gpu_count() -> int:
"""
Get the number of ``gpu`` devices via the command line.

:return: the number of GPUs, or -1 if there is no GPU device
"""
try:
lines = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used', '--format=csv'])
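A sketch of the counting idea: ``nvidia-smi`` prints one CSV header line plus one line per GPU, so the device count falls out of the line count (error handling reduced to the essentials)::

    import subprocess

    def gpu_count_sketch() -> int:
        try:
            out = subprocess.check_output(
                ['nvidia-smi', '--query-gpu=memory.used', '--format=csv'])
            return len(out.decode().strip().split("\n")) - 1  # minus the header line
        except (FileNotFoundError, subprocess.CalledProcessError):
            return -1  # nvidia-smi unavailable / no GPU device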


+ 2
- 2
fastNLP/modules/__init__.py View File

@@ -1,9 +1,9 @@
__all__ = [
# "MixModule",
"torch2paddle",
"paddle2torch",
"torch2jittor",
"jittor2torch",
]


from .mix_modules import torch2paddle, paddle2torch, torch2jittor, jittor2torch

+ 2
- 2
fastNLP/modules/mix_modules/__init__.py View File

@@ -1,10 +1,10 @@
__all__ = [
# "MixModule",
"torch2paddle",
"paddle2torch",
"torch2jittor",
"jittor2torch",
]


# from .mix_module import MixModule
from .utils import *

+ 0
- 310
fastNLP/modules/mix_modules/mix_module.py View File

@@ -1,310 +0,0 @@
import os
import io
import pickle
from typing import Dict
from collections import OrderedDict

import numpy as np

from fastNLP.envs.imports import _NEED_IMPORT_JITTOR, _NEED_IMPORT_PADDLE, _NEED_IMPORT_TORCH
from fastNLP.core.utils.paddle_utils import paddle_to

if _NEED_IMPORT_PADDLE:
import paddle
from paddle.nn import Layer as PaddleLayer

if _NEED_IMPORT_TORCH:
import torch
from torch.nn import Module as TorchModule, Parameter as TorchParameter

if _NEED_IMPORT_JITTOR:
import jittor


__all__ = [
"MixModule",
]

class MixModule:
"""
TODO: support different mixing strategies; add state_dict support; decide how to handle a List of Tensors among the parameters;
consider whether to classify the various sub-models at initialization the way Module does.
A hybrid model that can use the Torch and Paddle frameworks at the same time.
"""
def __init__(self, *args, **kwargs):
pass

def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

def named_parameters(self, prefix='', recurse: bool=True, backend=None):
"""
Return the model's parameter names and parameters.

:param prefix: prefix prepended to parameter names in the output
:param recurse: whether to yield parameters recursively
:param backend: with `backend`=`None`, return the parameters of all models and tensors;
with `backend`=`torch`, return the `torch` parameters;
with `backend`=`paddle`, return the `paddle` parameters.
"""
if backend is None:
generator = self.attributes(TorchModule, TorchParameter, PaddleLayer)
elif backend == "torch":
generator = self.attributes(TorchModule, TorchParameter)
elif backend == "paddle":
generator = self.attributes(PaddleLayer)
else:
raise ValueError("Unknown backend parameter.")

for name, value in generator:
name = prefix + ('.' if prefix else '') + name
if isinstance(value, TorchParameter):
# Not a Module/Layer: yield the name and value directly
yield name, value
elif recurse:
# Recurse into named_parameters
for name_r, value_r in value.named_parameters(name, recurse):
yield name_r, value_r

def parameters(self, recurse: bool = True, backend: str = None):
"""
Return the model's parameters.

:param recurse: whether to yield parameters recursively
:param backend: with `backend`=`None`, return the parameters of all models and tensors;
with `backend`=`torch`, return the `torch` parameters;
with `backend`=`paddle`, return the `paddle` parameters.
"""
for name, value in self.named_parameters(recurse=recurse, backend=backend):
yield value
def forward(self, *args, **kwargs):
raise NotImplementedError

def train_step(self, batch):
raise NotImplementedError

def test_step(self, batch):
raise NotImplementedError

def evaluate_step(self, batch):
raise NotImplementedError

def train(self):
for name, value in self.attributes(TorchModule, PaddleLayer):
value.train()

def eval(self):
for name, value in self.attributes(TorchModule, PaddleLayer):
value.eval()

def to(self, device):
"""
:param device: the device name
"""
# TODO: warn if jittor is involved
if device == "cpu":
paddle_device = device
elif device.startswith("cuda"):
paddle_device = device.replace("cuda", "gpu")
elif device.startswith("gpu"):
paddle_device = device
device = device.replace("gpu", "cuda")
else:
raise ValueError("Device value error")

for name, value in self.attributes(TorchModule):
# torch's to() does not affect the Tensor in place
vars(self)[name] = value.to(device)
for name, value in self.attributes(TorchParameter):
# A Parameter becomes a plain Tensor after to()
vars(self)[name] = TorchParameter(value.to(device), requires_grad=value.requires_grad)

for name, value in self.attributes(PaddleLayer):
vars(self)[name] = value.to(paddle_device)
for name, value in self.attributes(paddle.Tensor):
# paddle's to() does affect the Tensor
vars(self)[name] = paddle_to(value, paddle_device)

return self

def state_dict(self, backend: str = None) -> Dict:
"""
Return the model's state_dict.

.. note:: torch's destination argument will be removed in the future, so no destination argument is provided here

:param backend: with `backend`=`None`, return the state dict of all models and tensors;
with `backend`=`torch`, return the `torch` state dict;
with `backend`=`paddle`, return the `paddle` state dict.
"""
if backend is None:
generator = self.attributes(TorchModule, TorchParameter, PaddleLayer)
elif backend == "torch":
generator = self.attributes(TorchModule, TorchParameter)
elif backend == "paddle":
generator = self.attributes(PaddleLayer)
else:
raise ValueError(f"Unknown backend {backend}.")

destination = OrderedDict()

for name, value in generator:
if value is None:
continue
if isinstance(value, TorchParameter):
destination[name] = value
else:
# state_dict argument names and order differ across frameworks
if isinstance(value, PaddleLayer):
kwargs = {
"structured_name_prefix": name + ".",
}
elif isinstance(value, TorchModule):
kwargs = {
"prefix": name + ".",
}
else:
raise ValueError(f"Unknown item type {type(value)}")
destination.update(value.state_dict(**kwargs))

return destination

def save_state_dict_to_file(self, path: str):
"""
Save the model's state dict to path
"""
# TODO device restrictions
filename = os.path.basename(path)
if filename == "":
raise ValueError("Received empty filename.")
dirname = os.path.dirname(path)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname)
protocol = 4

saved = {}
paddle_dict = self.state_dict(backend="paddle")
torch_dict = self.state_dict(backend="torch")
# Save the paddle part
# Call paddle's own saving helpers
paddle_saved_obj = paddle.framework.io._build_saved_state_dict(paddle_dict)
paddle_saved_obj = paddle.fluid.io._unpack_saved_dict(paddle_saved_obj, protocol)
# Store the returned dict
saved["paddle"] = paddle_saved_obj

# Save the torch part
buffer = io.BytesIO()
torch.save(torch_dict, buffer)
saved["torch"] = buffer.getvalue()

# Write everything to disk
with open(path, "wb") as f:
pickle.dump(saved, f, protocol)

def load_state_dict_from_file(self, path: str):
"""
Load a saved state dict from `path`
"""
state_dict = {}
with open(path, "rb") as f:
loaded = pickle.load(f)
# Load the paddle data
paddle_loaded_obj = loaded["paddle"]
paddle_load_result = paddle.fluid.io._pack_loaded_dict(paddle_loaded_obj)
if "StructuredToParameterName@@" in paddle_load_result:
for key in paddle_load_result["StructuredToParameterName@@"]:
if isinstance(paddle_load_result[key], np.ndarray):
paddle_load_result[key] = paddle.to_tensor(paddle_load_result[key])
state_dict.update(paddle_load_result)
# Load the torch data
torch_loaded_obj = loaded["torch"]
torch_bytes = io.BytesIO(torch_loaded_obj)
torch_load_result = torch.load(torch_bytes)
state_dict.update(torch_load_result)

self.load_state_dict(state_dict)

def load_state_dict(self, state_dict):
"""
Load parameters from a state dict
"""
missing_keys = []
unexpected_keys = []
error_msgs = []
new_state = {}

local_state = self.state_dict()

# Group the dict entries by prefix
for key, value in state_dict.items():
splited = key.split(".", 1)
if len(splited) == 1:
# No prefix; in practice only torch.nn.Parameter reaches this case
new_state[key] = value
else:
prefix, name = splited
if prefix not in new_state:
new_state[prefix] = {}
new_state[prefix][name] = value

for key, param in self.attributes(TorchModule, TorchParameter, PaddleLayer):
if key in new_state:
# Found the corresponding value in the provided dict
input_param = new_state[key]
if not isinstance(input_param, dict):
# and it is not a dict, i.e. the no-prefix case above
# Assign it following torch.nn.Module._load_from_state_dict
if not torch.overrides.is_tensor_like(input_param):
error_msgs.append('While copying the parameter named "{}", '
'expected torch.Tensor or Tensor-like object from checkpoint but '
'received {}'
.format(key, type(input_param)))
continue

# This is used to avoid copying uninitialized parameters into
# non-lazy modules, since they don't have the hook to do the checks
# in such case, it will error when accessing the .shape attribute.
is_param_lazy = torch.nn.parameter.is_lazy(param)
# Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+
if not is_param_lazy and len(param.shape) == 0 and len(input_param.shape) == 1:
input_param = input_param[0]

if not is_param_lazy and input_param.shape != param.shape:
# local shape should match the one in checkpoint
error_msgs.append('size mismatch for {}: copying a param with shape {} from checkpoint, '
'the shape in current model is {}.'
.format(key, input_param.shape, param.shape))
continue
try:
with torch.no_grad():
param.copy_(input_param)
except Exception as ex:
error_msgs.append('While copying the parameter named "{}", '
'whose dimensions in the model are {} and '
'whose dimensions in the checkpoint are {}, '
'an exception occurred : {}.'
.format(key, param.size(), input_param.size(), ex.args))
else:
# Otherwise the key belongs to a submodule
if isinstance(param, TorchModule):
# torch module
# paddle provides no strict-like argument, so we do not enforce strictness for torch either
param.load_state_dict(input_param, strict=False)
elif isinstance(param, PaddleLayer):
# paddle module
param.load_dict(input_param)
else:
missing_keys.append(key)

if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
self.__class__.__name__, "\n\t".join(error_msgs)))

def attributes(self, *types):
"""
Find members of the given types
"""
for name, value in vars(self).items():
if isinstance(value, types):
yield name, value

+ 0
- 233
fastNLP/modules/mix_modules/utils.py View File

@@ -1,233 +0,0 @@
import warnings
import os
from typing import Any, Optional, Union

import numpy as np

from fastNLP.core.utils.utils import apply_to_collection
from fastNLP.core.utils.paddle_utils import paddle_to
from fastNLP.envs.imports import _NEED_IMPORT_JITTOR, _NEED_IMPORT_TORCH, _NEED_IMPORT_PADDLE

if _NEED_IMPORT_PADDLE:
import paddle

if _NEED_IMPORT_JITTOR:
import jittor

if _NEED_IMPORT_TORCH:
import torch

__all__ = [
"paddle2torch",
"torch2paddle",
"jittor2torch",
"torch2jittor",
]

def _paddle2torch(paddle_tensor: 'paddle.Tensor', target_device: Optional[Union[str, int]] = None, no_gradient: bool = None) -> 'torch.Tensor':
"""
Convert a paddle tensor to a torch tensor while keeping the gradient for backpropagation
:param paddle_tensor: the paddle tensor to convert
:param target_device: device to move the converted tensor to; with `None`, same as the input tensor.
:param no_gradient: whether to drop the original tensor's gradient. With `None`, the new tensor matches the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the converted torch tensor
"""
no_gradient = paddle_tensor.stop_gradient if no_gradient is None else no_gradient
paddle_numpy = paddle_tensor.numpy()
if not np.issubdtype(paddle_numpy.dtype, np.inexact):
no_gradient = True

if target_device is None:
if paddle_tensor.place.is_gpu_place():
# paddlepaddle has two kinds of Place, each with its own way of getting the device id
if hasattr(paddle_tensor.place, "gpu_device_id"):
# paddle.fluid.core_avx.Place
# tensors created in a GPU environment get a place of this type
target_device = f"cuda:{paddle_tensor.place.gpu_device_id()}"
else:
# paddle.CUDAPlace
target_device = f"cuda:{paddle_tensor.place.get_device_id()}"
else:
# TODO: may need to support xpu and other devices
target_device = "cpu"

if not no_gradient:
# Keep the gradient and keep backpropagation working
# torch.tensor preserves the numpy array's dtype
torch_tensor = torch.tensor(paddle_numpy, requires_grad=True, device=target_device)
hook = torch_tensor.register_hook(
lambda grad: paddle.autograd.backward(paddle_tensor, paddle.to_tensor(grad.cpu().numpy()))
)
else:
# Do not keep the gradient
torch_tensor = torch.tensor(paddle_numpy, requires_grad=False, device=target_device)

return torch_tensor


def _torch2paddle(torch_tensor: 'torch.Tensor', target_device: str = None, no_gradient: bool = None) -> 'paddle.Tensor':
"""
Convert a torch tensor to a paddle tensor while keeping the gradient for backpropagation.
:param torch_tensor: the torch tensor to convert
:param target_device: device to move the converted tensor to; with `None`, same as the input tensor.
:param no_gradient: whether to drop the original tensor's gradient. With `None`, the new tensor matches the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the converted paddle tensor
"""
no_gradient = not torch_tensor.requires_grad if no_gradient is None else no_gradient
if target_device is None:
if torch_tensor.is_cuda:
target_device = f"gpu:{torch_tensor.device.index}"
else:
target_device = "cpu"

if not no_gradient:
# Keep the gradient and keep backpropagation working
# paddle's stop_gradient behaves opposite to torch's requires_grad
paddle_tensor = paddle.to_tensor(torch_tensor.detach().numpy(), stop_gradient=False)
hook = paddle_tensor.register_hook(
lambda grad: torch.autograd.backward(torch_tensor, torch.tensor(grad.numpy()))
)
else:
paddle_tensor = paddle.to_tensor(torch_tensor.detach().numpy(), stop_gradient=True)

paddle_tensor = paddle_to(paddle_tensor, target_device)

return paddle_tensor


def _jittor2torch(jittor_var: 'jittor.Var', target_device: Optional[Union[str, int]] = None, no_gradient: bool = None) -> 'torch.Tensor':
"""
Convert a jittor Var to a torch tensor while keeping the gradient for backpropagation
:param jittor_var: the jittor variable to convert
:param target_device: device to move the converted tensor to; with `None`, decided by jittor.flags.use_cuda.
:param no_gradient: whether to drop the original tensor's gradient. With `None`, the new tensor matches the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the converted torch tensor
"""
# TODO: warning: gradients cannot be kept
# jittor grads can be propagated through callbacks
# if outputs has a _grad key, differentiation is possible
no_gradient = not jittor_var.requires_grad if no_gradient is None else no_gradient
if no_gradient == False:
warnings.warn("The result tensor will not keep gradients due to differences between jittor and pytorch.")
jittor_numpy = jittor_var.numpy()
if not np.issubdtype(jittor_numpy.dtype, np.inexact):
no_gradient = True

if target_device is None:
# jittor allocates devices automatically
# decide based on use_cuda
if jittor.flags.use_cuda:
target_device = "cuda:0"
else:
target_device = "cpu"

torch_tensor = torch.tensor(jittor_numpy, requires_grad=not no_gradient, device=target_device)

return torch_tensor


def _torch2jittor(torch_tensor: 'torch.Tensor', no_gradient: bool = None) -> 'jittor.Var':
"""
Convert a torch tensor to a jittor Var while keeping the gradient for backpropagation
:param torch_tensor: the torch tensor to convert
:param no_gradient: whether to drop the original tensor's gradient. With `None`, the new tensor matches the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the converted jittor variable
"""
no_gradient = not torch_tensor.requires_grad if no_gradient is None else no_gradient

if not no_gradient:
# Keep the gradient and keep backpropagation working
jittor_var = jittor.Var(torch_tensor.detach().numpy())
jittor_var.requires_grad = True
hook = jittor_var.register_hook(
lambda grad: torch.autograd.backward(torch_tensor, torch.tensor(grad.numpy()))
)
else:
jittor_var = jittor.Var(torch_tensor.detach().numpy())
jittor_var.requires_grad = False

return jittor_var


def torch2paddle(torch_in: Any, target_device: str = None, no_gradient: bool = None) -> Any:
"""
Recursively convert all torch tensors contained in the input to paddle tensors

:param torch_in: the input containing torch.Tensor values to convert
:param target_device: device to move the converted tensors to;
with `None`, same as the input tensors,
:param no_gradient: whether to drop the original tensors' gradients. With `None`, the new tensors match the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the input with every torch.Tensor converted to a paddle.Tensor
"""

return apply_to_collection(
torch_in,
dtype=torch.Tensor,
function=_torch2paddle,
target_device=target_device,
no_gradient=no_gradient,
)


def paddle2torch(paddle_in: Any, target_device: str = None, no_gradient: bool = None) -> Any:
"""
Recursively convert all paddle tensors contained in the input to torch tensors

:param paddle_in: the input containing paddle.Tensor values to convert
:param target_device: device to move the converted tensors to;
with `None`, same as the input tensors,
:param no_gradient: whether to drop the original tensors' gradients. With `None`, the new tensors match the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the input with every paddle.Tensor converted to a torch.Tensor
"""

return apply_to_collection(
paddle_in,
dtype=paddle.Tensor,
function=_paddle2torch,
target_device=target_device,
no_gradient=no_gradient,
)


def jittor2torch(jittor_in: Any, target_device: str = None, no_gradient: bool = None) -> Any:
"""
Recursively convert all jittor variables contained in the input to torch tensors

:param jittor_in: the jittor input to convert
:param target_device: device to move the converted tensors to; with `None`, defaults to cuda:0.
:param no_gradient: whether to drop the original tensors' gradients. With `None`, the new tensors match the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the converted torch tensors
"""

return apply_to_collection(
jittor_in,
dtype=jittor.Var,
function=_jittor2torch,
target_device=target_device,
no_gradient=no_gradient,
)


def torch2jittor(torch_in: Any, no_gradient: bool = None) -> Any:
"""
Recursively convert all torch tensors contained in the input to jittor variables

:param torch_in: the torch input to convert
:param no_gradient: whether to drop the original tensors' gradients. With `None`, the new tensors match the input;
with `True`, no gradients are kept; with `False`, all gradients are kept.
:return: the converted jittor variables
"""
return apply_to_collection(
torch_in,
dtype=torch.Tensor,
function=_torch2jittor,
no_gradient=no_gradient,
)

+ 73
- 0
tests/core/callbacks/test_checkpoint_callback_torch.py View File

@@ -14,6 +14,7 @@ from tests.helpers.utils import magic_argv_env_context
from fastNLP.envs.distributed import rank_zero_rm
from tests.helpers.models.torch_model import TorchNormalModel_Classification_1
from tests.helpers.datasets.torch_data import TorchArgMaxDataset
from tests.helpers.utils import Capturing
from torchmetrics import Accuracy
from fastNLP.core.log import logger


@@ -428,6 +429,78 @@ def test_trainer_checkpoint_callback_1(
dist.destroy_process_group()




@pytest.mark.torch
def test_load_state(model_and_optimizers):
try:
path = Path.cwd().joinpath(f"test_model_checkpoint")
path.mkdir(exist_ok=True, parents=True)
from fastNLP import Event, Callback
@Trainer.on(Event.on_before_backward(every=3), marker='all')
def print_outputs(*args):
print("????")

class StateCallback(Callback):
def __init__(self, name):
self.name = name

def on_save_checkpoint(self, trainer):
return {'name': self.name}

def on_load_checkpoint(self, trainer, states):
self.name = states['name']

def on_train_end(self, trainer):
print(self.name)

callbacks = [StateCallback('old_callback1'), StateCallback('old_callback2'),
CheckpointCallback(folder=path, every_n_epochs=1, save_object='trainer')]

trainer = Trainer(
model=model_and_optimizers.model,
driver='torch',
device='cpu',
optimizers=model_and_optimizers.optimizers,
train_dataloader=model_and_optimizers.train_dataloader,
evaluate_dataloaders=model_and_optimizers.evaluate_dataloaders,
input_mapping=model_and_optimizers.input_mapping,
output_mapping=model_and_optimizers.output_mapping,
metrics=model_and_optimizers.metrics,
n_epochs=3,
callbacks=callbacks,
output_from_new_proc="all"
)
trainer.run(num_eval_sanity_batch=0, num_train_batch_per_epoch=2)

all_saved_model_paths = {w.name: w for w in path.joinpath(os.environ[FASTNLP_LAUNCH_TIME]).iterdir()}
epoch_2_path = all_saved_model_paths['trainer-epoch_2']

callbacks = [StateCallback('new_callback1'), StateCallback('new_callback2')]
trainer = Trainer(
model=model_and_optimizers.model,
driver='torch',
device='cpu',
optimizers=model_and_optimizers.optimizers,
train_dataloader=model_and_optimizers.train_dataloader,
evaluate_dataloaders=model_and_optimizers.evaluate_dataloaders,
input_mapping=model_and_optimizers.input_mapping,
output_mapping=model_and_optimizers.output_mapping,
metrics=model_and_optimizers.metrics,
n_epochs=3,
callbacks=callbacks,
output_from_new_proc="all"
)
trainer.load(folder=epoch_2_path)
with Capturing() as output:
trainer.run(num_eval_sanity_batch=0, num_train_batch_per_epoch=2)

assert 'old_callback1' in output[0]
assert 'new_callback2' in output[0]
assert output[0].count('???') == 1

finally:
rank_zero_rm(path)
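``Capturing`` is imported from ``tests.helpers.utils``; a minimal stdout-capturing context manager with the list-like interface the assertions rely on might look like this (a sketch, not the actual helper)::

    import io
    import sys

    class CapturingSketch(list):
        # usage: with CapturingSketch() as output: ...
        def __enter__(self):
            self._stdout = sys.stdout
            sys.stdout = self._buffer = io.StringIO()
            return self

        def __exit__(self, *exc):
            self.append(self._buffer.getvalue())  # everything printed, as one string
            sys.stdout = self._stdout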


@pytest.mark.torch
# Test saving and loading huggingface transformers models by writing custom model_save_fn and model_load_fn;
@pytest.mark.parametrize("driver,device", [("torch_ddp", [6, 7]), ("torch", 7)])  # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)


+ 2
- 2
tests/core/collators/test_collator.py View File

@@ -334,9 +334,9 @@ def test_torch_dl():
dl = TorchDataLoader(ds, batch_size=2)
batch = next(iter(dl))
assert 'x' in batch and 'y' in batch and 'z' in batch and 'i' in batch and 'j' in batch
assert isinstance(batch['z'], torch.FloatTensor)
assert isinstance(batch['j'], list)
assert isinstance(batch['i']['j'], torch.LongTensor)


dl.set_ignore('x')
batch = next(iter(dl))


+ 13
- 6
tests/core/controllers/_test_trainer_fleet.py View File

@@ -1,7 +1,15 @@
""" """
这个文件测试用户以python -m paddle.distributed.launch 启动的情况
看看有没有用pytest执行的机会
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py
这个文件测试多卡情况下使用 paddle 的情况::

>>> # 测试用 python -m paddle.distributed.launch 启动
>>> FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py
>>> # 测试在限制 GPU 的情况下用 python -m paddle.distributed.launch 启动
>>> CUDA_VISIBLE_DEVICES=0,2,3 FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py
>>> # 测试直接使用多卡
>>> FASTNLP_BACKEND=paddle python _test_trainer_fleet.py
>>> # 测试在限制 GPU 的情况下直接使用多卡
>>> CUDA_VISIBLE_DEVICES=3,4,5,6 FASTNLP_BACKEND=paddle python _test_trainer_fleet.py

""" """
import os
import sys
@@ -71,14 +79,13 @@ def test_trainer_fleet(


n_epochs=n_epochs,
callbacks=callbacks,
# output_from_new_proc="logs",
)
trainer.run()


if __name__ == "__main__":
driver = "paddle"
device = [0,1,3]
# device = 2
callbacks = [
# RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True),


+ 9
- 6
tests/core/controllers/_test_trainer_fleet_outside.py View File

@@ -1,7 +1,11 @@
""" """
这个文件测试用户以python -m paddle.distributed.launch 启动的情况
并且自己初始化了 fleet
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py
这个文件测试用户自己初始化分布式环境后使用 paddle 的情况:

>>> # 测试用 python -m paddle.distributed.launch 启动
>>> FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py
>>> # 测试在限制 GPU 的情况下用 python -m paddle.distributed.launch 启动
>>> CUDA_VISIBLE_DEVICES=0,2,3 FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py

""" """
import os
import sys
@@ -77,14 +81,13 @@ def test_trainer_fleet(


n_epochs=n_epochs,
callbacks=callbacks,
data_device=f"gpu:{os.environ['CUDA_VISIBLE_DEVICES']}"
# output_from_new_proc="logs",
)
trainer.run()


if __name__ == "__main__":
driver = "paddle"
device = [0,1,3]
callbacks = [
# RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True),
RichCallback(5),


+ 237
- 0
tests/core/controllers/_test_trainer_jittor.py View File

@@ -0,0 +1,237 @@
import os
import sys
import time
# os.environ["cuda_archs"] = "61"
# os.environ["FAS"]
os.environ["log_silent"] = "1"
sys.path.append("../../../")

from datasets import load_dataset
from datasets import DatasetDict
import jittor as jt
from jittor import nn, Module
from jittor.dataset import Dataset
jt.flags.use_cuda = True

from fastNLP.core.controllers.trainer import Trainer
from fastNLP.core.metrics.accuracy import Accuracy
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.callbacks.progress_callback import RichCallback
from fastNLP.core.callbacks.callback import Callback
from fastNLP.core.dataloaders.jittor_dataloader.fdl import JittorDataLoader

class TextClassificationDataset(Dataset):
def __init__(self, dataset):
super(TextClassificationDataset, self).__init__()
self.dataset = dataset
self.set_attrs(total_len=len(dataset))

def __getitem__(self, idx):
return {"x": self.dataset["input_ids"][idx], "y": self.dataset["label"][idx]}


class LSTM(Module):
def __init__(self, num_of_words, hidden_size, features):

self.embedding = nn.Embedding(num_of_words, features)
self.lstm = nn.LSTM(features, hidden_size, batch_first=True)
self.layer = nn.Linear(hidden_size, 2)
self.softmax = nn.Softmax(dim=1)
self.loss_fn = nn.CrossEntropyLoss()
self.hidden_size = hidden_size
self.features = features

def init_hidden(self, x):
# batch_first
batch_size = x.shape[0]
h0 = jt.randn(1, batch_size, self.hidden_size)
c0 = jt.randn(1, batch_size, self.hidden_size)

return h0, c0

def execute(self, input_ids):

output = self.embedding(input_ids)
# TODO strip the padding
output, (h, c) = self.lstm(output, self.init_hidden(output))
# len, batch, hidden_size
output = self.layer(output[-1])

return output

def train_step(self, x, y):
x = self(x)
outputs = self.loss_fn(x, y)
return {"loss": outputs}

def evaluate_step(self, x, y):
x = self(x)
return {"pred": x, "target": y.reshape((-1,))}


class PrintWhileTrainingCallBack(Callback):
"""
A Callback that prints the loss during training
"""

def __init__(self, print_every_epoch, print_every_batch):
self.print_every_epoch = print_every_epoch
self.print_every_batch = print_every_batch

self.loss = 0
self.start = 0
self.epoch_start = 0

def on_train_begin(self, trainer):
"""
Print information before training starts
"""
print("Start training. Total {} epochs and {} batches in each epoch.".format(
trainer.n_epochs, trainer.num_batches_per_epoch
))
self.start = time.time()

def on_before_backward(self, trainer, outputs):
"""
Accumulate the loss before each backward pass so the average can be computed
"""
loss = trainer.extract_loss_from_outputs(outputs)
loss = trainer.driver.tensor_to_numeric(loss)
self.loss += loss

def on_train_epoch_begin(self, trainer):
self.epoch_start = time.time()

def on_train_epoch_end(self, trainer):
"""
Every print_every_epoch epochs, and on the last epoch, print the epoch's average loss and elapsed time
"""
if trainer.cur_epoch_idx % self.print_every_epoch == 0 \
or trainer.cur_epoch_idx == trainer.n_epochs:
print("Epoch: {} Loss: {} Current epoch training time: {}s".format(
trainer.cur_epoch_idx, self.loss / trainer.num_batches_per_epoch, time.time() - self.epoch_start
))
# Reset the loss
self.loss = 0

def on_train_batch_end(self, trainer):
"""
Every print_every_batch batches, and on the last batch, print the epoch's running average loss so far
"""
if trainer.batch_idx_in_epoch % self.print_every_batch == 0 \
or trainer.batch_idx_in_epoch == trainer.num_batches_per_epoch:
print("\tBatch: {} Loss: {}".format(
trainer.batch_idx_in_epoch, self.loss / trainer.batch_idx_in_epoch
))

def on_train_end(self, trainer):
print("Total training time: {}s".format(time.time() - self.start))

def process_data(ds: DatasetDict, vocabulary: Vocabulary, max_len=256) -> DatasetDict:
# Tokenize
ds = ds.map(lambda x: {"input_ids": text_to_id(vocabulary, x["text"], max_len)})
ds.set_format(type="numpy", columns=ds.column_names)
return ds

def set_vocabulary(vocab, dataset):

for data in dataset:
vocab.update(data["text"].split())
return vocab

def text_to_id(vocab, text: str, max_len):
text = text.split()
# to index
ids = [vocab.to_index(word) for word in text]
# padding
ids += [vocab.padding_idx] * (max_len - len(text))
return ids[:max_len]

def get_dataset(name, max_len, train_format="", test_format=""):

# datasets
train_dataset = load_dataset(name, split="train" + train_format).shuffle(seed=123)
test_dataset = load_dataset(name, split="test" + test_format).shuffle(seed=321)
split = train_dataset.train_test_split(test_size=0.2, seed=123)
train_dataset = split["train"]
val_dataset = split["test"]

vocab = Vocabulary()
vocab = set_vocabulary(vocab, train_dataset)
vocab = set_vocabulary(vocab, val_dataset)

train_dataset = process_data(train_dataset, vocab, max_len)
val_dataset = process_data(val_dataset, vocab, max_len)
test_dataset = process_data(test_dataset, vocab, max_len)

return TextClassificationDataset(train_dataset), TextClassificationDataset(val_dataset), \
TextClassificationDataset(test_dataset), vocab

if __name__ == "__main__":

# Training hyperparameters
max_len = 20
epochs = 40
lr = 1
batch_size = 64

features = 100
hidden_size = 128

# Load the datasets
# imdb.py SetFit/sst2
train_data, val_data, test_data, vocab = get_dataset("SetFit/sst2", max_len, "", "")
# Build the dataloaders
train_dataloader = JittorDataLoader(
dataset=train_data,
batch_size=batch_size,
shuffle=True,
num_workers=4,
)
val_dataloader = JittorDataLoader(
dataset=val_data,
batch_size=batch_size,
shuffle=True,
num_workers=4,
)
test_dataloader = JittorDataLoader(
dataset=test_data,
batch_size=1,
shuffle=False,
)

# Initialize the model
model = LSTM(len(vocab), hidden_size, features)

# Optimizer
# a list of several optimizers also works
optimizer = nn.SGD(model.parameters(), lr)

# Metrics
metrics = {"acc": Accuracy()}

# callbacks
callbacks = [
PrintWhileTrainingCallBack(print_every_epoch=1, print_every_batch=10),
# RichCallback(),  # print_every defaults to 1, i.e. the progress bar updates every batch
]

trainer = Trainer(
model=model,
driver="jittor",
device=[0,1,2,3,4],
optimizers=optimizer,
train_dataloader=train_dataloader,
validate_dataloaders=val_dataloader,
validate_every=-1,
input_mapping=None,
output_mapping=None,
metrics=metrics,
n_epochs=epochs,
callbacks=callbacks,
# progress_bar="raw"
)
trainer.run()

+ 110
- 0
tests/core/controllers/imdb.py View File

@@ -0,0 +1,110 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""IMDB movie reviews dataset."""

import datasets
from datasets.tasks import TextClassification


_DESCRIPTION = """\
Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially \
more data than previous benchmark datasets. We provide a set of 25,000 highly \
polar movie reviews for training, and 25,000 for testing. There is additional \
unlabeled data for use as well.\
"""

_CITATION = """\
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
title = {Learning Word Vectors for Sentiment Analysis},
booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
month = {June},
year = {2011},
address = {Portland, Oregon, USA},
publisher = {Association for Computational Linguistics},
pages = {142--150},
url = {http://www.aclweb.org/anthology/P11-1015}
}
"""

_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"


class IMDBReviewsConfig(datasets.BuilderConfig):
"""BuilderConfig for IMDBReviews."""

def __init__(self, **kwargs):
"""BuilderConfig for IMDBReviews.
Args:
**kwargs: keyword arguments forwarded to super.
"""
super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)


class Imdb(datasets.GeneratorBasedBuilder):
"""IMDB movie reviews dataset."""

BUILDER_CONFIGS = [
IMDBReviewsConfig(
name="plain_text",
description="Plain text",
)
]

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
),
supervised_keys=None,
homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
citation=_CITATION,
task_templates=[TextClassification(text_column="text", label_column="label")],
)

def _split_generators(self, dl_manager):
archive = dl_manager.download(_DOWNLOAD_URL)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
),
datasets.SplitGenerator(
name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
),
datasets.SplitGenerator(
name=datasets.Split("unsupervised"),
gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
),
]

def _generate_examples(self, files, split, labeled=True):
"""Generate aclImdb examples."""
# For labeled examples, extract the label from the path.
if labeled:
label_mapping = {"pos": 1, "neg": 0}
for path, f in files:
if path.startswith(f"aclImdb/{split}"):
label = label_mapping.get(path.split("/")[2])
if label is not None:
yield path, {"text": f.read().decode("utf-8"), "label": label}
else:
for path, f in files:
if path.startswith(f"aclImdb/{split}"):
if path.split("/")[2] == "unsup":
yield path, {"text": f.read().decode("utf-8"), "label": -1}

+ 4
- 0
tests/core/controllers/test_trainer_jittor.py View File

@@ -11,6 +11,9 @@ if _NEED_IMPORT_JITTOR:
import jittor as jt
from jittor import nn, Module
from jittor.dataset import Dataset
else:
from fastNLP.core.utils.dummy_class import DummyClass as Module
from fastNLP.core.utils.dummy_class import DummyClass as Dataset




class JittorNormalModel_Classification(Module):
@@ -68,6 +71,7 @@ class TrainJittorConfig:


@pytest.mark.parametrize("driver,device", [("jittor", None)]) @pytest.mark.parametrize("driver,device", [("jittor", None)])
@pytest.mark.parametrize("callbacks", [[RichCallback(100)]]) @pytest.mark.parametrize("callbacks", [[RichCallback(100)]])
@pytest.mark.jittor
def test_trainer_jittor(
driver,
device,


+ 5
- 0
tests/core/controllers/test_trainer_paddle.py View File

@@ -1,3 +1,5 @@
import os
from typing import List
import pytest
from dataclasses import dataclass


@@ -5,6 +7,7 @@ from fastNLP.core.controllers.trainer import Trainer
from fastNLP.core.metrics.accuracy import Accuracy
from fastNLP.core.callbacks.progress_callback import RichCallback
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES


if _NEED_IMPORT_PADDLE:
from paddle.optimizer import Adam
@@ -34,6 +37,8 @@ def test_trainer_paddle(
callbacks,
n_epochs=2,
):
if isinstance(device, List) and USER_CUDA_VISIBLE_DEVICES not in os.environ:
pytest.skip("Skip test fleet if FASTNLP_BACKEND is not set to paddle.")
model = PaddleNormalModel_Classification_1(
num_labels=TrainPaddleConfig.num_labels,
feature_dimension=TrainPaddleConfig.feature_dimension


+ 0
- 0
tests/core/drivers/torch_paddle_driver/__init__.py View File


+ 0
- 122
tests/core/drivers/torch_paddle_driver/_test_torch_paddle_driver.py View File

@@ -1,122 +0,0 @@
import pytest

from fastNLP.modules.mix_modules.mix_module import MixModule
from fastNLP.core.drivers.torch_paddle_driver.torch_paddle_driver import TorchPaddleDriver
from fastNLP.modules.mix_modules.utils import paddle2torch, torch2paddle

import torch
import paddle
from paddle.io import Dataset, DataLoader
import numpy as np

############################################################################
#
# Test performance on the MNIST dataset
#
############################################################################

class MNISTDataset(Dataset):
def __init__(self, dataset):

self.dataset = [
(
np.array(img).astype('float32').reshape(-1),
label
) for img, label in dataset
]

def __getitem__(self, idx):
return self.dataset[idx]

def __len__(self):
return len(self.dataset)

class MixMNISTModel(MixModule):
def __init__(self):
super(MixMNISTModel, self).__init__()

self.fc1 = paddle.nn.Linear(784, 64)
self.fc2 = paddle.nn.Linear(64, 32)
self.fc3 = torch.nn.Linear(32, 10)
self.fc4 = torch.nn.Linear(10, 10)

def forward(self, x):

paddle_out = self.fc1(x)
paddle_out = self.fc2(paddle_out)
torch_in = paddle2torch(paddle_out)
torch_out = self.fc3(torch_in)
torch_out = self.fc4(torch_out)

return torch_out

def train_step(self, x):
return self.forward(x)

def test_step(self, x):
return self.forward(x)

@pytest.mark.torchpaddle
class TestMNIST:

@classmethod
def setup_class(self):

self.train_dataset = paddle.vision.datasets.MNIST(mode='train')
self.test_dataset = paddle.vision.datasets.MNIST(mode='test')
self.train_dataset = MNISTDataset(self.train_dataset)

self.lr = 0.0003
self.epochs = 20

self.dataloader = DataLoader(self.train_dataset, batch_size=100, shuffle=True)

def setup_method(self):
model = MixMNISTModel()
self.torch_loss_func = torch.nn.CrossEntropyLoss()

torch_opt = torch.optim.Adam(model.parameters(backend="torch"), self.lr)
paddle_opt = paddle.optimizer.Adam(parameters=model.parameters(backend="paddle"), learning_rate=self.lr)

self.driver = TorchPaddleDriver(model=model, device="cuda:0")
self.driver.set_optimizers([torch_opt, paddle_opt])

def test_case1(self):

epochs = 20

self.driver.setup()
self.driver.zero_grad()
# Start training
current_epoch_idx = 0
while current_epoch_idx < epochs:
epoch_loss, batch = 0, 0
self.driver.set_model_mode("train")
self.driver.set_sampler_epoch(self.dataloader, current_epoch_idx)
for batch, (img, label) in enumerate(self.dataloader):
img = paddle.to_tensor(img).cuda()
torch_out = self.driver.train_step(img)
label = torch.from_numpy(label.numpy()).reshape(-1)
loss = self.torch_loss_func(torch_out.cpu(), label)
epoch_loss += loss.item()

self.driver.backward(loss)
self.driver.step()
self.driver.zero_grad()

current_epoch_idx += 1

# Start testing
correct = 0
for img, label in self.test_dataset:

img = paddle.to_tensor(np.array(img).astype('float32').reshape(1, -1))
torch_out = self.driver.test_step(img)
res = torch_out.softmax(-1).argmax().item()
label = label.item()
if res == label:
correct += 1

acc = correct / len(self.test_dataset)
assert acc > 0.85

+ 0
- 0
tests/core/drivers/torch_paddle_driver/_test_utils.py View File


+ 0
- 204
tests/core/utils/_test_torch_paddle_utils.py View File

@@ -1,204 +0,0 @@
import paddle
import pytest
import torch

from fastNLP.core.utils.torch_paddle_utils import torch_paddle_move_data_to_device

############################################################################
#
# Test moving all torch and paddle tensors in the arguments to the specified device
#
############################################################################

@pytest.mark.torchpaddle
class TestTorchPaddleMoveDataToDevice:

def check_gpu(self, tensor, idx):
"""
Helper that checks whether a tensor sits on the specified GPU
"""

if isinstance(tensor, paddle.Tensor):
assert tensor.place.is_gpu_place()
assert tensor.place.gpu_device_id() == idx
elif isinstance(tensor, torch.Tensor):
assert tensor.is_cuda
assert tensor.device.index == idx

def check_cpu(self, tensor):
if isinstance(tensor, paddle.Tensor):
assert tensor.place.is_cpu_place()
elif isinstance(tensor, torch.Tensor):
assert not tensor.is_cuda

def test_tensor_transfer(self):
"""
Test moving a single tensor
"""

paddle_tensor = paddle.rand((3, 4, 5)).cpu()
res = torch_paddle_move_data_to_device(paddle_tensor, device=None, data_device=None)
self.check_cpu(res)

res = torch_paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device=None)
self.check_gpu(res, 0)

res = torch_paddle_move_data_to_device(paddle_tensor, device="gpu:1", data_device=None)
self.check_gpu(res, 1)

res = torch_paddle_move_data_to_device(paddle_tensor, device="cuda:0", data_device="cpu")
self.check_gpu(res, 0)

res = torch_paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:0")
self.check_gpu(res, 0)

res = torch_paddle_move_data_to_device(paddle_tensor, device=None, data_device="cuda:1")
self.check_gpu(res, 1)

torch_tensor = torch.rand(3, 4, 5)
res = torch_paddle_move_data_to_device(torch_tensor, device=None, data_device=None)
self.check_cpu(res)

res = torch_paddle_move_data_to_device(torch_tensor, device="gpu:0", data_device=None)
self.check_gpu(res, 0)

res = torch_paddle_move_data_to_device(torch_tensor, device="gpu:1", data_device=None)
self.check_gpu(res, 1)

res = torch_paddle_move_data_to_device(torch_tensor, device="gpu:0", data_device="cpu")
self.check_gpu(res, 0)

res = torch_paddle_move_data_to_device(torch_tensor, device=None, data_device="gpu:0")
self.check_gpu(res, 0)

res = torch_paddle_move_data_to_device(torch_tensor, device=None, data_device="gpu:1")
self.check_gpu(res, 1)

def test_list_transfer(self):
"""
Test moving a list of tensors
"""

paddle_list = [paddle.rand((6, 4, 2)) for i in range(5)] + [torch.rand((6, 4, 2)) for i in range(5)]
res = torch_paddle_move_data_to_device(paddle_list, device=None, data_device="gpu:1")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

res = torch_paddle_move_data_to_device(paddle_list, device="cpu", data_device="gpu:1")
assert isinstance(res, list)
for r in res:
self.check_cpu(r)

res = torch_paddle_move_data_to_device(paddle_list, device="gpu:0", data_device=None)
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 0)

res = torch_paddle_move_data_to_device(paddle_list, device="gpu:1", data_device="cpu")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

def test_tensor_tuple_transfer(self):
"""
Test moving a tuple of tensors
"""

paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)] + [torch.rand((6, 4, 2)) for i in range(5)]
paddle_tuple = tuple(paddle_list)
res = torch_paddle_move_data_to_device(paddle_tuple, device=None, data_device="gpu:1")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

res = torch_paddle_move_data_to_device(paddle_tuple, device="cpu", data_device="gpu:1")
assert isinstance(res, tuple)
for r in res:
self.check_cpu(r)

res = torch_paddle_move_data_to_device(paddle_tuple, device="gpu:0", data_device=None)
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 0)

res = torch_paddle_move_data_to_device(paddle_tuple, device="gpu:1", data_device="cpu")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

def test_dict_transfer(self):
"""
Test moving a complex nested dict structure
"""

paddle_dict = {
"torch_tensor": torch.rand((3, 4)),
"torch_list": [torch.rand((6, 4, 2)) for i in range(10)],
"dict":{
"list": [paddle.rand((6, 4, 2)) for i in range(5)] + [torch.rand((6, 4, 2)) for i in range(5)],
"torch_tensor": torch.rand((3, 4)),
"paddle_tensor": paddle.rand((3, 4))
},
"paddle_tensor": paddle.rand((3, 4)),
"list": [paddle.rand((6, 4, 2)) for i in range(10)] ,
"int": 2,
"string": "test string"
}

res = torch_paddle_move_data_to_device(paddle_dict, device="gpu:0", data_device=None)
assert isinstance(res, dict)
self.check_gpu(res["torch_tensor"], 0)
self.check_gpu(res["paddle_tensor"], 0)
assert isinstance(res["torch_list"], list)
for t in res["torch_list"]:
self.check_gpu(t, 0)
assert isinstance(res["list"], list)
for t in res["list"]:
self.check_gpu(t, 0)
assert isinstance(res["int"], int)
assert isinstance(res["string"], str)
assert isinstance(res["dict"], dict)
assert isinstance(res["dict"]["list"], list)
for t in res["dict"]["list"]:
self.check_gpu(t, 0)
self.check_gpu(res["dict"]["torch_tensor"], 0)
self.check_gpu(res["dict"]["paddle_tensor"], 0)

res = torch_paddle_move_data_to_device(paddle_dict, device=None, data_device="gpu:1")
assert isinstance(res, dict)
self.check_gpu(res["torch_tensor"], 1)
self.check_gpu(res["paddle_tensor"], 1)
assert isinstance(res["torch_list"], list)
for t in res["torch_list"]:
self.check_gpu(t, 1)
assert isinstance(res["list"], list)
for t in res["list"]:
self.check_gpu(t, 1)
assert isinstance(res["int"], int)
assert isinstance(res["string"], str)
assert isinstance(res["dict"], dict)
assert isinstance(res["dict"]["list"], list)
for t in res["dict"]["list"]:
self.check_gpu(t, 1)
self.check_gpu(res["dict"]["torch_tensor"], 1)
self.check_gpu(res["dict"]["paddle_tensor"], 1)

res = torch_paddle_move_data_to_device(paddle_dict, device="cpu", data_device="gpu:0")
assert isinstance(res, dict)
self.check_cpu(res["torch_tensor"])
self.check_cpu(res["paddle_tensor"])
assert isinstance(res["torch_list"], list)
for t in res["torch_list"]:
self.check_cpu(t)
assert isinstance(res["list"], list)
for t in res["list"]:
self.check_cpu(t)
assert isinstance(res["int"], int)
assert isinstance(res["string"], str)
assert isinstance(res["dict"], dict)
assert isinstance(res["dict"]["list"], list)
for t in res["dict"]["list"]:
self.check_cpu(t)
self.check_cpu(res["dict"]["torch_tensor"])
self.check_cpu(res["dict"]["paddle_tensor"])

+ 20
- 15
tests/core/utils/test_paddle_utils.py View File

@@ -2,37 +2,42 @@ import os


import pytest


from fastNLP.core.utils.paddle_utils import _convert_data_device, paddle_to, paddle_move_data_to_device
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
if _NEED_IMPORT_PADDLE:
import paddle

@pytest.mark.parametrize(
("user_visible_devices, cuda_visible_devices, device, correct"),
(
(None, None, 1, "gpu:1"),
(None, "2,4,5,6", 2, "gpu:2"),
(None, "3,4,5", 1, "gpu:1"),
("0,1,2,3,4,5,6,7", "0", "cpu", "cpu"),
("3,4,5,6,7", "0", "cpu", "cpu"),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", "gpu:1"),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", "gpu:2"),
("3,4,5,6", "3,5", 0, "gpu:0"),
("3,6,7,8", "6,7,8", "gpu:2", "gpu:1"),
)
)
@pytest.mark.paddle
def test_convert_data_device(user_visible_devices, cuda_visible_devices, device, correct):
_cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") _cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
_user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES") _user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
res = get_device_from_visible(device, output_type)
if cuda_visible_devices is not None:
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
if user_visible_devices is not None:
os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
res = _convert_data_device(device)
assert res == correct assert res == correct


# Restore the environment variables
if _cuda_visible_devices is None:
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
else:
os.environ["CUDA_VISIBLE_DEVICES"] = _cuda_visible_devices
if _user_visible_devices is None:
os.environ.pop("USER_CUDA_VISIBLE_DEVICES", None)
else:
os.environ["USER_CUDA_VISIBLE_DEVICES"] = _user_visible_devices




Some files were not shown because too many files changed in this diff
