
Merge branch 'dev0.8.0' of github.com:fastnlp/fastNLP into dev0.8.0

tags/v1.0.0alpha | yh_cc, 2 years ago | commit 2297fcb30a
29 changed files with 596 additions and 229 deletions
  1. fastNLP/core/collators/padders/paddle_padder.py (+1 -1)
  2. fastNLP/core/dataloaders/paddle_dataloader/fdl.py (+5 -1)
  3. fastNLP/core/drivers/paddle_driver/fleet.py (+5 -12)
  4. fastNLP/core/drivers/paddle_driver/fleet_launcher.py (+0 -1)
  5. fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py (+2 -1)
  6. fastNLP/core/drivers/paddle_driver/paddle_driver.py (+3 -2)
  7. fastNLP/core/drivers/paddle_driver/single_device.py (+7 -15)
  8. fastNLP/core/drivers/paddle_driver/utils.py (+3 -38)
  9. fastNLP/core/metrics/backend/paddle_backend/backend.py (+2 -4)
  10. fastNLP/core/utils/__init__.py (+2 -1)
  11. fastNLP/core/utils/paddle_utils.py (+35 -1)
  12. tests/core/collators/padders/test_get_padder.py (+2 -2)
  13. tests/core/controllers/_test_trainer_fleet.py (+1 -1)
  14. tests/core/controllers/_test_trainer_fleet_outside.py (+2 -2)
  15. tests/core/controllers/test_trainer_paddle.py (+1 -1)
  16. tests/core/dataloaders/paddle_dataloader/test_fdl.py (+0 -8)
  17. tests/core/dataset/test_dataset.py (+0 -18)
  18. tests/core/drivers/paddle_driver/test_dist_utils.py (+3 -4)
  19. tests/core/drivers/paddle_driver/test_fleet.py (+3 -3)
  20. tests/core/drivers/paddle_driver/test_single_device.py (+4 -9)
  21. tests/core/drivers/paddle_driver/test_utils.py (+0 -20)
  22. tests/core/drivers/torch_driver/test_single_device.py (+4 -9)
  23. tests/core/utils/test_paddle_utils.py (+32 -46)
  24. tests/pytest.ini (+1 -0)
  25. tutorials/data/test4dataset.csv (+7 -0)
  26. tutorials/data/test4dataset.tsv (+7 -0)
  27. tutorials/fastnlp_tutorial_1.ipynb (+423 -29)
  28. tutorials/fastnlp_tutorial_2.ipynb (+41 -0)
  29. tutorials/figures/T1-fig-dataset-and-vocabulary.png (BIN)

fastNLP/core/collators/padders/paddle_padder.py (+1 -1)

@@ -56,7 +56,7 @@ def is_paddle_dtype_str(dtype):


def _get_dtype(ele_dtype, dtype, class_name):
if not (ele_dtype is not None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)):
if not (ele_dtype is None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)):
raise EleDtypeUnsupportedError(f"`{class_name}` only supports padding python numbers "
f"or numpy numbers or paddle.Tensor but get `{ele_dtype}`.")



fastNLP/core/dataloaders/paddle_dataloader/fdl.py (+5 -1)

@@ -8,11 +8,12 @@ from typing import Callable, List, Optional, Union, Dict, Sequence
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE

if _NEED_IMPORT_PADDLE:
from paddle.io import DataLoader, Dataset
from paddle.io import DataLoader, Dataset, Sampler
from paddle.fluid.dataloader.collate import default_collate_fn
else:
from fastNLP.core.utils.dummy_class import DummyClass as Dataset
from fastNLP.core.utils.dummy_class import DummyClass as DataLoader
from fastNLP.core.utils.dummy_class import DummyClass as Sampler

from fastNLP.core.collators.collator import Collator
from fastNLP.core.dataloaders.utils import indice_collate_wrapper
@@ -58,6 +59,9 @@ class PaddleDataLoader(DataLoader):
if batch_sampler is None:
batch_sampler = RandomBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle,
drop_last=drop_last)
batch_size = 1
shuffle = False
drop_last = False

super(PaddleDataLoader, self).__init__(dataset=dataset, feed_list=feed_list, places=places,
return_list=return_list, batch_sampler=batch_sampler,
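
For context: the reset above is needed because `paddle.io.DataLoader` rejects a `batch_sampler` combined with non-default `batch_size`/`shuffle`/`drop_last`, so once `PaddleDataLoader` builds its own `RandomBatchSampler` those arguments must fall back to their defaults. A minimal sketch of that constraint (the `ToyDataset` below is hypothetical and assumes Paddle's dynamic-graph mode):

import numpy as np
from paddle.io import BatchSampler, DataLoader, Dataset, RandomSampler

class ToyDataset(Dataset):
    # tiny dataset of 8 scalar floats, only to exercise the constructor
    def __init__(self, n=8):
        self.data = np.arange(n, dtype="float32")
    def __getitem__(self, idx):
        return self.data[idx]
    def __len__(self):
        return len(self.data)

ds = ToyDataset()
sampler = BatchSampler(sampler=RandomSampler(ds), batch_size=4, drop_last=False)
# With a batch_sampler given, DataLoader only accepts the default values below,
# which is exactly what the added lines restore before calling super().__init__.
loader = DataLoader(ds, batch_sampler=sampler, return_list=True,
                    batch_size=1, shuffle=False, drop_last=False)
print(len(loader))  # 2 batches of 4 samples each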


fastNLP/core/drivers/paddle_driver/fleet.py (+5 -12)

@@ -1,12 +1,12 @@
import os
import shutil
from typing import List, Union, Optional, Dict, Tuple, Callable

from fastNLP.core.utils.paddle_utils import get_device_from_visible

from .paddle_driver import PaddleDriver
from .fleet_launcher import FleetLauncher
from .utils import (
_FleetWrappingModel,
get_device_from_visible,
reset_seed,
replace_sampler,
replace_batch_sampler,
@@ -17,8 +17,8 @@ from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.utils import (
auto_param_call,
check_user_specific_params,
paddle_move_data_to_device,
is_in_paddle_dist
is_in_paddle_dist,
is_in_paddle_dist,
)
from fastNLP.envs.distributed import rank_zero_rm
from fastNLP.core.samplers import (
@@ -609,12 +609,6 @@ class PaddleFleetDriver(PaddleDriver):
def is_distributed(self):
return True

def move_data_to_device(self, batch: 'paddle.Tensor'):
device = self.data_device
# CUDA_VISIBLE_DEVICES has been set, which may cause errors
device = get_device_from_visible(device)
return paddle_move_data_to_device(batch, device)

@staticmethod
def _check_optimizer_legality(optimizers):
# paddle provides a function to set up distributed optimizers; it returns fleet.meta_optimizers.HybridParallelOptimizer
@@ -637,9 +631,8 @@ class PaddleFleetDriver(PaddleDriver):
:return: If the current driver is not distributed, the input obj is returned directly. If the current rank is a receiver (its global rank is contained in dst), the
received parameters are returned; if it is the source, the broadcast content is returned; if it is neither sender nor receiver, None is returned.
"""
device = self.data_device
# CUDA_VISIBLE_DEVICES has been set, which may cause errors
device = get_device_from_visible(device)
device = get_device_from_visible(self.data_device)
return fastnlp_paddle_broadcast_object(obj, src, device=device, group=group)

def all_gather(self, obj, group=None) -> List:


fastNLP/core/drivers/paddle_driver/fleet_launcher.py (+0 -1)

@@ -10,7 +10,6 @@ from fastNLP.envs.env import (
FASTNLP_DISTRIBUTED_CHECK,
FASTNLP_LOG_LEVEL,
FASTNLP_GLOBAL_SEED,
USER_CUDA_VISIBLE_DEVICES,
)
from .utils import (
find_free_ports,


fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py (+2 -1)

@@ -42,7 +42,8 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[

user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
if user_visible_devices is None:
raise RuntimeError("This situation cannot happen, please report a bug to us.")
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
_could_use_device_num = len(user_visible_devices.split(","))
if isinstance(device, int):
if device < 0 and device != -1:
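
The rewritten error message points at the required setup: the backend has to be chosen before fastNLP is imported, so that fastNLP can record `USER_CUDA_VISIBLE_DEVICES` and rewrite `CUDA_VISIBLE_DEVICES`. A hedged sketch of that setup in Python, equivalent to exporting the variable in the shell (as the updated test docstrings below do with `FASTNLP_BACKEND=paddle python -m paddle.distributed.launch ...`):

import os

# Select the paddle backend before fastNLP is imported; otherwise
# USER_CUDA_VISIBLE_DEVICES is never populated and initialize_paddle_driver
# raises the RuntimeError shown above.
os.environ["FASTNLP_BACKEND"] = "paddle"

import fastNLP  # noqa: E402  (deliberately imported after the variable is set)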


fastNLP/core/drivers/paddle_driver/paddle_driver.py (+3 -2)

@@ -10,7 +10,7 @@ import numpy as np
from .utils import _build_fp16_env, optimizer_state_to_device, DummyGradScaler
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.drivers.driver import Driver
from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device
from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device, get_device_from_visible
from fastNLP.envs import (
FASTNLP_SEED_WORKERS,
FASTNLP_MODEL_FILENAME,
@@ -394,7 +394,8 @@ class PaddleDriver(Driver):

:return: the batch object after being moved to the specified device;
"""
return paddle_move_data_to_device(batch, self.data_device)
device = get_device_from_visible(self.data_device)
return paddle_move_data_to_device(batch, device)

@staticmethod
def worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover


fastNLP/core/drivers/paddle_driver/single_device.py (+7 -15)

@@ -2,14 +2,14 @@ import os
from typing import Optional, Dict, Union, Callable, Tuple

from .paddle_driver import PaddleDriver
from .utils import replace_batch_sampler, replace_sampler, get_device_from_visible
from .utils import replace_batch_sampler, replace_sampler
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
from fastNLP.core.utils import (
auto_param_call,
get_device_from_visible,
get_paddle_gpu_str,
get_paddle_device_id,
paddle_move_data_to_device,
)
from fastNLP.core.utils.utils import _get_fun_msg
from fastNLP.core.samplers import (
@@ -39,6 +39,9 @@ class PaddleSingleDriver(PaddleDriver):
raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`")

cuda_visible_devices = os.environ.get(USER_CUDA_VISIBLE_DEVICES, None)
if cuda_visible_devices is None:
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
if cuda_visible_devices == "":
device = "cpu"
logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
@@ -54,7 +57,7 @@ class PaddleSingleDriver(PaddleDriver):
device_id = device
else:
device_id = get_paddle_device_id(device)
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ[USER_CUDA_VISIBLE_DEVICES].split(",")[device_id]
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices.split(",")[device_id]
self.model_device = get_paddle_gpu_str(device)

self.local_rank = 0
@@ -65,8 +68,7 @@ class PaddleSingleDriver(PaddleDriver):
r"""
This function initializes the training environment: it sets the device used for the current training run and moves the model onto that device.
"""
device = self.model_device
device = get_device_from_visible(device, output_type=str)
device = get_device_from_visible(self.model_device, output_type=str)
paddle.device.set_device(device)
self.model.to(device)

@@ -121,16 +123,6 @@ class PaddleSingleDriver(PaddleDriver):
else:
raise RuntimeError(f"There is no `{fn}` method in your {type(self.model)}.")

def move_data_to_device(self, batch: 'paddle.Tensor'):
r"""
Move the data to the specified device; batch may be a list or a dict, or a nested structure of them.
Note that using this in Paddle may cause problems due to inconsistency with the device that has been set.

:return: the batch object after being moved to the specified device;
"""
device = get_device_from_visible(self.data_device)
return paddle_move_data_to_device(batch, device)

def set_dist_repro_dataloader(self, dataloader, dist: Union[str, ReproducibleBatchSampler, ReproducibleSampler]=None,
reproducible: bool = False):
r"""


fastNLP/core/drivers/paddle_driver/utils.py (+3 -38)

@@ -6,12 +6,11 @@ import inspect
import numpy as np
from copy import deepcopy
from contextlib import ExitStack, closing
from enum import IntEnum
from typing import Dict, Optional, Union
from typing import Dict, Optional

from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.utils import get_paddle_device_id, auto_param_call, paddle_to
from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS, USER_CUDA_VISIBLE_DEVICES
from fastNLP.core.utils import auto_param_call, paddle_to
from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS
from fastNLP.core.log import logger


@@ -173,40 +172,6 @@ def find_free_ports(num):

return None

def get_device_from_visible(device: Union[str, int], output_type=int):
"""
Get the corresponding device index when CUDA_VISIBLE_DEVICES is set.
For example, with CUDA_VISIBLE_DEVICES=2,3 and device=3, 1 is returned.

:param device: the device name before conversion
:param output_type: the type of the return value
:return: the converted device id
"""
if output_type not in [int, str]:
raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
if device == "cpu":
return device
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
idx = get_paddle_device_id(device)
if cuda_visible_devices is None or cuda_visible_devices == "":
# this branch should normally never be taken, because fastnlp forcibly injects CUDA_VISIBLE_DEVICES for paddle
raise RuntimeError("This situation should not happen, please report us this bug.")
else:
# use USER_CUDA_VISIBLE_DEVICES to obtain the devices the user expects
user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
if user_visible_devices is None:
raise RuntimeError("This situation cannot happen, please report a bug to us.")
idx = user_visible_devices.split(",")[idx]

cuda_visible_devices_list = cuda_visible_devices.split(',')
if idx not in cuda_visible_devices_list:
raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}].")
res = cuda_visible_devices_list.index(idx)
if output_type == int:
return res
else:
return f"gpu:{res}"

def replace_batch_sampler(dataloader: "DataLoader", batch_sampler: "BatchSampler"):
"""
Rebuild a DataLoader from `batch_sampler`, replacing the `batch_sampler` without affecting the original `dataloader`.


fastNLP/core/metrics/backend/paddle_backend/backend.py (+2 -4)

@@ -1,11 +1,10 @@
from typing import List, Optional, Any
from typing import List, Any

import numpy as np

from fastNLP.core.metrics.backend import Backend
from fastNLP.core.utils.paddle_utils import paddle_to
from fastNLP.core.utils.paddle_utils import paddle_to, get_device_from_visible
from fastNLP.core.metrics.utils import AggregateMethodError
from fastNLP.core.drivers.paddle_driver.utils import get_device_from_visible
from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE

@@ -80,7 +79,6 @@ class PaddleBackend(Backend):
raise ValueError(f"tensor: {tensor} can not convert to ndarray!")

def move_tensor_to_device(self, tensor, device):
# TODO if we handle it here, will it cause bugs elsewhere?
device = get_device_from_visible(device)
return paddle_to(tensor, device)



fastNLP/core/utils/__init__.py (+2 -1)

@@ -2,6 +2,7 @@ __all__ = [
'cache_results',
'is_jittor_dataset',
'jittor_collate_wraps',
'get_device_from_visible',
'paddle_to',
'paddle_move_data_to_device',
'get_paddle_device_id',
@@ -27,7 +28,7 @@ __all__ = [

from .cache_results import cache_results
from .jittor_utils import is_jittor_dataset, jittor_collate_wraps
from .paddle_utils import paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
from .paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
is_in_fnlp_paddle_dist, is_in_paddle_launch_dist
from .rich_progress import f_rich_progress
from .torch_paddle_utils import torch_paddle_move_data_to_device


fastNLP/core/utils/paddle_utils.py (+35 -1)

@@ -1,4 +1,5 @@
__all__ = [
"get_device_from_visible",
"paddle_to",
"paddle_move_data_to_device",
"get_paddle_gpu_str",
@@ -13,13 +14,45 @@ import re
from typing import Any, Optional, Union

from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH, USER_CUDA_VISIBLE_DEVICES

if _NEED_IMPORT_PADDLE:
import paddle

from .utils import apply_to_collection

def get_device_from_visible(device: Union[str, int], output_type=int):
"""
Get the corresponding device index when CUDA_VISIBLE_DEVICES is set.
For example, with CUDA_VISIBLE_DEVICES=2,3 and device=3, 1 is returned.

:param device: the device name before conversion
:param output_type: the type of the return value
:return: the converted device id
"""
if output_type not in [int, str]:
raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
if device == "cpu":
return device
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
if user_visible_devices is None:
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
idx = get_paddle_device_id(device)
# use USER_CUDA_VISIBLE_DEVICES to obtain the devices the user expects
if user_visible_devices is None:
raise RuntimeError("This situation cannot happen, please report a bug to us.")
idx = user_visible_devices.split(",")[idx]

cuda_visible_devices_list = cuda_visible_devices.split(',')
if idx not in cuda_visible_devices_list:
raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}]. ")
res = cuda_visible_devices_list.index(idx)
if output_type == int:
return res
else:
return f"gpu:{res}"

def paddle_to(data, device: Union[str, int]):
"""
@@ -33,6 +66,7 @@ def paddle_to(data, device: Union[str, int]):
if device == "cpu":
return data.cpu()
else:
# device = get_device_from_visible(device, output_type=int)
return data.cuda(get_paddle_device_id(device))
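
For reference, the relocated `get_device_from_visible` maps a user-facing device to its index inside the current `CUDA_VISIBLE_DEVICES`, using `USER_CUDA_VISIBLE_DEVICES` to recover the ids the user originally exported. A small usage sketch; the environment values mirror the parametrized cases added to tests/core/utils/test_paddle_utils.py below:

import os
from fastNLP.core.utils import get_device_from_visible

# The user exported eight GPUs; fastNLP later narrowed CUDA_VISIBLE_DEVICES to 3,4,5.
os.environ["USER_CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"

print(get_device_from_visible("gpu:4", output_type=int))  # 1
print(get_device_from_visible("gpu:5", output_type=str))  # "gpu:2"
print(get_device_from_visible("cpu"))                     # "cpu"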




tests/core/collators/padders/test_get_padder.py (+2 -2)

@@ -14,10 +14,10 @@ def test_get_element_shape_dtype():
catalog = _get_element_shape_dtype([np.zeros(3), np.zeros((2, 1))])


@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
# @pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'paddle'])
@pytest.mark.torch
@pytest.mark.paddle
@pytest.mark.jittor
def test_get_padder_run(backend):
if not _NEED_IMPORT_TORCH and backend == 'torch':
pytest.skip("No torch")


tests/core/controllers/_test_trainer_fleet.py (+1 -1)

@@ -1,7 +1,7 @@
"""
This file tests the case where the user launches with python -m paddle.distributed.launch,
to see whether there is any chance of running it through pytest
python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py
"""
import os
import sys


tests/core/controllers/_test_trainer_fleet_outside.py (+2 -2)

@@ -1,7 +1,7 @@
"""
This file tests the case where the user launches with python -m paddle.distributed.launch
and initializes fleet by themselves
python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py
"""
import os
import sys
@@ -93,5 +93,5 @@ if __name__ == "__main__":
driver=driver,
device=device,
callbacks=callbacks,
n_epochs=30,
n_epochs=5,
)

tests/core/controllers/test_trainer_paddle.py (+1 -1)

@@ -27,7 +27,7 @@ class TrainPaddleConfig:
@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])])
# @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])])
@pytest.mark.parametrize("callbacks", [[RichCallback(5)]])
@pytest.mark.paddle
@pytest.mark.paddledist
@magic_argv_env_context
def test_trainer_paddle(
driver,


tests/core/dataloaders/paddle_dataloader/test_fdl.py (+0 -8)

@@ -58,11 +58,3 @@ class TestPaddle:
for batch in fdl1:
assert batch['image'].shape == [4, 10, 5]
print(batch)

def test_v2(self):
from fastNLP.core.collators import Collator
logger.setLevel("DEBUG")
data = [paddle.Tensor(np.random.random((10, 5)).astype('float32')), paddle.Tensor(np.random.random((10, 5)).astype('float32'))]
col = Collator(backend="jittor")
res = col(data)
print(res)

tests/core/dataset/test_dataset.py (+0 -18)

@@ -370,29 +370,11 @@ class TestDataSetMethods:
assert os.path.exists("1.csv") == True
os.remove("1.csv")

def test_add_collate_fn(self):
ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]})

def collate_fn(item):
return item

ds.add_collate_fn(collate_fn)

def test_get_collator(self):
from typing import Callable
ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]})
collate_fn = ds.get_collator()
assert isinstance(collate_fn, Callable) == True

def test_add_seq_len(self):
ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]})
ds.add_seq_len('x')
print(ds)

def test_set_target(self):
ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]})
ds.set_target('x')


class TestFieldArrayInit:
"""


tests/core/drivers/paddle_driver/test_dist_utils.py (+3 -4)

@@ -19,7 +19,7 @@ if _NEED_IMPORT_PADDLE:
import paddle
import paddle.distributed as dist

@pytest.mark.paddle
@pytest.mark.paddledist
class TestDistUtilsTools:
"""
Test some utility functions
@@ -79,14 +79,13 @@ class TestDistUtilsTools:
assert res["int"] == paddle_dict["int"]
assert res["string"] == paddle_dict["string"]


@pytest.mark.paddle
@pytest.mark.paddledist
class TestAllGatherAndBroadCast:

@classmethod
def setup_class(cls):
devices = [0,1,2]
output_from_new_proc = "only_error"
output_from_new_proc = "all"

launcher = FleetLauncher(devices=devices, output_from_new_proc=output_from_new_proc)
cls.local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", "0"))


tests/core/drivers/paddle_driver/test_fleet.py (+3 -3)

@@ -39,7 +39,7 @@ def generate_driver(num_labels, feature_dimension, device=[0,1], fp16=False, out
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestFleetDriverFunction:
"""
Test class for some simple PaddleFleetDriver functions; it mostly checks that they run and that there are no import errors, etc.
@@ -147,7 +147,7 @@ class TestFleetDriverFunction:
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestSetDistReproDataloader:

@classmethod
@@ -521,7 +521,7 @@ class TestSetDistReproDataloader:
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestSaveLoad:
"""
Test the behaviour of the save- and load-related functions in the multi-GPU case


tests/core/drivers/paddle_driver/test_single_device.py (+4 -9)

@@ -552,22 +552,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"):

return driver

@pytest.fixture
def prepare_test_save_load():
dataset = PaddleRandomMaxDataset(40, 10)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10)
return driver1, driver2, dataloader

@pytest.mark.paddle
@pytest.mark.parametrize("only_state_dict", ([True, False]))
def test_save_and_load_model(prepare_test_save_load, only_state_dict):
def test_save_and_load_model(only_state_dict):
"""
Test the save_model and load_model functions
"""
try:
path = "model"
driver1, driver2, dataloader = prepare_test_save_load
dataset = PaddleRandomMaxDataset(40, 10)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10, device="gpu"), generate_random_driver(10, 10, device="gpu")

if only_state_dict:
driver1.save_model(path, only_state_dict)


tests/core/drivers/paddle_driver/test_utils.py (+0 -20)

@@ -1,8 +1,6 @@
import os
import pytest

from fastNLP.core.drivers.paddle_driver.utils import (
get_device_from_visible,
replace_batch_sampler,
replace_sampler,
)
@@ -14,24 +12,6 @@ if _NEED_IMPORT_PADDLE:

from tests.helpers.datasets.paddle_data import PaddleNormalDataset

@pytest.mark.parametrize(
("user_visible_devices, cuda_visible_devices, device, output_type, correct"),
(
("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"),
("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"),
("3,4,5,6", "3,5", 0, int, 0),
("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"),
)
)
@pytest.mark.paddle
def test_get_device_from_visible_str(user_visible_devices, cuda_visible_devices, device, output_type, correct):
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
res = get_device_from_visible(device, output_type)
assert res == correct

@pytest.mark.paddle
def test_replace_batch_sampler():
dataset = PaddleNormalDataset(10)


tests/core/drivers/torch_driver/test_single_device.py (+4 -9)

@@ -545,22 +545,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"):

return driver

@pytest.fixture
def prepare_test_save_load():
dataset = TorchArgMaxDataset(10, 40)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10)
return driver1, driver2, dataloader

@pytest.mark.torch
@pytest.mark.parametrize("only_state_dict", ([True, False]))
def test_save_and_load_model(prepare_test_save_load, only_state_dict):
def test_save_and_load_model(only_state_dict):
"""
Test the save_model and load_model functions
"""
try:
path = "model"
driver1, driver2, dataloader = prepare_test_save_load
dataset = TorchArgMaxDataset(10, 40)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10)

driver1.save_model(path, only_state_dict)
driver2.load_model(path, only_state_dict)


tests/core/utils/test_paddle_utils.py (+32 -46)

@@ -1,10 +1,40 @@
import os

import pytest

from fastNLP.core.utils.paddle_utils import paddle_to, paddle_move_data_to_device
from fastNLP.core.utils.paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
if _NEED_IMPORT_PADDLE:
import paddle

@pytest.mark.parametrize(
("user_visible_devices, cuda_visible_devices, device, output_type, correct"),
(
("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"),
("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"),
("3,4,5,6", "3,5", 0, int, 0),
("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"),
)
)
@pytest.mark.paddle
def test_get_device_from_visible(user_visible_devices, cuda_visible_devices, device, output_type, correct):
_cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
_user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
res = get_device_from_visible(device, output_type)
assert res == correct

# restore the environment variables
if _cuda_visible_devices is None:
del os.environ["CUDA_VISIBLE_DEVICES"]
else:
os.environ["CUDA_VISIBLE_DEVICES"] = _cuda_visible_devices
if _user_visible_devices is None:
del os.environ["USER_CUDA_VISIBLE_DEVICES"]
else:
os.environ["USER_CUDA_VISIBLE_DEVICES"] = _user_visible_devices

############################################################################
#
@@ -22,12 +52,6 @@ class TestPaddleToDevice:
assert res.place.gpu_device_id() == 0
res = paddle_to(tensor, "cpu")
assert res.place.is_cpu_place()
res = paddle_to(tensor, "gpu:2")
assert res.place.is_gpu_place()
assert res.place.gpu_device_id() == 2
res = paddle_to(tensor, "gpu:1")
assert res.place.is_gpu_place()
assert res.place.gpu_device_id() == 1

############################################################################
#
@@ -64,28 +88,18 @@ class TestPaddleMoveDataToDevice:
res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device=None)
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device="gpu:1", data_device=None)
self.check_gpu(res, 1)

res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device="cpu")
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:0")
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:1")
self.check_gpu(res, 1)

def test_list_transfer(self):
"""
Test moving a list of tensors
"""

paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)]
res = paddle_move_data_to_device(paddle_list, device=None, data_device="gpu:1")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

res = paddle_move_data_to_device(paddle_list, device="cpu", data_device="gpu:1")
assert isinstance(res, list)
@@ -97,11 +111,6 @@ class TestPaddleMoveDataToDevice:
for r in res:
self.check_gpu(r, 0)

res = paddle_move_data_to_device(paddle_list, device="gpu:1", data_device="cpu")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

def test_tensor_tuple_transfer(self):
"""
Test moving a tuple of tensors
@@ -109,10 +118,6 @@ class TestPaddleMoveDataToDevice:

paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)]
paddle_tuple = tuple(paddle_list)
res = paddle_move_data_to_device(paddle_tuple, device=None, data_device="gpu:1")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

res = paddle_move_data_to_device(paddle_tuple, device="cpu", data_device="gpu:1")
assert isinstance(res, tuple)
@@ -124,11 +129,6 @@ class TestPaddleMoveDataToDevice:
for r in res:
self.check_gpu(r, 0)

res = paddle_move_data_to_device(paddle_tuple, device="gpu:1", data_device="cpu")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

def test_dict_transfer(self):
"""
Test moving a dict structure
@@ -173,20 +173,6 @@ class TestPaddleMoveDataToDevice:
self.check_gpu(t, 0)
self.check_gpu(res["dict"]["tensor"], 0)

res = paddle_move_data_to_device(paddle_dict, device=None, data_device="gpu:1")
assert isinstance(res, dict)
self.check_gpu(res["tensor"], 1)
assert isinstance(res["list"], list)
for t in res["list"]:
self.check_gpu(t, 1)
assert isinstance(res["int"], int)
assert isinstance(res["string"], str)
assert isinstance(res["dict"], dict)
assert isinstance(res["dict"]["list"], list)
for t in res["dict"]["list"]:
self.check_gpu(t, 1)
self.check_gpu(res["dict"]["tensor"], 1)

res = paddle_move_data_to_device(paddle_dict, device="cpu", data_device="gpu:0")
assert isinstance(res, dict)
self.check_cpu(res["tensor"])


tests/pytest.ini (+1 -0)

@@ -2,5 +2,6 @@
markers =
torch
paddle
paddledist
jittor
torchpaddle
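
The new `paddledist` marker splits the multi-GPU fleet tests (see the `@pytest.mark.paddledist` changes above) from the plain `paddle` ones, so they can be selected or skipped explicitly. A hedged example of invoking them programmatically; the path is only illustrative:

import pytest

# Equivalent to `pytest -m paddledist tests/core/drivers/paddle_driver` on the command
# line; running these tests still needs several visible GPUs and FASTNLP_BACKEND=paddle.
pytest.main(["-m", "paddledist", "tests/core/drivers/paddle_driver"])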

tutorials/data/test4dataset.csv (+7 -0)

@@ -0,0 +1,7 @@
,SentenceId,Sentence,Sentiment
0,1,"['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']",negative
1,2,"['this', 'quiet', ',', 'introspective', 'and', 'entertaining', 'independent', 'is', 'worth', 'seeking', '.']",positive
2,3,"['even', 'fans', 'of', 'ismail', 'merchant', ""'s"", 'work', ',', 'i', 'suspect', ',', 'would', 'have', 'a', 'hard', 'time', 'sitting', 'through', 'this', 'one', '.']",negative
3,4,"['a', 'positively', 'thrilling', 'combination', 'of', 'ethnography', 'and', 'all', 'the', 'intrigue', ',', 'betrayal', ',', 'deceit', 'and', 'murder', 'of', 'a', 'shakespearean', 'tragedy', 'or', 'a', 'juicy', 'soap', 'opera', '.']",neutral
4,5,"['a', 'comedy-drama', 'of', 'nearly', 'epic', 'proportions', 'rooted', 'in', 'a', 'sincere', 'performance', 'by', 'the', 'title', 'character', 'undergoing', 'midlife', 'crisis', '.']",positive
5,6,"['the', 'importance', 'of', 'being', 'earnest', ',', 'so', 'thick', 'with', 'wit', 'it', 'plays', 'like', 'a', 'reading', 'from', 'bartlett', ""'s"", 'familiar', 'quotations']",neutral

tutorials/data/test4dataset.tsv (+7 -0)

@@ -0,0 +1,7 @@
SentenceId Sentence Sentiment
1 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . negative
2 This quiet , introspective and entertaining independent is worth seeking . positive
3 Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . negative
4 A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . neutral
5 A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . positive
6 The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations neutral

tutorials/fastnlp_tutorial_1.ipynb (+423 -29)

@@ -153,7 +153,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1969418794120 1971237588872\n",
"2438703969992 2438374526920\n",
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
@@ -198,7 +198,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1971237588872 1971237588872\n",
"2438374526920 2438374526920\n",
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
@@ -774,9 +774,9 @@
{
"data": {
"text/plain": [
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d08>,\n",
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d88>,\n",
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879e08>}"
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d388>,\n",
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d408>,\n",
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d488>}"
]
},
"execution_count": 15,
@@ -923,7 +923,8 @@
"output_type": "stream",
"text": [
"5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n"
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n",
"6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n"
]
}
],
@@ -931,7 +932,8 @@
"vocab.add_word_lst(['生活', '就像', '海洋'])\n",
"print(len(vocab), vocab.word_count)\n",
"vocab.add_word('只有')\n",
"print(len(vocab), vocab.word_count)"
"print(len(vocab), vocab.word_count)\n",
"print(len(vocab), vocab.word2idx)"
]
},
{
@@ -959,7 +961,6 @@
"<pad> 0\n",
"<unk> 1\n",
"生活 2\n",
"只有 5\n",
"彼岸 1 False\n"
]
}
@@ -968,7 +969,6 @@
"print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
"print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
"print(vocab.to_word(5), vocab.to_index('只有'))\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
]
},
@@ -979,7 +979,9 @@
"source": [
"**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`方法看到相应单词被添加的次数**\n",
"\n",
"&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询"
"&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询\n",
"\n",
"&emsp; 注:**使用`add_word_lst`添加单词**,**单词对应序号不会动态调整**,**使用`dataset`添加单词的情况不同**"
]
},
{
@@ -992,15 +994,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
"13 Counter({'生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '人': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
"彼岸 12 True\n"
"生活 2\n",
"彼岸 12 True\n",
"13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
"13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n"
]
}
],
"source": [
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '才', '能', '到达', '彼岸'])\n",
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n",
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))\n",
"print(len(vocab), vocab.word_count)\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
"print(len(vocab), vocab.word2idx)"
]
},
{
@@ -1082,52 +1088,440 @@
"## 3 dataset 和 vocabulary 的组合使用\n",
" \n",
"### 3.1 从 dataframe 中加载 dataset\n",
"\n"
"\n",
"以下通过 [NLP-beginner](https://github.com/FudanNLP/nlp-beginner) 实践一中 [Rotten Tomatoes 影评数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 的部分训练数据组成`test4dataset.tsv`文件\n",
"\n",
"&emsp; 介绍如何使用`dataset`、`vocabulary`简单加载并处理数据集,首先使用`pandas`模块,读取原始数据的`dataframe`"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "3dbd985d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SentenceId</th>\n",
" <th>Sentence</th>\n",
" <th>Sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>A series of escapades demonstrating the adage ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>This quiet , introspective and entertaining in...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>A positively thrilling combination of ethnogra...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>A comedy-drama of nearly epic proportions root...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>The Importance of Being Earnest , so thick wit...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SentenceId Sentence Sentiment\n",
"0 1 A series of escapades demonstrating the adage ... negative\n",
"1 2 This quiet , introspective and entertaining in... positive\n",
"2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n",
"3 4 A positively thrilling combination of ethnogra... neutral\n",
"4 5 A comedy-drama of nearly epic proportions root... positive\n",
"5 6 The Importance of Being Earnest , so thick wit... neutral"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv('./data/test4dataset.tsv', sep='\\t')\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "89059713",
"id": "919ab350",
"metadata": {},
"source": []
"source": [
"接着,通过`dataset`中的`from_pandas`方法填充数据集,并使用`apply_more`方法对文本进行分词操作"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3dbd985d",
"execution_count": 25,
"id": "4f634586",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | ['a', 'series', 'of', 'es... | negative |\n",
"| 2 | ['this', 'quiet', ',', 'i... | positive |\n",
"| 3 | ['even', 'fans', 'of', 'i... | negative |\n",
"| 4 | ['a', 'positively', 'thri... | neutral |\n",
"| 5 | ['a', 'comedy-drama', 'of... | positive |\n",
"| 6 | ['the', 'importance', 'of... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"from fastNLP.core.dataset import DataSet\n",
"\n",
"dataset = DataSet()\n",
"dataset = dataset.from_pandas(df)\n",
"dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n",
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "5c1ae192",
"metadata": {},
"source": [
"&emsp; 如果需要保存中间结果,也可以使用`dataset`的`to_csv`方法,生成`.csv`或`.tsv`文件"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f634586",
"execution_count": 26,
"id": "46722efc",
"metadata": {},
"outputs": [],
"source": []
"source": [
"dataset.to_csv('./data/test4dataset.csv')"
]
},
{
"cell_type": "markdown",
"id": "5ba13989",
"metadata": {},
"source": [
"### 3.2 从 dataset 中获取 vocabulary"
"### 3.2 从 dataset 中获取 vocabulary\n",
"\n",
"然后,初始化`vocabulary`,使用`vocabulary`中的`from_dataset`方法,从`dataset`的指定字段中\n",
"\n",
"&emsp; 获取字段中的所有元素,然后编号;如果指定字段是个列表,则针对字段中所有列表包含的元素编号\n",
"\n",
"&emsp; 注:**使用`dataset`添加单词**,**不同于`add_word_list`**,**单词被添加次数越多**,**序号越靠前**,例如案例中的`a`"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"id": "a2de615b",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n",
"\n",
"{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n",
"\n",
"Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n"
]
}
],
"source": [
"from fastNLP.core.vocabulary import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab = vocab.from_dataset(dataset, field_name='Sentence')\n",
"print(vocab.word_count, '\\n')\n",
"print(vocab.word2idx, '\\n')\n",
"print(vocab)"
]
},
{
"cell_type": "markdown",
"id": "f0857ccb",
"metadata": {},
"source": [
"之后,**通过`vocabulary`的`index_dataset`方法**,**调整`dataset`中指定字段的元素**,**使用编号将之代替**\n",
"\n",
"&emsp; 使用上述方法,可以将影评数据集中的单词序列转化为词编号序列,为接下来转化为词嵌入序列做准备"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"id": "2f9a04b2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"vocab.index_dataset(dataset, field_name='Sentence')\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "6b26b707",
"metadata": {},
"source": [
"最后,使用相同方法,再将`dataset`中`Sentiment`字段中的`negative`、`neutral`、`positive`转化为数字编号"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "5f5eed18",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'negative': 0, 'positive': 1, 'neutral': 2}\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
"target_vocab.from_dataset(dataset, field_name='Sentiment')\n",
"print(target_vocab.word2idx)\n",
"target_vocab.index_dataset(dataset, field_name='Sentiment')\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "eed7ea64",
"metadata": {},
"source": [
"在最后的最后,通过以下的一张图,来总结本章关于`dataset`和`vocabulary`主要知识点的讲解,以及两者的联系\n",
"\n",
"<img src=\"./figures/T1-fig-dataset-and-vocabulary.png\" width=\"80%\" height=\"80%\" align=\"center\"></img>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35b4f0f7",
"metadata": {},
"outputs": [],
"source": []
}


tutorials/fastnlp_tutorial_2.ipynb (+41 -0)

@@ -0,0 +1,41 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

tutorials/figures/T1-fig-dataset-and-vocabulary.png (BIN)

Width: 1326  |  Height: 701  |  Size: 139 kB
