Browse Source

Merge branch 'dev0.8.0' of github.com:fastnlp/fastNLP into dev0.8.0

tags/v1.0.0alpha
MorningForest 3 years ago
parent
commit
736058e723
36 changed files with 804 additions and 274 deletions
  1. +8
    -0
      fastNLP/core/__init__.py
  2. +4
    -0
      fastNLP/core/callbacks/__init__.py
  3. +4
    -5
      fastNLP/core/callbacks/more_evaluate_callback.py
  4. +1
    -1
      fastNLP/core/collators/padders/paddle_padder.py
  5. +1
    -3
      fastNLP/core/controllers/trainer.py
  6. +5
    -1
      fastNLP/core/dataloaders/paddle_dataloader/fdl.py
  7. +2
    -2
      fastNLP/core/dataloaders/torch_dataloader/fdl.py
  8. +5
    -12
      fastNLP/core/drivers/paddle_driver/fleet.py
  9. +0
    -1
      fastNLP/core/drivers/paddle_driver/fleet_launcher.py
  10. +2
    -1
      fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py
  11. +3
    -2
      fastNLP/core/drivers/paddle_driver/paddle_driver.py
  12. +7
    -15
      fastNLP/core/drivers/paddle_driver/single_device.py
  13. +3
    -38
      fastNLP/core/drivers/paddle_driver/utils.py
  14. +2
    -4
      fastNLP/core/metrics/backend/paddle_backend/backend.py
  15. +2
    -1
      fastNLP/core/utils/__init__.py
  16. +61
    -35
      fastNLP/core/utils/cache_results.py
  17. +35
    -1
      fastNLP/core/utils/paddle_utils.py
  18. +2
    -2
      tests/core/collators/padders/test_get_padder.py
  19. +1
    -1
      tests/core/controllers/_test_trainer_fleet.py
  20. +2
    -2
      tests/core/controllers/_test_trainer_fleet_outside.py
  21. +1
    -1
      tests/core/controllers/test_trainer_paddle.py
  22. +0
    -8
      tests/core/dataloaders/paddle_dataloader/test_fdl.py
  23. +0
    -18
      tests/core/dataset/test_dataset.py
  24. +3
    -4
      tests/core/drivers/paddle_driver/test_dist_utils.py
  25. +3
    -3
      tests/core/drivers/paddle_driver/test_fleet.py
  26. +4
    -9
      tests/core/drivers/paddle_driver/test_single_device.py
  27. +0
    -20
      tests/core/drivers/paddle_driver/test_utils.py
  28. +4
    -9
      tests/core/drivers/torch_driver/test_single_device.py
  29. +128
    -0
      tests/core/utils/test_cache_results.py
  30. +32
    -46
      tests/core/utils/test_paddle_utils.py
  31. +1
    -0
      tests/pytest.ini
  32. +7
    -0
      tutorials/data/test4dataset.csv
  33. +7
    -0
      tutorials/data/test4dataset.tsv
  34. +423
    -29
      tutorials/fastnlp_tutorial_1.ipynb
  35. +41
    -0
      tutorials/fastnlp_tutorial_2.ipynb
  36. BIN
      tutorials/figures/T1-fig-dataset-and-vocabulary.png

+ 8
- 0
fastNLP/core/__init__.py View File

@@ -14,6 +14,8 @@ __all__ = [
'MoreEvaluateCallback',
"TorchWarmupCallback",
"TorchGradClipCallback",
"MonitorUtility",
'HasMonitorCallback',

# collators
'Collator',
@@ -40,6 +42,12 @@ __all__ = [
'Trainer',

# dataloaders TODO 需要把 mix_dataloader 的搞定
'TorchDataLoader',
'PaddleDataLoader',
'JittorDataLoader',
'prepare_jittor_dataloader',
'prepare_paddle_dataloader',
'prepare_torch_dataloader',

# dataset
'DataSet',


+ 4
- 0
fastNLP/core/callbacks/__init__.py View File

@@ -15,6 +15,9 @@ __all__ = [

"TorchWarmupCallback",
"TorchGradClipCallback",

"MonitorUtility",
'HasMonitorCallback'
]


@@ -28,4 +31,5 @@ from .load_best_model_callback import LoadBestModelCallback
from .early_stop_callback import EarlyStopCallback
from .torch_callbacks import *
from .more_evaluate_callback import MoreEvaluateCallback
from .has_monitor_callback import MonitorUtility, HasMonitorCallback


+ 4
- 5
fastNLP/core/callbacks/more_evaluate_callback.py View File

@@ -66,7 +66,6 @@ class MoreEvaluateCallback(HasMonitorCallback):
raise RuntimeError("`evaluate_every` and `watch_monitor` cannot be None at the same time.")
if watch_monitor is not None and evaluate_every is not None:
raise RuntimeError("`evaluate_every` and `watch_monitor` cannot be set at the same time.")
self.watch_monitor = watch_monitor

if topk_monitor is not None and topk == 0:
raise RuntimeError("`topk_monitor` is set, but `topk` is 0.")
@@ -93,8 +92,8 @@ class MoreEvaluateCallback(HasMonitorCallback):

def on_after_trainer_initialized(self, trainer, driver):
# 如果是需要 watch 的,不能没有 evaluator
if self.watch_monitor is not None:
assert trainer.evaluator is not None, f"You set `watch_monitor={self.watch_monitor}`, but no " \
if self.monitor is not None:
assert trainer.evaluator is not None, f"You set `watch_monitor={self.monitor}`, but no " \
f"evaluate_dataloaders is provided in Trainer."

if trainer.evaluate_fn is self.evaluate_fn:
@@ -134,7 +133,7 @@ class MoreEvaluateCallback(HasMonitorCallback):
self.topk_saver.save_topk(trainer, results)

def on_train_epoch_end(self, trainer):
if self.watch_monitor is not None:
if self.monitor is not None:
return
if isinstance(self.evaluate_every, int) and self.evaluate_every < 0:
evaluate_every = -self.evaluate_every
@@ -143,7 +142,7 @@ class MoreEvaluateCallback(HasMonitorCallback):
self.topk_saver.save_topk(trainer, results)

def on_train_batch_end(self, trainer):
if self.watch_monitor is not None:
if self.monitor is not None:
return
if callable(self.evaluate_every):
if self.evaluate_every(trainer):


+ 1
- 1
fastNLP/core/collators/padders/paddle_padder.py View File

@@ -56,7 +56,7 @@ def is_paddle_dtype_str(dtype):


def _get_dtype(ele_dtype, dtype, class_name):
if not (ele_dtype is not None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)):
if not (ele_dtype is None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)):
raise EleDtypeUnsupportedError(f"`{class_name}` only supports padding python numbers "
f"or numpy numbers or paddle.Tensor but get `{ele_dtype}`.")



+ 1
- 3
fastNLP/core/controllers/trainer.py View File

@@ -117,6 +117,7 @@ class Trainer(TrainerEventTrigger):
:param monitor: 当存在 evaluate_dataloaders 时,默认的 monitor metric 的名字。传入的 callback 如果有 monitor 参数且没有
在 callback 初始化设定的,将采取这个值。如果在 evaluation 结果中没有找到完全一致的名称,将使用 最短公共字符串算法 找到最匹配
的那个作为 monitor 。也可以传入一个函数,接受参数为 evaluation 的结果(字典类型),返回一个 float 值作为 monitor 的结果。
如果 evaluate_dataloaders 与 metrics 没有提供,该参数无意义。
:param larger_better: monitor 的值是否是越大越好。
:param marker: 用于标记一个 Trainer 实例,从而在用户调用 `Trainer.on` 函数时,标记该 callback 函数属于哪一个具体的 'trainer' 实例;默认为 None;
:param kwargs: 一些其它的可能需要的参数;
@@ -231,7 +232,6 @@ class Trainer(TrainerEventTrigger):
total_batches=None
)

""" 设置内部的 Evaluator """
if metrics is None and evaluate_dataloaders is not None:
raise ValueError("You have set 'evaluate_dataloaders' but forget to set 'metrics'.")

@@ -760,8 +760,6 @@ class Trainer(TrainerEventTrigger):
self.on_before_backward(outputs)
loss = self.extract_loss_from_outputs(outputs)
loss = loss / self.accumulation_steps
# with self.get_no_sync_context():
# self.driver.backward(loss)
self.driver.backward(loss)
self.on_after_backward()



+ 5
- 1
fastNLP/core/dataloaders/paddle_dataloader/fdl.py View File

@@ -8,11 +8,12 @@ from typing import Callable, List, Optional, Union, Dict, Sequence
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE

if _NEED_IMPORT_PADDLE:
from paddle.io import DataLoader, Dataset
from paddle.io import DataLoader, Dataset, Sampler
from paddle.fluid.dataloader.collate import default_collate_fn
else:
from fastNLP.core.utils.dummy_class import DummyClass as Dataset
from fastNLP.core.utils.dummy_class import DummyClass as DataLoader
from fastNLP.core.utils.dummy_class import DummyClass as Sampler

from fastNLP.core.collators.collator import Collator
from fastNLP.core.dataloaders.utils import indice_collate_wrapper
@@ -58,6 +59,9 @@ class PaddleDataLoader(DataLoader):
if batch_sampler is None:
batch_sampler = RandomBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle,
drop_last=drop_last)
batch_size = 1
shuffle = False
drop_last = False

super(PaddleDataLoader, self).__init__(dataset=dataset, feed_list=feed_list, places=places,
return_list=return_list, batch_sampler=batch_sampler,


+ 2
- 2
fastNLP/core/dataloaders/torch_dataloader/fdl.py View File

@@ -165,8 +165,8 @@ class TorchDataLoader(DataLoader):


def prepare_torch_dataloader(ds_or_db: Union[DataSet, DataBundle, Sequence[DataSet], Mapping[str, DataSet]],
batch_size: int = 1,
shuffle: bool = False, sampler: Union["Sampler[int]", ReproducibleSampler, UnrepeatedSampler] = None,
batch_size: int = 16,
shuffle: bool = True, sampler: Union["Sampler[int]", ReproducibleSampler, UnrepeatedSampler] = None,
batch_sampler: Union["Sampler[Sequence[int]]", ReproducibleBatchSampler] = None,
num_workers: int = 0, collate_fn: Union[str, Callable, None] = None,
pin_memory: bool = False, drop_last: bool = False,


+ 5
- 12
fastNLP/core/drivers/paddle_driver/fleet.py View File

@@ -1,12 +1,12 @@
import os
import shutil
from typing import List, Union, Optional, Dict, Tuple, Callable

from fastNLP.core.utils.paddle_utils import get_device_from_visible

from .paddle_driver import PaddleDriver
from .fleet_launcher import FleetLauncher
from .utils import (
_FleetWrappingModel,
get_device_from_visible,
reset_seed,
replace_sampler,
replace_batch_sampler,
@@ -17,8 +17,8 @@ from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.utils import (
auto_param_call,
check_user_specific_params,
paddle_move_data_to_device,
is_in_paddle_dist
is_in_paddle_dist,
is_in_paddle_dist,
)
from fastNLP.envs.distributed import rank_zero_rm
from fastNLP.core.samplers import (
@@ -609,12 +609,6 @@ class PaddleFleetDriver(PaddleDriver):
def is_distributed(self):
return True

def move_data_to_device(self, batch: 'paddle.Tensor'):
    """
    Move ``batch`` onto this driver's data device.

    :param batch: the data to move
    :return: the result of ``paddle_move_data_to_device`` for ``batch``
    """
    # CUDA_VISIBLE_DEVICES has been set, so using the raw device could cause
    # errors; translate it with get_device_from_visible first.
    visible_device = get_device_from_visible(self.data_device)
    return paddle_move_data_to_device(batch, visible_device)

@staticmethod
def _check_optimizer_legality(optimizers):
# paddle 存在设置分布式 optimizers 的函数,返回值为 fleet.meta_optimizers.HybridParallelOptimizer
@@ -637,9 +631,8 @@ class PaddleFleetDriver(PaddleDriver):
:return: 如果当前不是分布式 driver 直接返回输入的 obj 。如果当前 rank 是接收端(其 global rank 包含在了 dst 中),则返回
接收到的参数;如果是 source 端则返回发射的内容;既不是发送端、又不是接收端,则返回 None 。
"""
device = self.data_device
# 因为设置了CUDA_VISIBLE_DEVICES,可能会引起错误
device = get_device_from_visible(device)
device = get_device_from_visible(self.data_device)
return fastnlp_paddle_broadcast_object(obj, src, device=device, group=group)

def all_gather(self, obj, group=None) -> List:


+ 0
- 1
fastNLP/core/drivers/paddle_driver/fleet_launcher.py View File

@@ -10,7 +10,6 @@ from fastNLP.envs.env import (
FASTNLP_DISTRIBUTED_CHECK,
FASTNLP_LOG_LEVEL,
FASTNLP_GLOBAL_SEED,
USER_CUDA_VISIBLE_DEVICES,
)
from .utils import (
find_free_ports,


+ 2
- 1
fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py View File

@@ -42,7 +42,8 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[

user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
if user_visible_devices is None:
raise RuntimeError("This situation cannot happen, please report a bug to us.")
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
_could_use_device_num = len(user_visible_devices.split(","))
if isinstance(device, int):
if device < 0 and device != -1:


+ 3
- 2
fastNLP/core/drivers/paddle_driver/paddle_driver.py View File

@@ -10,7 +10,7 @@ import numpy as np
from .utils import _build_fp16_env, optimizer_state_to_device, DummyGradScaler
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.drivers.driver import Driver
from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device
from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device, get_device_from_visible
from fastNLP.envs import (
FASTNLP_SEED_WORKERS,
FASTNLP_MODEL_FILENAME,
@@ -394,7 +394,8 @@ class PaddleDriver(Driver):

:return: 将移动到指定机器上的 batch 对象返回;
"""
return paddle_move_data_to_device(batch, self.data_device)
device = get_device_from_visible(self.data_device)
return paddle_move_data_to_device(batch, device)

@staticmethod
def worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover


+ 7
- 15
fastNLP/core/drivers/paddle_driver/single_device.py View File

@@ -2,14 +2,14 @@ import os
from typing import Optional, Dict, Union, Callable, Tuple

from .paddle_driver import PaddleDriver
from .utils import replace_batch_sampler, replace_sampler, get_device_from_visible
from .utils import replace_batch_sampler, replace_sampler
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
from fastNLP.core.utils import (
auto_param_call,
get_device_from_visible,
get_paddle_gpu_str,
get_paddle_device_id,
paddle_move_data_to_device,
)
from fastNLP.core.utils.utils import _get_fun_msg
from fastNLP.core.samplers import (
@@ -39,6 +39,9 @@ class PaddleSingleDriver(PaddleDriver):
raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`")

cuda_visible_devices = os.environ.get(USER_CUDA_VISIBLE_DEVICES, None)
if cuda_visible_devices is None:
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
if cuda_visible_devices == "":
device = "cpu"
logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
@@ -54,7 +57,7 @@ class PaddleSingleDriver(PaddleDriver):
device_id = device
else:
device_id = get_paddle_device_id(device)
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ[USER_CUDA_VISIBLE_DEVICES].split(",")[device_id]
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices.split(",")[device_id]
self.model_device = get_paddle_gpu_str(device)

self.local_rank = 0
@@ -65,8 +68,7 @@ class PaddleSingleDriver(PaddleDriver):
r"""
该函数用来初始化训练环境,用于设置当前训练的设备,并将模型迁移到对应设备上。
"""
device = self.model_device
device = get_device_from_visible(device, output_type=str)
device = get_device_from_visible(self.model_device, output_type=str)
paddle.device.set_device(device)
self.model.to(device)

@@ -121,16 +123,6 @@ class PaddleSingleDriver(PaddleDriver):
else:
raise RuntimeError(f"There is no `{fn}` method in your {type(self.model)}.")

def move_data_to_device(self, batch: 'paddle.Tensor'):
    r"""
    Move data onto the designated device; ``batch`` may be a list, a dict, or
    a nesting of both.
    Note that in Paddle this may cause problems when the target differs from
    the device that has been configured.

    :return: the ``batch`` object after it has been moved to the device;
    """
    return paddle_move_data_to_device(batch, get_device_from_visible(self.data_device))

def set_dist_repro_dataloader(self, dataloader, dist: Union[str, ReproducibleBatchSampler, ReproducibleSampler]=None,
reproducible: bool = False):
r"""


+ 3
- 38
fastNLP/core/drivers/paddle_driver/utils.py View File

@@ -6,12 +6,11 @@ import inspect
import numpy as np
from copy import deepcopy
from contextlib import ExitStack, closing
from enum import IntEnum
from typing import Dict, Optional, Union
from typing import Dict, Optional

from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.utils import get_paddle_device_id, auto_param_call, paddle_to
from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS, USER_CUDA_VISIBLE_DEVICES
from fastNLP.core.utils import auto_param_call, paddle_to
from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS
from fastNLP.core.log import logger


@@ -173,40 +172,6 @@ def find_free_ports(num):

return None

def get_device_from_visible(device: Union[str, int], output_type=int):
    """
    Resolve a device to its position inside ``CUDA_VISIBLE_DEVICES``.

    ``device`` is first turned into an index by ``get_paddle_device_id``; the
    entry at that index of ``USER_CUDA_VISIBLE_DEVICES`` is then searched for
    in ``CUDA_VISIBLE_DEVICES`` and its position there is the result.

    :param device: the device before conversion; ``"cpu"`` is returned as is
    :param output_type: type of the return value, either ``int`` or ``str``
    :return: the converted device id, or ``"gpu:<id>"`` when ``output_type``
        is ``str``
    """
    if output_type not in [int, str]:
        raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
    if device == "cpu":
        return device

    visible = os.getenv("CUDA_VISIBLE_DEVICES")
    user_position = get_paddle_device_id(device)
    if not visible:
        # Normally unreachable: fastNLP forcibly injects CUDA_VISIBLE_DEVICES
        # for paddle.
        raise RuntimeError("This situation should not happen, please report us this bug.")

    # Use USER_CUDA_VISIBLE_DEVICES to find the device the user asked for.
    user_visible = os.getenv(USER_CUDA_VISIBLE_DEVICES)
    if user_visible is None:
        raise RuntimeError("This situation cannot happen, please report a bug to us.")
    wanted = user_visible.split(",")[user_position]

    visible_ids = visible.split(',')
    if wanted not in visible_ids:
        raise ValueError(f"Can't find your devices {wanted} in CUDA_VISIBLE_DEVICES[{visible}].")
    position = visible_ids.index(wanted)
    return position if output_type == int else f"gpu:{position}"

def replace_batch_sampler(dataloader: "DataLoader", batch_sampler: "BatchSampler"):
"""
利用 `batch_sampler` 重新构建一个 DataLoader,起到替换 `batch_sampler` 又不影响原 `dataloader` 的作用。


+ 2
- 4
fastNLP/core/metrics/backend/paddle_backend/backend.py View File

@@ -1,11 +1,10 @@
from typing import List, Optional, Any
from typing import List, Any

import numpy as np

from fastNLP.core.metrics.backend import Backend
from fastNLP.core.utils.paddle_utils import paddle_to
from fastNLP.core.utils.paddle_utils import paddle_to, get_device_from_visible
from fastNLP.core.metrics.utils import AggregateMethodError
from fastNLP.core.drivers.paddle_driver.utils import get_device_from_visible
from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE

@@ -80,7 +79,6 @@ class PaddleBackend(Backend):
raise ValueError(f"tensor: {tensor} can not convert to ndarray!")

def move_tensor_to_device(self, tensor, device):
    """
    Move ``tensor`` to ``device`` after translating the device with
    ``get_device_from_visible``.

    :param tensor: the tensor to move
    :param device: the target device before translation
    :return: the result of ``paddle_to`` for ``tensor``
    """
    # TODO: if the translation is handled here, could it cause bugs elsewhere?
    visible_device = get_device_from_visible(device)
    return paddle_to(tensor, visible_device)



+ 2
- 1
fastNLP/core/utils/__init__.py View File

@@ -2,6 +2,7 @@ __all__ = [
'cache_results',
'is_jittor_dataset',
'jittor_collate_wraps',
'get_device_from_visible',
'paddle_to',
'paddle_move_data_to_device',
'get_paddle_device_id',
@@ -27,7 +28,7 @@ __all__ = [

from .cache_results import cache_results
from .jittor_utils import is_jittor_dataset, jittor_collate_wraps
from .paddle_utils import paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
from .paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
is_in_fnlp_paddle_dist, is_in_paddle_launch_dist
from .rich_progress import f_rich_progress
from .torch_paddle_utils import torch_paddle_move_data_to_device


+ 61
- 35
fastNLP/core/utils/cache_results.py View File

@@ -3,6 +3,7 @@ import hashlib
import _pickle
import functools
import os
import re
from typing import Callable, List, Any, Optional
import inspect
import ast
@@ -126,7 +127,10 @@ def _get_func_and_its_called_func_source_code(func) -> List[str]:
# some failure
pass
del last_frame #
sources.append(inspect.getsource(func))
func_source_code = inspect.getsource(func) # 将这个函数中的 cache_results 装饰删除掉。
for match in list(re.finditer('@cache_results\(.*\)\\n', func_source_code))[::-1]:
func_source_code = func_source_code[:match.start()] + func_source_code[match.end():]
sources.append(func_source_code)
return sources


@@ -163,11 +167,12 @@ def cal_fn_hash_code(fn: Optional[Callable] = None, fn_kwargs: Optional[dict] =
if fn_kwargs is None:
fn_kwargs = {}
hasher = Hasher()
try:
sources = _get_func_and_its_called_func_source_code(fn)
hasher.update(sources)
except:
return "can't be hashed"
if fn is not None:
try:
sources = _get_func_and_its_called_func_source_code(fn)
hasher.update(sources)
except:
return "can't be hashed"
for key in sorted(fn_kwargs):
hasher.update(key)
try:
@@ -177,7 +182,7 @@ def cal_fn_hash_code(fn: Optional[Callable] = None, fn_kwargs: Optional[dict] =
return hasher.hexdigest()


def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
def cache_results(_cache_fp, _hash_param=True, _refresh=False, _verbose=1, _check_hash=True):
r"""
cache_results是fastNLP中用于cache数据的装饰器。通过下面的例子看一下如何使用::

@@ -186,9 +191,9 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
from fastNLP import cache_results

@cache_results('cache.pkl')
def process_data():
def process_data(second=1):
# 一些比较耗时的工作,比如读取数据,预处理数据等,这里用time.sleep()代替耗时
time.sleep(1)
time.sleep(second)
return np.random.randint(10, size=(5,))

start_time = time.time()
@@ -199,49 +204,49 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
print("res =",process_data())
print(time.time() - start_time)

# 输出内容如下,可以看到两次结果相同,且第二次几乎没有花费时间
# Save cache to cache.pkl.
start_time = time.time()
print("res =",process_data(second=2))
print(time.time() - start_time)

# 输出内容如下,可以看到前两次结果相同,且第二次几乎没有花费时间。第三次由于参数变化了,所以cache的结果也就自然变化了。
# Save cache to 2d145aeb_cache.pkl.
# res = [5 4 9 1 8]
# 1.0042750835418701
# Read cache from cache.pkl.
# 1.0134737491607666
# Read cache from 2d145aeb_cache.pkl (Saved on xxxx).
# res = [5 4 9 1 8]
# 0.0040721893310546875
# Save cache to 0ead3093_cache.pkl.
# res = [1 8 2 5 1]
# 2.0086121559143066

可以看到第二次运行的时候,只用了0.0001s左右,是由于第二次运行将直接从cache.pkl这个文件读取数据,而不会经过再次预处理::

# 还是以上面的例子为例,如果需要重新生成另一个cache,比如另一个数据集的内容,通过如下的方式调用即可
process_data(_cache_fp='cache2.pkl') # 完全不影响之前的‘cache.pkl'

上面的_cache_fp是cache_results会识别的参数,它将从'cache2.pkl'这里缓存/读取数据,即这里的'cache2.pkl'覆盖默认的
'cache.pkl'。如果在你的函数前面加上了@cache_results()则你的函数会增加三个参数[_cache_fp, _refresh, _verbose]。
上面的例子即为使用_cache_fp的情况,这三个参数不会传入到你的函数中,当然你写的函数参数名也不可能包含这三个名称::

process_data(_cache_fp='cache2.pkl', _refresh=True) # 这里强制重新生成一份对预处理的cache。
# _verbose是用于控制输出信息的,如果为0,则不输出任何内容;如果为1,则会提醒当前步骤是读取的cache还是生成了新的cache
可以看到第二次运行的时候,只用了0.0001s左右,是由于第二次运行将直接从cache.pkl这个文件读取数据,而不会经过再次预处理。
如果在函数加上了装饰器@cache_results(),则函数会增加五个参数[_cache_fp, _hash_param, _refresh, _verbose,
_check_hash]。上面的例子即为使用_cache_fp的情况,这五个参数不会传入到被装饰函数中,当然被装饰函数参数名也不能包含这五个名称::

:param str _cache_fp: 将返回结果缓存到什么位置;或从什么位置读取缓存。如果为None,cache_results没有任何效用,除非在
函数调用的时候传入_cache_fp这个参数。
:param bool _refresh: 是否重新生成cache。
函数调用的时候传入 _cache_fp 这个参数。保存文件的名称会受到 _hash_param 参数的影响(见下)。
:param bool _hash_param: 是否将传入给被装饰函数的 parameter 进行 str 之后的 hash 结果加入到 _cache_fp 中,这样每次函数的
parameter 改变的时候,cache 文件就自动改变了。
:param bool _refresh: 强制重新生成新的 cache 。
:param int _verbose: 是否打印cache的信息。
:param bool _check_hash: 如果为 True 将尝试对比修饰的函数的源码以及该函数内部调用的函数的源码的hash值。如果发现保存时的hash值
与当前的hash值有差异,会报warning。但该warning可能出现实质上并不影响结果的误报(例如增删空白行);且在修改不涉及源码时,虽然
该修改对结果有影响,但无法做出warning。

:return:
"""

def wrapper_(func):
signature = inspect.signature(func)
for key, _ in signature.parameters.items():
if key in ('_cache_fp', '_refresh', '_verbose', '_check_hash'):
if key in ('_cache_fp', "_hash_param", '_refresh', '_verbose', '_check_hash'):
raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))

@functools.wraps(func)
def wrapper(*args, **kwargs):
fn_param = kwargs.copy()
if args:
params = [p.name for p in inspect.signature(func).parameters.values()]
fn_param.update(zip(params, args))
# fn_param = kwargs.copy()
# if args:
# params = [p.name for p in inspect.signature(func).parameters.values()]
# fn_param.update(zip(params, args))
if '_cache_fp' in kwargs:
cache_filepath = kwargs.pop('_cache_fp')
assert isinstance(cache_filepath, str), "_cache_fp can only be str."
@@ -263,10 +268,31 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
else:
check_hash = _check_hash

if '_hash_param' in kwargs:
hash_param = kwargs.pop('_hash_param')
assert isinstance(hash_param, bool), "_hash_param can only be bool."
else:
hash_param = _hash_param

if hash_param and cache_filepath is not None: # 尝试将parameter给hash一下
try:
params = dict(inspect.getcallargs(func, *args, **kwargs))
if inspect.ismethod(func): # 如果是 method 的话第一个参数(一般就是 self )就不考虑了
first_key = next(iter(params.items()))
params.pop(first_key)
if len(params):
# sort 一下防止顺序改变
params = {k: str(v) for k, v in sorted(params.items(), key=lambda item: item[0])}
param_hash = cal_fn_hash_code(None, params)[:8]
head, tail = os.path.split(cache_filepath)
cache_filepath = os.path.join(head, param_hash + '_' + tail)
except BaseException as e:
logger.debug(f"Fail to add parameter hash to cache path, because of Exception:{e}")

refresh_flag = True
new_hash_code = None
if check_hash:
new_hash_code = cal_fn_hash_code(func, fn_param)
new_hash_code = cal_fn_hash_code(func, None)

if cache_filepath is not None and refresh is False:
# load data
@@ -281,13 +307,13 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
logger.info("Read cache from {} (Saved on {}).".format(cache_filepath, save_time))
if check_hash and old_hash_code != new_hash_code:
logger.warning(f"The function `{func.__name__}` is different from its last cache (Save on {save_time}). The "
f"difference may caused by the sourcecode change of the functions by this function.",
f"difference may caused by the sourcecode change.",
extra={'highlighter': ColorHighlighter('red')})
refresh_flag = False

if refresh_flag:
if new_hash_code is None:
new_hash_code = cal_fn_hash_code(func, fn_param)
new_hash_code = cal_fn_hash_code(func, None)
results = func(*args, **kwargs)
if cache_filepath is not None:
if results is None:


+ 35
- 1
fastNLP/core/utils/paddle_utils.py View File

@@ -1,4 +1,5 @@
__all__ = [
"get_device_from_visible",
"paddle_to",
"paddle_move_data_to_device",
"get_paddle_gpu_str",
@@ -13,13 +14,45 @@ import re
from typing import Any, Optional, Union

from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH, USER_CUDA_VISIBLE_DEVICES

if _NEED_IMPORT_PADDLE:
import paddle

from .utils import apply_to_collection

def get_device_from_visible(device: Union[str, int], output_type=int):
    """
    Resolve a device to its position inside ``CUDA_VISIBLE_DEVICES``.

    ``device`` is turned into an index by ``get_paddle_device_id``; the entry
    at that index of ``USER_CUDA_VISIBLE_DEVICES`` is then searched for in the
    current ``CUDA_VISIBLE_DEVICES`` and its position there is the result.
    For example, with ``USER_CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7``,
    ``CUDA_VISIBLE_DEVICES=3,4,5`` and ``device="gpu:4"`` the result is ``1``.

    :param device: the device before conversion; ``"cpu"`` is returned as is
    :param output_type: type of the return value, either ``int`` or ``str``
    :return: the converted device id, or ``"gpu:<id>"`` when ``output_type``
        is ``str``
    :raises ValueError: if ``output_type`` is neither ``int`` nor ``str``, or
        the requested device is not listed in ``CUDA_VISIBLE_DEVICES``
    :raises RuntimeError: if ``USER_CUDA_VISIBLE_DEVICES`` or
        ``CUDA_VISIBLE_DEVICES`` is not set
    """
    if output_type not in [int, str]:
        raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
    if device == "cpu":
        return device

    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
    if user_visible_devices is None:
        raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
                           "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")

    # Use USER_CUDA_VISIBLE_DEVICES to find the device the user asked for.
    user_position = get_paddle_device_id(device)
    wanted = user_visible_devices.split(",")[user_position]

    # os.getenv returns None for an unset variable; fail with a clear message
    # instead of an AttributeError from `.split` below.
    if cuda_visible_devices is None:
        raise RuntimeError("`CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
                           "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")

    cuda_visible_devices_list = cuda_visible_devices.split(',')
    if wanted not in cuda_visible_devices_list:
        raise ValueError(f"Can't find your devices {wanted} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}]. ")
    res = cuda_visible_devices_list.index(wanted)
    if output_type == int:
        return res
    return f"gpu:{res}"

def paddle_to(data, device: Union[str, int]):
"""
@@ -33,6 +66,7 @@ def paddle_to(data, device: Union[str, int]):
if device == "cpu":
return data.cpu()
else:
# device = get_device_from_visible(device, output_type=int)
return data.cuda(get_paddle_device_id(device))




+ 2
- 2
tests/core/collators/padders/test_get_padder.py View File

@@ -14,10 +14,10 @@ def test_get_element_shape_dtype():
catalog = _get_element_shape_dtype([np.zeros(3), np.zeros((2, 1))])


@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
# @pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'paddle'])
@pytest.mark.torch
@pytest.mark.paddle
@pytest.mark.jittor
def test_get_padder_run(backend):
if not _NEED_IMPORT_TORCH and backend == 'torch':
pytest.skip("No torch")


+ 1
- 1
tests/core/controllers/_test_trainer_fleet.py View File

@@ -1,7 +1,7 @@
"""
这个文件测试用户以python -m paddle.distributed.launch 启动的情况
看看有没有用pytest执行的机会
python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py
"""
import os
import sys


+ 2
- 2
tests/core/controllers/_test_trainer_fleet_outside.py View File

@@ -1,7 +1,7 @@
"""
这个文件测试用户以python -m paddle.distributed.launch 启动的情况
并且自己初始化了 fleet
python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py
"""
import os
import sys
@@ -93,5 +93,5 @@ if __name__ == "__main__":
driver=driver,
device=device,
callbacks=callbacks,
n_epochs=30,
n_epochs=5,
)

+ 1
- 1
tests/core/controllers/test_trainer_paddle.py View File

@@ -27,7 +27,7 @@ class TrainPaddleConfig:
@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])])
# @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])])
@pytest.mark.parametrize("callbacks", [[RichCallback(5)]])
@pytest.mark.paddle
@pytest.mark.paddledist
@magic_argv_env_context
def test_trainer_paddle(
driver,


+ 0
- 8
tests/core/dataloaders/paddle_dataloader/test_fdl.py View File

@@ -58,11 +58,3 @@ class TestPaddle:
for batch in fdl1:
assert batch['image'].shape == [4, 10, 5]
print(batch)

def test_v2(self):
    """Run a ``Collator`` with ``backend="jittor"`` over two paddle tensors and print the result."""
    from fastNLP.core.collators import Collator
    logger.setLevel("DEBUG")
    tensors = [paddle.Tensor(np.random.random((10, 5)).astype('float32')) for _ in range(2)]
    collator = Collator(backend="jittor")
    print(collator(tensors))

+ 0
- 18
tests/core/dataset/test_dataset.py View File

@@ -370,29 +370,11 @@ class TestDataSetMethods:
assert os.path.exists("1.csv") == True
os.remove("1.csv")

def test_add_collate_fn(self):
    """Check that ``DataSet.add_collate_fn`` accepts a plain function."""

    def collate_fn(item):
        # Identity collate function: hands the item back untouched.
        return item

    dataset = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]})
    dataset.add_collate_fn(collate_fn)

def test_get_collator(self):
    """Check that ``DataSet.get_collator`` returns a callable."""
    from typing import Callable
    collator = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]}).get_collator()
    assert isinstance(collator, Callable)

def test_add_seq_len(self):
    """Call ``add_seq_len`` on field ``x`` and print the resulting dataset."""
    dataset = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]})
    dataset.add_seq_len('x')
    print(dataset)

def test_set_target(self):
    """Check that ``set_target`` can be called on field ``x``."""
    dataset = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]})
    dataset.set_target('x')


class TestFieldArrayInit:
"""


+ 3
- 4
tests/core/drivers/paddle_driver/test_dist_utils.py View File

@@ -19,7 +19,7 @@ if _NEED_IMPORT_PADDLE:
import paddle
import paddle.distributed as dist

@pytest.mark.paddle
@pytest.mark.paddledist
class TestDistUtilsTools:
"""
测试一些工具函数
@@ -79,14 +79,13 @@ class TestDistUtilsTools:
assert res["int"] == paddle_dict["int"]
assert res["string"] == paddle_dict["string"]


@pytest.mark.paddle
@pytest.mark.paddledist
class TestAllGatherAndBroadCast:

@classmethod
def setup_class(cls):
devices = [0,1,2]
output_from_new_proc = "only_error"
output_from_new_proc = "all"

launcher = FleetLauncher(devices=devices, output_from_new_proc=output_from_new_proc)
cls.local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", "0"))


+ 3
- 3
tests/core/drivers/paddle_driver/test_fleet.py View File

@@ -39,7 +39,7 @@ def generate_driver(num_labels, feature_dimension, device=[0,1], fp16=False, out
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestFleetDriverFunction:
"""
测试 PaddleFleetDriver 一些简单函数的测试类,基本都是测试能否运行、是否存在 import 错误等问题
@@ -147,7 +147,7 @@ class TestFleetDriverFunction:
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestSetDistReproDataloader:

@classmethod
@@ -521,7 +521,7 @@ class TestSetDistReproDataloader:
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestSaveLoad:
"""
测试多卡情况下 save 和 load 相关函数的表现


+ 4
- 9
tests/core/drivers/paddle_driver/test_single_device.py View File

@@ -552,22 +552,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"):

return driver

@pytest.fixture
def prepare_test_save_load():
    """Fixture yielding ``(driver1, driver2, dataloader)`` for the save/load tests.

    The dataloader wraps a ``PaddleRandomMaxDataset(40, 10)`` with a batch size of 4, and
    both drivers come from ``generate_random_driver(10, 10)``.
    """
    loader = DataLoader(PaddleRandomMaxDataset(40, 10), batch_size=4)
    # Build the drivers after the dataloader and in this order, exactly as before.
    first_driver = generate_random_driver(10, 10)
    second_driver = generate_random_driver(10, 10)
    return first_driver, second_driver, loader

@pytest.mark.paddle
@pytest.mark.parametrize("only_state_dict", ([True, False]))
def test_save_and_load_model(prepare_test_save_load, only_state_dict):
def test_save_and_load_model(only_state_dict):
"""
测试 save_model 和 load_model 函数
"""
try:
path = "model"
driver1, driver2, dataloader = prepare_test_save_load
dataset = PaddleRandomMaxDataset(40, 10)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10, device="gpu"), generate_random_driver(10, 10, device="gpu")

if only_state_dict:
driver1.save_model(path, only_state_dict)


+ 0
- 20
tests/core/drivers/paddle_driver/test_utils.py View File

@@ -1,8 +1,6 @@
import os
import pytest

from fastNLP.core.drivers.paddle_driver.utils import (
get_device_from_visible,
replace_batch_sampler,
replace_sampler,
)
@@ -14,24 +12,6 @@ if _NEED_IMPORT_PADDLE:

from tests.helpers.datasets.paddle_data import PaddleNormalDataset

@pytest.mark.parametrize(
    ("user_visible_devices, cuda_visible_devices, device, output_type, correct"),
    (
        ("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"),
        ("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"),
        ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1),
        ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"),
        ("3,4,5,6", "3,5", 0, int, 0),
        ("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"),
    )
)
@pytest.mark.paddle
def test_get_device_from_visible_str(user_visible_devices, cuda_visible_devices, device, output_type, correct):
    """Check ``get_device_from_visible`` against the expected value for each parametrized case.

    ``CUDA_VISIBLE_DEVICES`` and ``USER_CUDA_VISIBLE_DEVICES`` are overridden for the duration
    of the call and always put back afterwards, so the overrides cannot leak into other tests.
    """
    env_names = ("CUDA_VISIBLE_DEVICES", "USER_CUDA_VISIBLE_DEVICES")
    saved = {name: os.environ.get(name) for name in env_names}
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
    os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
    try:
        res = get_device_from_visible(device, output_type)
        assert res == correct
    finally:
        # Restore the environment even when the assertion fails.
        for name, value in saved.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value

@pytest.mark.paddle
def test_replace_batch_sampler():
dataset = PaddleNormalDataset(10)


+ 4
- 9
tests/core/drivers/torch_driver/test_single_device.py View File

@@ -545,22 +545,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"):

return driver

@pytest.fixture
def prepare_test_save_load():
    """Fixture yielding ``(driver1, driver2, dataloader)`` for the save/load tests.

    The dataloader wraps a ``TorchArgMaxDataset(10, 40)`` with a batch size of 4, and
    both drivers come from ``generate_random_driver(10, 10)``.
    """
    loader = DataLoader(TorchArgMaxDataset(10, 40), batch_size=4)
    # Build the drivers after the dataloader and in this order, exactly as before.
    first_driver = generate_random_driver(10, 10)
    second_driver = generate_random_driver(10, 10)
    return first_driver, second_driver, loader

@pytest.mark.torch
@pytest.mark.parametrize("only_state_dict", ([True, False]))
def test_save_and_load_model(prepare_test_save_load, only_state_dict):
def test_save_and_load_model(only_state_dict):
"""
测试 save_model 和 load_model 函数
"""
try:
path = "model"
driver1, driver2, dataloader = prepare_test_save_load
dataset = TorchArgMaxDataset(10, 40)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10)

driver1.save_model(path, only_state_dict)
driver2.load_model(path, only_state_dict)


+ 128
- 0
tests/core/utils/test_cache_results.py View File

@@ -246,6 +246,106 @@ class TestCacheResults:
rank_zero_rm('demo.pkl')


def remove_postfix(folder='.', post_fix='.pkl'):
    """Delete every regular file directly inside ``folder`` whose name ends with ``post_fix``.

    Sub-directories are never removed, even if their name matches the suffix.

    :param folder: directory to clean; defaults to the current working directory.
    :param post_fix: file-name suffix selecting the files to delete.
    """
    import os
    for name in os.listdir(folder):
        # ``os.listdir`` yields bare names; join with ``folder`` before the isfile check,
        # otherwise the name is resolved against the current working directory.
        path = os.path.join(folder, name)
        if name.endswith(post_fix) and os.path.isfile(path):
            os.remove(path)


class TestCacheResultsWithParam:
    """Tests for ``cache_results`` driven through its ``_refresh`` / ``_hash_param`` / ``_verbose`` / ``_check_hash`` options."""

    @pytest.mark.parametrize('_refresh', [True, False])
    @pytest.mark.parametrize('_hash_param', [True, False])
    @pytest.mark.parametrize('_verbose', [0, 1])
    @pytest.mark.parametrize('_check_hash', [True, False])
    def test_cache_save(self, _refresh, _hash_param, _verbose, _check_hash):
        """Call the decorated function repeatedly and check, via captured stdout, when it re-executes.

        The decorated function prints "¥" whenever its body actually runs, so the presence or
        absence of that marker in the captured output tells whether the cache was used.
        """
        cache_fp = 'demo.pkl'
        try:
            @cache_results(cache_fp, _refresh=_refresh, _hash_param=_hash_param, _verbose=_verbose,
                           _check_hash=_check_hash)
            def demo(a=1):
                print("¥")
                return 1
            res = demo()

            with Capturing() as output:
                res = demo(a=1)
            if _refresh is False:
                assert '¥' not in output[0]
            # Compare by value: ``is`` against an int literal relies on interpreter caching
            # and raises a SyntaxWarning on Python 3.8+.
            if _verbose == 0:
                assert 'read' not in output[0]

            with Capturing() as output:
                res = demo(1)
            if _refresh is False:
                assert '¥' not in output[0]

            with Capturing() as output:
                res = demo(a=2)
            if _hash_param is True:  # the parameter hash cannot match, so the result must be regenerated
                assert '¥' in output[0]

        finally:
            remove_postfix('.')

    def test_cache_complex_param(self):
        """Check that the same ``*args`` / keyword arguments in a different keyword order still hit the cache."""
        cache_fp = 'demo.pkl'
        try:
            @cache_results(cache_fp, _refresh=False)
            def demo(*args, s=1, **kwargs):
                print("¥")
                return 1

            res = demo(1,2,3, s=4, d=4)
            with Capturing() as output:
                res = demo(1,2,3,d=4, s=4)
            assert '¥' not in output[0]
        finally:
            remove_postfix('.')

    def test_wrapper_change(self):
        """Run this file twice in a subprocess ('wrapper_change' scenario); the second run must read the cache."""
        cache_fp = 'demo.pkl'
        test_type = 'wrapper_change'
        try:
            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 0'
            res = get_subprocess_results(cmd)
            assert "¥" in res
            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 1'
            res = get_subprocess_results(cmd)
            assert "¥" not in res
            assert 'Read' in res
            assert 'different' not in res

        finally:
            remove_postfix('.')

    def test_param_change(self):
        """Run this file twice in a subprocess ('param_change' scenario); the second run must re-execute, not read the cache."""
        cache_fp = 'demo.pkl'
        test_type = 'param_change'
        try:
            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 0'
            res = get_subprocess_results(cmd)
            assert "¥" in res
            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 1'
            res = get_subprocess_results(cmd)
            assert "¥" in res
            assert 'Read' not in res
        finally:
            remove_postfix('.')

    def test_create_cache_dir(self):
        """Check that caching to 'demo/demo.pkl' works and returns the function's result; the 'demo/' directory is removed afterwards."""
        @cache_results('demo/demo.pkl')
        def cache(s):
            return 1, 2

        try:
            results = cache(s=1)
            assert (1, 2) == results
        finally:
            import shutil
            shutil.rmtree('demo/')


if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
@@ -294,3 +394,31 @@ if __name__ == '__main__':

res = demo_func()

if test_type == 'wrapper_change':
if turn == 0:
@cache_results(cache_fp, _refresh=True)
def demo_wrapper_change():
print("¥")
return 1
else:
@cache_results(cache_fp, _refresh=False)
def demo_wrapper_change():
print("¥")
return 1

res = demo_wrapper_change()

if test_type == 'param_change':
if turn == 0:
@cache_results(cache_fp, _refresh=False)
def demo_param_change():
print("¥")
return 1
else:
@cache_results(cache_fp, _refresh=False)
def demo_param_change(a=1):
print("¥")
return 1

res = demo_param_change()


+ 32
- 46
tests/core/utils/test_paddle_utils.py View File

@@ -1,10 +1,40 @@
import os

import pytest

from fastNLP.core.utils.paddle_utils import paddle_to, paddle_move_data_to_device
from fastNLP.core.utils.paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
if _NEED_IMPORT_PADDLE:
import paddle

@pytest.mark.parametrize(
    ("user_visible_devices, cuda_visible_devices, device, output_type, correct"),
    (
        ("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"),
        ("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"),
        ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1),
        ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"),
        ("3,4,5,6", "3,5", 0, int, 0),
        ("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"),
    )
)
@pytest.mark.paddle
def test_get_device_from_visible(user_visible_devices, cuda_visible_devices, device, output_type, correct):
    """Check ``get_device_from_visible`` against the expected value for each parametrized case.

    ``CUDA_VISIBLE_DEVICES`` and ``USER_CUDA_VISIBLE_DEVICES`` are overridden for the duration
    of the call and always put back afterwards, including when the assertion fails.
    """
    _cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    _user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
    os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
    try:
        res = get_device_from_visible(device, output_type)
        assert res == correct
    finally:
        # Restore the environment variables; ``finally`` guarantees this also runs on failure.
        if _cuda_visible_devices is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = _cuda_visible_devices
        if _user_visible_devices is None:
            os.environ.pop("USER_CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["USER_CUDA_VISIBLE_DEVICES"] = _user_visible_devices

############################################################################
#
@@ -22,12 +52,6 @@ class TestPaddleToDevice:
assert res.place.gpu_device_id() == 0
res = paddle_to(tensor, "cpu")
assert res.place.is_cpu_place()
res = paddle_to(tensor, "gpu:2")
assert res.place.is_gpu_place()
assert res.place.gpu_device_id() == 2
res = paddle_to(tensor, "gpu:1")
assert res.place.is_gpu_place()
assert res.place.gpu_device_id() == 1

############################################################################
#
@@ -64,28 +88,18 @@ class TestPaddleMoveDataToDevice:
res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device=None)
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device="gpu:1", data_device=None)
self.check_gpu(res, 1)

res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device="cpu")
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:0")
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:1")
self.check_gpu(res, 1)

def test_list_transfer(self):
"""
测试张量列表的迁移
"""

paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)]
res = paddle_move_data_to_device(paddle_list, device=None, data_device="gpu:1")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

res = paddle_move_data_to_device(paddle_list, device="cpu", data_device="gpu:1")
assert isinstance(res, list)
@@ -97,11 +111,6 @@ class TestPaddleMoveDataToDevice:
for r in res:
self.check_gpu(r, 0)

res = paddle_move_data_to_device(paddle_list, device="gpu:1", data_device="cpu")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

def test_tensor_tuple_transfer(self):
"""
测试张量元组的迁移
@@ -109,10 +118,6 @@ class TestPaddleMoveDataToDevice:

paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)]
paddle_tuple = tuple(paddle_list)
res = paddle_move_data_to_device(paddle_tuple, device=None, data_device="gpu:1")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

res = paddle_move_data_to_device(paddle_tuple, device="cpu", data_device="gpu:1")
assert isinstance(res, tuple)
@@ -124,11 +129,6 @@ class TestPaddleMoveDataToDevice:
for r in res:
self.check_gpu(r, 0)

res = paddle_move_data_to_device(paddle_tuple, device="gpu:1", data_device="cpu")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

def test_dict_transfer(self):
"""
测试字典结构的迁移
@@ -173,20 +173,6 @@ class TestPaddleMoveDataToDevice:
self.check_gpu(t, 0)
self.check_gpu(res["dict"]["tensor"], 0)

res = paddle_move_data_to_device(paddle_dict, device=None, data_device="gpu:1")
assert isinstance(res, dict)
self.check_gpu(res["tensor"], 1)
assert isinstance(res["list"], list)
for t in res["list"]:
self.check_gpu(t, 1)
assert isinstance(res["int"], int)
assert isinstance(res["string"], str)
assert isinstance(res["dict"], dict)
assert isinstance(res["dict"]["list"], list)
for t in res["dict"]["list"]:
self.check_gpu(t, 1)
self.check_gpu(res["dict"]["tensor"], 1)

res = paddle_move_data_to_device(paddle_dict, device="cpu", data_device="gpu:0")
assert isinstance(res, dict)
self.check_cpu(res["tensor"])


+ 1
- 0
tests/pytest.ini View File

@@ -2,5 +2,6 @@
markers =
torch
paddle
paddledist
jittor
torchpaddle

+ 7
- 0
tutorials/data/test4dataset.csv View File

@@ -0,0 +1,7 @@
,SentenceId,Sentence,Sentiment
0,1,"['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']",negative
1,2,"['this', 'quiet', ',', 'introspective', 'and', 'entertaining', 'independent', 'is', 'worth', 'seeking', '.']",positive
2,3,"['even', 'fans', 'of', 'ismail', 'merchant', ""'s"", 'work', ',', 'i', 'suspect', ',', 'would', 'have', 'a', 'hard', 'time', 'sitting', 'through', 'this', 'one', '.']",negative
3,4,"['a', 'positively', 'thrilling', 'combination', 'of', 'ethnography', 'and', 'all', 'the', 'intrigue', ',', 'betrayal', ',', 'deceit', 'and', 'murder', 'of', 'a', 'shakespearean', 'tragedy', 'or', 'a', 'juicy', 'soap', 'opera', '.']",neutral
4,5,"['a', 'comedy-drama', 'of', 'nearly', 'epic', 'proportions', 'rooted', 'in', 'a', 'sincere', 'performance', 'by', 'the', 'title', 'character', 'undergoing', 'midlife', 'crisis', '.']",positive
5,6,"['the', 'importance', 'of', 'being', 'earnest', ',', 'so', 'thick', 'with', 'wit', 'it', 'plays', 'like', 'a', 'reading', 'from', 'bartlett', ""'s"", 'familiar', 'quotations']",neutral

+ 7
- 0
tutorials/data/test4dataset.tsv View File

@@ -0,0 +1,7 @@
SentenceId Sentence Sentiment
1 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . negative
2 This quiet , introspective and entertaining independent is worth seeking . positive
3 Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . negative
4 A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . neutral
5 A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . positive
6 The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations neutral

+ 423
- 29
tutorials/fastnlp_tutorial_1.ipynb View File

@@ -153,7 +153,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1969418794120 1971237588872\n",
"2438703969992 2438374526920\n",
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
@@ -198,7 +198,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1971237588872 1971237588872\n",
"2438374526920 2438374526920\n",
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
@@ -774,9 +774,9 @@
{
"data": {
"text/plain": [
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d08>,\n",
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d88>,\n",
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879e08>}"
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d388>,\n",
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d408>,\n",
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d488>}"
]
},
"execution_count": 15,
@@ -923,7 +923,8 @@
"output_type": "stream",
"text": [
"5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n"
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n",
"6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n"
]
}
],
@@ -931,7 +932,8 @@
"vocab.add_word_lst(['生活', '就像', '海洋'])\n",
"print(len(vocab), vocab.word_count)\n",
"vocab.add_word('只有')\n",
"print(len(vocab), vocab.word_count)"
"print(len(vocab), vocab.word_count)\n",
"print(len(vocab), vocab.word2idx)"
]
},
{
@@ -959,7 +961,6 @@
"<pad> 0\n",
"<unk> 1\n",
"生活 2\n",
"只有 5\n",
"彼岸 1 False\n"
]
}
@@ -968,7 +969,6 @@
"print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
"print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
"print(vocab.to_word(5), vocab.to_index('只有'))\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
]
},
@@ -979,7 +979,9 @@
"source": [
"**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`方法看到相应单词被添加的次数**\n",
"\n",
"&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询"
"&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询\n",
"\n",
"&emsp; 注:**使用`add_word_lst`添加单词**,**单词对应序号不会动态调整**,**使用`dataset`添加单词的情况不同**"
]
},
{
@@ -992,15 +994,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
"13 Counter({'生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '人': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
"彼岸 12 True\n"
"生活 2\n",
"彼岸 12 True\n",
"13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
"13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n"
]
}
],
"source": [
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '才', '能', '到达', '彼岸'])\n",
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n",
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))\n",
"print(len(vocab), vocab.word_count)\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
"print(len(vocab), vocab.word2idx)"
]
},
{
@@ -1082,52 +1088,440 @@
"## 3 dataset 和 vocabulary 的组合使用\n",
" \n",
"### 3.1 从 dataframe 中加载 dataset\n",
"\n"
"\n",
"以下通过 [NLP-beginner](https://github.com/FudanNLP/nlp-beginner) 实践一中 [Rotten Tomatoes 影评数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 的部分训练数据组成`test4dataset.tsv`文件\n",
"\n",
"&emsp; 介绍如何使用`dataset`、`vocabulary`简单加载并处理数据集,首先使用`pandas`模块,读取原始数据的`dataframe`"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "3dbd985d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SentenceId</th>\n",
" <th>Sentence</th>\n",
" <th>Sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>A series of escapades demonstrating the adage ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>This quiet , introspective and entertaining in...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>A positively thrilling combination of ethnogra...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>A comedy-drama of nearly epic proportions root...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>The Importance of Being Earnest , so thick wit...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SentenceId Sentence Sentiment\n",
"0 1 A series of escapades demonstrating the adage ... negative\n",
"1 2 This quiet , introspective and entertaining in... positive\n",
"2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n",
"3 4 A positively thrilling combination of ethnogra... neutral\n",
"4 5 A comedy-drama of nearly epic proportions root... positive\n",
"5 6 The Importance of Being Earnest , so thick wit... neutral"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv('./data/test4dataset.tsv', sep='\\t')\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "89059713",
"id": "919ab350",
"metadata": {},
"source": []
"source": [
"接着,通过`dataset`中的`from_pandas`方法填充数据集,并使用`apply_more`方法对文本进行分词操作"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3dbd985d",
"execution_count": 25,
"id": "4f634586",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | ['a', 'series', 'of', 'es... | negative |\n",
"| 2 | ['this', 'quiet', ',', 'i... | positive |\n",
"| 3 | ['even', 'fans', 'of', 'i... | negative |\n",
"| 4 | ['a', 'positively', 'thri... | neutral |\n",
"| 5 | ['a', 'comedy-drama', 'of... | positive |\n",
"| 6 | ['the', 'importance', 'of... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"from fastNLP.core.dataset import DataSet\n",
"\n",
"dataset = DataSet()\n",
"dataset = dataset.from_pandas(df)\n",
"dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n",
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "5c1ae192",
"metadata": {},
"source": [
"&emsp; 如果需要保存中间结果,也可以使用`dataset`的`to_csv`方法,生成`.csv`或`.tsv`文件"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f634586",
"execution_count": 26,
"id": "46722efc",
"metadata": {},
"outputs": [],
"source": []
"source": [
"dataset.to_csv('./data/test4dataset.csv')"
]
},
{
"cell_type": "markdown",
"id": "5ba13989",
"metadata": {},
"source": [
"### 3.2 从 dataset 中获取 vocabulary"
"### 3.2 从 dataset 中获取 vocabulary\n",
"\n",
"然后,初始化`vocabulary`,使用`vocabulary`中的`from_dataset`方法,从`dataset`的指定字段中\n",
"\n",
"&emsp; 获取字段中的所有元素,然后编号;如果指定字段是个列表,则针对字段中所有列表包含的元素编号\n",
"\n",
"&emsp; 注:**使用`dataset`添加单词**,**不同于`add_word_lst`**,**单词被添加次数越多**,**序号越靠前**,例如案例中的`a`"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"id": "a2de615b",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n",
"\n",
"{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n",
"\n",
"Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n"
]
}
],
"source": [
"from fastNLP.core.vocabulary import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab = vocab.from_dataset(dataset, field_name='Sentence')\n",
"print(vocab.word_count, '\\n')\n",
"print(vocab.word2idx, '\\n')\n",
"print(vocab)"
]
},
{
"cell_type": "markdown",
"id": "f0857ccb",
"metadata": {},
"source": [
"之后,**通过`vocabulary`的`index_dataset`方法**,**调整`dataset`中指定字段的元素**,**使用编号将之代替**\n",
"\n",
"&emsp; 使用上述方法,可以将影评数据集中的单词序列转化为词编号序列,为接下来转化为词嵌入序列做准备"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"id": "2f9a04b2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"vocab.index_dataset(dataset, field_name='Sentence')\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "6b26b707",
"metadata": {},
"source": [
"最后,使用相同方法,再将`dataset`中`Sentiment`字段中的`negative`、`neutral`、`positive`转化为数字编号"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "5f5eed18",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'negative': 0, 'positive': 1, 'neutral': 2}\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
"target_vocab.from_dataset(dataset, field_name='Sentiment')\n",
"print(target_vocab.word2idx)\n",
"target_vocab.index_dataset(dataset, field_name='Sentiment')\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "eed7ea64",
"metadata": {},
"source": [
"在最后的最后,通过以下的一张图,来总结本章关于`dataset`和`vocabulary`主要知识点的讲解,以及两者的联系\n",
"\n",
"<img src=\"./figures/T1-fig-dataset-and-vocabulary.png\" width=\"80%\" height=\"80%\" align=\"center\"></img>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35b4f0f7",
"metadata": {},
"outputs": [],
"source": []
}


+ 41
- 0
tutorials/fastnlp_tutorial_2.ipynb View File

@@ -0,0 +1,41 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

BIN
tutorials/figures/T1-fig-dataset-and-vocabulary.png View File

Before After
Width: 1326  |  Height: 701  |  Size: 139 kB

Loading…
Cancel
Save