Browse Source

[to #42322933] fix log print and extensions issue for datasets==2.5.2

1. ExternalDataset 的 init 部分中,直接引入 datasets 包自带的 _EXTENSION_TO_MODULE 会有版本兼容性问题:例如 datasets 2.5.2 版本修改了该数据结构,与老版本不兼容;因此改用 modelscope.utils.constant 中自行维护的 EXTENSIONS_TO_LOAD。
2. 对某些 CV 数据集,跳过 logger.error 的打印。
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10893702
master^2
xingjun.wxj yingda.chen 3 years ago
parent
commit
1878500cb4
2 changed files with 22 additions and 6 deletions
  1. +11
    -6
      modelscope/msdatasets/utils/dataset_builder.py
  2. +11
    -0
      modelscope/utils/constant.py

+ 11
- 6
modelscope/msdatasets/utils/dataset_builder.py View File

@@ -9,11 +9,11 @@ import pandas as pd
import pyarrow as pa import pyarrow as pa
from datasets.info import DatasetInfo from datasets.info import DatasetInfo
from datasets.naming import camelcase_to_snakecase from datasets.naming import camelcase_to_snakecase
from datasets.packaged_modules import _EXTENSION_TO_MODULE as exts
from datasets.packaged_modules import csv from datasets.packaged_modules import csv
from datasets.utils.filelock import FileLock from datasets.utils.filelock import FileLock


from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
EXTENSIONS_TO_LOAD, DownloadMode)
from modelscope.utils.logger import get_logger from modelscope.utils.logger import get_logger


logger = get_logger() logger = get_logger()
@@ -198,22 +198,27 @@ class ExternalDataset(object):
self.ext_dataset = None self.ext_dataset = None
self.split_data_files = {k: [] for k, _ in split_path_dict.items()} self.split_data_files = {k: [] for k, _ in split_path_dict.items()}
file_ext = '' file_ext = ''

for split_name, split_dir in split_path_dict.items(): for split_name, split_dir in split_path_dict.items():
if os.path.isdir(split_dir):
if isinstance(split_dir, str) and os.path.isdir(split_dir):
split_file_names = os.listdir(split_dir) split_file_names = os.listdir(split_dir)
set_files_exts = set([ set_files_exts = set([
os.path.splitext(file_name)[-1].strip('.') os.path.splitext(file_name)[-1].strip('.')
for file_name in split_file_names for file_name in split_file_names
]) ])
if '' in set_files_exts:
continue
# ensure these files have same extensions # ensure these files have same extensions
if len(set_files_exts) != 1: if len(set_files_exts) != 1:
supported_exts = ','.join(exts.keys())
supported_exts = ','.join(EXTENSIONS_TO_LOAD.keys())
logger.error( logger.error(
f'Split-{split_name} has been ignored, please flatten your folder structure, ' f'Split-{split_name} has been ignored, please flatten your folder structure, '
f'and make sure these files have same extensions. ' f'and make sure these files have same extensions. '
f'Supported extensions: {supported_exts} .') f'Supported extensions: {supported_exts} .')
continue continue
file_ext = list(set_files_exts)[0] file_ext = list(set_files_exts)[0]
if file_ext not in EXTENSIONS_TO_LOAD:
continue


split_file_paths = [ split_file_paths = [
os.path.join(split_dir, file_name) os.path.join(split_dir, file_name)
@@ -221,8 +226,8 @@ class ExternalDataset(object):
] ]
self.split_data_files[split_name] = split_file_paths self.split_data_files[split_name] = split_file_paths


if file_ext and file_ext in exts:
file_ext = exts.get(file_ext)
if file_ext:
file_ext = EXTENSIONS_TO_LOAD.get(file_ext)
self.ext_dataset = datasets.load_dataset( self.ext_dataset = datasets.load_dataset(
file_ext, data_files=self.split_data_files, **config_kwargs) file_ext, data_files=self.split_data_files, **config_kwargs)




+ 11
- 0
modelscope/utils/constant.py View File

@@ -390,3 +390,14 @@ class Devices:
"""device used for training and inference""" """device used for training and inference"""
cpu = 'cpu' cpu = 'cpu'
gpu = 'gpu' gpu = 'gpu'


# File extensions (without the leading dot) supported for text datasets,
# each mapped to the loader name handed to `datasets.load_dataset`.
# Several extensions may share one loader (e.g. tsv is read by the csv loader).
EXTENSIONS_TO_LOAD = {
    'csv': 'csv',
    'tsv': 'csv',
    'json': 'json',
    'jsonl': 'json',
    'parquet': 'parquet',
    'txt': 'text',
}

Loading…
Cancel
Save