diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 589968a7..a7a286d0 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -268,7 +268,7 @@ def _prepare_cache_filepath(filepath): raise RuntimeError("The cache_file_path must be a file, not a directory.") cache_dir = os.path.dirname(_cache_filepath) if not os.path.exists(cache_dir): - os.makedirs(cache_dir) + os.makedirs(cache_dir, exist_ok=True) def cache_results(_cache_fp, _refresh=False, _verbose=1): diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index fdd6f2cd..f3f0e649 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -12,6 +12,7 @@ import warnings from ...core.const import Const from ...core.vocabulary import Vocabulary from ...core._logger import logger +from pkg_resources import parse_version def iob2(tags: List[str]) -> List[str]: @@ -82,7 +83,10 @@ def get_tokenizer(tokenize_method: str, lang='en'): spacy.prefer_gpu() if lang != 'en': raise RuntimeError("Spacy only supports en right right.") - en = spacy.load(lang) + if parse_version(spacy.__version__) >= parse_version('3.0'): + en = spacy.load('en_core_web_sm') + else: + en = spacy.load(lang) tokenizer = lambda x: [w.text for w in en.tokenizer(x)] elif tokenize_method in tokenizer_dict: tokenizer = tokenizer_dict[tokenize_method]