diff --git a/docs/source/conf.py b/docs/source/conf.py index 39e0d881..4371c927 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,7 +25,7 @@ version_file = '../../modelscope/version.py' def get_version(): - with open(version_file, 'r') as f: + with open(version_file, 'r', encoding='utf-8') as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index f2ff822d..17c21d44 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -739,7 +739,7 @@ class ModelScopeConfig: with open( os.path.join(ModelScopeConfig.path_credential, ModelScopeConfig.USER_INFO_FILE_NAME), - 'r') as f: + 'r', encoding='utf-8') as f: info = f.read() return info.split(':')[0], info.split(':')[1] except FileNotFoundError: @@ -760,7 +760,7 @@ class ModelScopeConfig: with open( os.path.join(ModelScopeConfig.path_credential, ModelScopeConfig.GIT_TOKEN_FILE_NAME), - 'r') as f: + 'r', encoding='utf-8') as f: token = f.read() except FileNotFoundError: pass diff --git a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py index cc47d0c4..9378c32a 100644 --- a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py +++ b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py @@ -21,7 +21,7 @@ class KanTtsText2MelDataset(Dataset): self.cache = cache - with open(config_filename) as f: + with open(config_filename, encoding='utf-8') as f: self._config = json.loads(f.read()) # Load metadata: diff --git a/modelscope/models/audio/tts/sambert_hifi.py b/modelscope/models/audio/tts/sambert_hifi.py index a9b55795..9a14219e 100644 --- a/modelscope/models/audio/tts/sambert_hifi.py +++ b/modelscope/models/audio/tts/sambert_hifi.py @@ -60,7 +60,7 @@ class SambertHifigan(Model): with zipfile.ZipFile(zip_file, 'r') as zip_ref: zip_ref.extractall(model_dir) voice_cfg_path = os.path.join(self.__voice_path, 'voices.json') - with open(voice_cfg_path, 'r') as f: + with open(voice_cfg_path, 'r', encoding='utf-8') as f: voice_cfg = json.load(f) if 'voices' not in voice_cfg: raise TtsModelConfigurationException( diff --git a/modelscope/models/cv/tinynas_classfication/plain_net_utils.py b/modelscope/models/cv/tinynas_classfication/plain_net_utils.py index 844535ed..1f5c8852 100644 --- a/modelscope/models/cv/tinynas_classfication/plain_net_utils.py +++ b/modelscope/models/cv/tinynas_classfication/plain_net_utils.py @@ -39,7 +39,7 @@ class PlainNet(nn.Module): plainnet_struct_txt = self.module_opt.plainnet_struct_txt if plainnet_struct_txt is not None: - with open(plainnet_struct_txt, 'r') as fid: + with open(plainnet_struct_txt, 'r', encoding='utf-8') as fid: the_line = fid.readlines()[0].strip() self.plainnet_struct = the_line pass diff --git a/modelscope/models/multi_modal/clip/bert_tokenizer.py b/modelscope/models/multi_modal/clip/bert_tokenizer.py index 8d356f42..1ee715c9 100644 --- a/modelscope/models/multi_modal/clip/bert_tokenizer.py +++ b/modelscope/models/multi_modal/clip/bert_tokenizer.py @@ -120,7 +120,7 @@ def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 - with open(vocab_file, 'r') as reader: + with open(vocab_file, 'r', encoding='utf-8') as reader: while True: token = convert_to_unicode(reader.readline()) if not token: diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index 9b82e4a1..c2d82dca 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -523,8 +523,10 @@ class CLIPForMultiModalEmbedding(TorchModel): logger.info(f'Loading text model config from {text_model_config_file}') assert os.path.exists(text_model_config_file) - with open(vision_model_config_file, - 'r') as fv, open(text_model_config_file, 'r') as ft: + with open( + vision_model_config_file, 'r', + encoding='utf-8') as fv,\ + open(text_model_config_file, 'r', encoding='utf-8') as ft: self.model_info = json.load(fv) for k, v in json.load(ft).items(): self.model_info[k] = v diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py index 4229391f..5150a0c3 100644 --- a/modelscope/models/multi_modal/diffusion/model.py +++ b/modelscope/models/multi_modal/diffusion/model.py @@ -76,7 +76,7 @@ class DiffusionModel(nn.Module): super(DiffusionModel, self).__init__() # including text and generator config model_config = json.load( - open('{}/model_config.json'.format(model_dir))) + open('{}/model_config.json'.format(model_dir), encoding='utf-8')) # text encoder text_config = model_config['text_config'] @@ -142,7 +142,9 @@ class DiffusionForTextToImageSynthesis(Model): # diffusion process diffusion_params = json.load( - open('{}/diffusion_config.json'.format(model_dir))) + open( + '{}/diffusion_config.json'.format(model_dir), + encoding='utf-8')) self.diffusion_generator = make_diffusion( **diffusion_params['generator_config']) self.diffusion_upsampler_256 = make_diffusion( diff --git a/modelscope/models/multi_modal/diffusion/structbert.py b/modelscope/models/multi_modal/diffusion/structbert.py index d5d678ed..16c1407f 100644 --- a/modelscope/models/multi_modal/diffusion/structbert.py +++ b/modelscope/models/multi_modal/diffusion/structbert.py @@ -130,7 +130,7 @@ class BertConfig(object): @classmethod def from_json_file(cls, json_file): """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, 'r') as reader: + with open(json_file, 'r', encoding='utf-8') as reader: text = reader.read() return cls.from_dict(json.loads(text)) diff --git a/modelscope/models/multi_modal/diffusion/tokenizer.py b/modelscope/models/multi_modal/diffusion/tokenizer.py index 82c09661..e2c951b1 100644 --- a/modelscope/models/multi_modal/diffusion/tokenizer.py +++ b/modelscope/models/multi_modal/diffusion/tokenizer.py @@ -67,7 +67,7 @@ def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 - with open(vocab_file, 'r') as reader: + with open(vocab_file, 'r', encoding='utf-8') as reader: while True: token = convert_to_unicode(reader.readline()) if not token: diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py index 806c469c..c77a682a 100644 --- a/modelscope/models/multi_modal/gemm/gemm_base.py +++ b/modelscope/models/multi_modal/gemm/gemm_base.py @@ -522,7 +522,9 @@ class GEMMModel(nn.Module): def __init__(self, model_dir): super().__init__() - with open('{}/encoder_config.json'.format(model_dir), 'r') as f: + with open( + '{}/encoder_config.json'.format(model_dir), 'r', + encoding='utf-8') as f: model_config = json.loads(f.read()) model_name = list(model_config.keys())[0] config_args = model_config[model_name] diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 0cc040c6..813f750e 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -35,7 +35,9 @@ class VideoCLIPForMultiModalEmbedding(TorchModel): def __init__(self, model_dir, **kwargs): super().__init__(model_dir=model_dir, **kwargs) # model config parameters - with open(f'{model_dir}/{ModelFile.CONFIGURATION}', 'r') as json_file: + with open( + f'{model_dir}/{ModelFile.CONFIGURATION}', 'r', + encoding='utf-8') as json_file: model_config = json.load(json_file) model_config = model_config['paras'] model_config['model_dir'] = model_dir diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py index 914678c5..946ebb82 100644 --- a/modelscope/models/multi_modal/mplug/configuration_mplug.py +++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py @@ -111,6 +111,6 @@ class MPlugConfig(PretrainedConfig): @classmethod def from_yaml_file(cls, yaml_file: Union[str, os.PathLike]) -> Dict[str, Any]: - with open(yaml_file, 'r') as reader: + with open(yaml_file, 'r', encoding='utf-8') as reader: config_dict = yaml.load(reader, Loader=yaml.Loader) return cls(**config_dict) diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py index 59bd837d..58fd6698 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/model.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py @@ -50,7 +50,8 @@ class UnCLIP(nn.Module): def __init__(self, model_dir): super(UnCLIP, self).__init__() self.model_dir = model_dir - self.config = json.load(open(f'{model_dir}/{ModelFile.CONFIGURATION}')) + self.config = json.load( + open(f'{model_dir}/{ModelFile.CONFIGURATION}', encoding='utf-8')) # modules self.clip = CLIP(**self.config['clip']).fp16() diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index fc578b25..77dff54a 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -312,7 +312,7 @@ class OfaForAllTasks(TorchModel): if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(self.model_dir, self.cfg.model.answer2label) - with open(ans2label_file, 'r') as reader: + with open(ans2label_file, 'r', encoding='utf-8') as reader: self.ans2label_dict = json.load(reader) def save_pretrained(self, diff --git a/modelscope/models/nlp/mglm/arguments.py b/modelscope/models/nlp/mglm/arguments.py index 13b3aeab..4fa33c65 100755 --- a/modelscope/models/nlp/mglm/arguments.py +++ b/modelscope/models/nlp/mglm/arguments.py @@ -743,7 +743,7 @@ def get_args(): if hasattr(args, 'deepspeed' ) and args.deepspeed and args.deepspeed_config is not None: - with open(args.deepspeed_config) as file: + with open(args.deepspeed_config, encoding='utf-8') as file: deepspeed_config = json.load(file) if 'train_micro_batch_size_per_gpu' in deepspeed_config: args.batch_size = deepspeed_config[ diff --git a/modelscope/models/nlp/mglm/data_utils/corpora.py b/modelscope/models/nlp/mglm/data_utils/corpora.py index 7c6f58f8..cf756c0a 100755 --- a/modelscope/models/nlp/mglm/data_utils/corpora.py +++ b/modelscope/models/nlp/mglm/data_utils/corpora.py @@ -156,7 +156,7 @@ class DataReader: def read_input_to_queue(): for path in paths: print_rank_0(f'Start reading {path}') - with open(path) as file: + with open(path, encoding='utf-8') as file: items = json.load(file) for item in items: task_queue.put(item) diff --git a/modelscope/models/nlp/mglm/data_utils/datasets.py b/modelscope/models/nlp/mglm/data_utils/datasets.py index 777b7d43..39ffaea3 100644 --- a/modelscope/models/nlp/mglm/data_utils/datasets.py +++ b/modelscope/models/nlp/mglm/data_utils/datasets.py @@ -511,12 +511,12 @@ class json_dataset(data.Dataset): def load_json_stream(self, load_path): if not self.loose_json: - jsons = json.load(open(load_path, 'r')) + jsons = json.load(open(load_path, 'r', encoding='utf-8')) generator = iter(jsons) else: def gen_helper(): - with open(load_path, 'r') as f: + with open(load_path, 'r', encoding='utf-8') as f: for row in f: yield json.loads(row) diff --git a/modelscope/models/nlp/mglm/data_utils/extraction.py b/modelscope/models/nlp/mglm/data_utils/extraction.py index 53027e4f..da062f34 100644 --- a/modelscope/models/nlp/mglm/data_utils/extraction.py +++ b/modelscope/models/nlp/mglm/data_utils/extraction.py @@ -29,7 +29,9 @@ with open(output_path, 'w') as output: print(filename) article_lines = [] article_open = False - with open(filename, mode='r', newline='\n') as file: + with open( + filename, mode='r', newline='\n', + encoding='utf-8') as file: for line in file: line = line.rstrip() if ' List[InputExample]: examples = [] - with open(path) as f: + with open(path, encoding='utf-8') as f: reader = csv.reader(f, delimiter=',') for idx, row in enumerate(reader): label, headline, body = row @@ -1209,7 +1209,7 @@ class YelpPolarityProcessor(DataProcessor): def _create_examples(path: str, set_type: str) -> List[InputExample]: examples = [] - with open(path) as f: + with open(path, encoding='utf-8') as f: reader = csv.reader(f, delimiter=',') for idx, row in enumerate(reader): label, body = row @@ -1419,7 +1419,7 @@ class SquadProcessor(DataProcessor): @staticmethod def _create_examples(path: str, set_type: str) -> List[InputExample]: examples = [] - with open(path) as f: + with open(path, encoding='utf-8') as f: data = json.load(f)['data'] for idx, passage in enumerate(data): diff --git a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py index ff394172..e149f503 100644 --- a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py +++ b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py @@ -538,7 +538,7 @@ class PVP(ABC): dict) # type: Dict[int, Dict[str, List[str]]] current_pattern_id = None - with open(path, 'r') as fh: + with open(path, 'r', encoding='utf-8') as fh: for line in fh.read().splitlines(): if line.isdigit(): current_pattern_id = int(line) diff --git a/modelscope/models/nlp/mglm/utils.py b/modelscope/models/nlp/mglm/utils.py index 2bfcf8c0..0e781189 100644 --- a/modelscope/models/nlp/mglm/utils.py +++ b/modelscope/models/nlp/mglm/utils.py @@ -77,7 +77,7 @@ def print_and_save_args(args, verbose=True, log_dir=None): with open(json_file, 'w') as output: json.dump(vars(args), output, sort_keys=True) if args.deepspeed and args.deepspeed_config is not None: - with open(args.deepspeed_config) as file: + with open(args.deepspeed_config, encoding='utf-8') as file: deepspeed_config = json.load(file) deepspeed_json_file = os.path.join(log_dir, 'config_gpt_large.json') @@ -324,7 +324,7 @@ def get_checkpoint_iteration(load_path): print_rank_0(' will not load any checkpoints and will start from ' 'random') return load_path, 0, False, False - with open(tracker_filename, 'r') as f: + with open(tracker_filename, 'r', encoding='utf-8') as f: metastring = f.read().strip() release = metastring == 'release' # try: diff --git a/modelscope/models/science/unifold/data/residue_constants.py b/modelscope/models/science/unifold/data/residue_constants.py index beebfe89..2701ee38 100644 --- a/modelscope/models/science/unifold/data/residue_constants.py +++ b/modelscope/models/science/unifold/data/residue_constants.py @@ -443,7 +443,7 @@ def load_stereo_chemical_props(): stereo_chemical_props_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'stereo_chemical_props.txt') - with open(stereo_chemical_props_path, 'rt') as f: + with open(stereo_chemical_props_path, 'rt', encoding='utf-8') as f: stereo_chemical_props = f.read() lines_iter = iter(stereo_chemical_props.splitlines()) # Load bond lengths. diff --git a/modelscope/models/science/unifold/dataset.py b/modelscope/models/science/unifold/dataset.py index 29e1a8b0..f14c2ef7 100644 --- a/modelscope/models/science/unifold/dataset.py +++ b/modelscope/models/science/unifold/dataset.py @@ -250,7 +250,7 @@ class UnifoldDataset(UnicoreDataset): self.path = data_path def load_json(filename): - return json.load(open(filename, 'r')) + return json.load(open(filename, 'r', encoding='utf-8')) sample_weight = load_json( os.path.join(self.path, @@ -400,7 +400,8 @@ class UnifoldMultimerDataset(UnifoldDataset): self.pdb_assembly = json.load( open( os.path.join(self.data_path, - json_prefix + 'pdb_assembly.json'))) + json_prefix + 'pdb_assembly.json'), + encoding='utf-8')) self.pdb_chains = self.get_chains(self.inverse_multi_label) self.monomer_feature_path = os.path.join(self.data_path, 'pdb_features') diff --git a/modelscope/models/science/unifold/msa/pipeline.py b/modelscope/models/science/unifold/msa/pipeline.py index b7889bff..8037e50e 100644 --- a/modelscope/models/science/unifold/msa/pipeline.py +++ b/modelscope/models/science/unifold/msa/pipeline.py @@ -99,7 +99,7 @@ def run_msa_tool( f.write(result[msa_format]) else: logging.warning('Reading MSA from file %s', msa_out_path) - with open(msa_out_path, 'r') as f: + with open(msa_out_path, 'r', encoding='utf-8') as f: result = {msa_format: f.read()} return result @@ -153,7 +153,7 @@ class DataPipeline: def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: """Runs alignment tools on the input sequence and creates features.""" - with open(input_fasta_path) as f: + with open(input_fasta_path, encoding='utf-8') as f: input_fasta_str = f.read() input_seqs, input_descs = parsers.parse_fasta(input_fasta_str) if len(input_seqs) != 1: diff --git a/modelscope/models/science/unifold/msa/templates.py b/modelscope/models/science/unifold/msa/templates.py index fe3bcef9..d1ff8cf1 100644 --- a/modelscope/models/science/unifold/msa/templates.py +++ b/modelscope/models/science/unifold/msa/templates.py @@ -155,7 +155,7 @@ def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]: """Parses release dates file, returns a mapping from PDBs to release dates.""" if path.endswith('txt'): release_dates = {} - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: pdb_id, date = line.split(':') date = date.strip() diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py index 68cbf918..49991b11 100644 --- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py +++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py @@ -106,14 +106,14 @@ class MovieSceneSegmentationDataset(TorchTaskDataset): self.tmpl = '{}/shot_{}_img_{}.jpg' # video_id, shot_id, shot_num if not self.test_mode: - with open(self.ann_file) as f: + with open(self.ann_file, encoding='utf-8') as f: self.anno_data = json.load(f) self.vidsid2label = { f"{it['video_id']}_{it['shot_id']}": it['boundary_label'] for it in self.anno_data } else: - with open(self.ann_file) as f: + with open(self.ann_file, encoding='utf-8') as f: self.anno_data = json.load(f) def init_sampler(self, cfg): diff --git a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py index c90351e9..8b6d22a4 100644 --- a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py +++ b/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py @@ -146,7 +146,7 @@ class ReferringVideoObjectSegmentationDataset(TorchTaskDataset): saved_annotations_file_path = osp.join( root_path, f'sentences_single_frame_{subset}_annotations.json') if osp.exists(saved_annotations_file_path): - with open(saved_annotations_file_path, 'r') as f: + with open(saved_annotations_file_path, 'r', encoding='utf-8') as f: text_annotations_by_frame = [tuple(a) for a in json.load(f)] return text_annotations_by_frame elif (distributed and dist.get_rank() == 0) or not distributed: @@ -203,7 +203,7 @@ class ReferringVideoObjectSegmentationDataset(TorchTaskDataset): json.dump(text_annotations_by_frame, f) if distributed: dist.barrier() - with open(saved_annotations_file_path, 'r') as f: + with open(saved_annotations_file_path, 'r', encoding='utf-8') as f: text_annotations_by_frame = [tuple(a) for a in json.load(f)] return text_annotations_by_frame @@ -267,8 +267,10 @@ def get_text_annotations_gt(root_path, subset): osp.join(root_path, 'Release/videoset.csv'), header=None) # 'vid', 'label', 'start_time', 'end_time', 'height', 'width', 'total_frames', 'annotated_frames', 'subset' a2d_data_info.columns = ['vid', '', '', '', '', '', '', '', 'subset'] - with open(osp.join(root_path, 'text_annotations/missed_videos.txt'), - 'r') as f: + with open( + osp.join(root_path, 'text_annotations/missed_videos.txt'), + 'r', + encoding='utf-8') as f: unused_videos = f.read().splitlines() subsets = {'train': 0, 'test': 1} # filter unused videos and videos which do not belong to our train/test subset: diff --git a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py index 34eb0450..02639be8 100644 --- a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py +++ b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py @@ -26,7 +26,7 @@ class VideoSummarizationDataset(TorchTaskDataset): self.list_n_frames = [] self.list_positions = [] - with open(self.split_filename) as f: + with open(self.split_filename, encoding='utf-8') as f: data = json.loads(f.read()) for i, split in enumerate(data): if i == self.split_index: diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py index 6a4864bf..da339083 100644 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_inference_pipeline.py @@ -116,7 +116,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): } if self.framework == Frameworks.torch: - config_file = open(inputs['asr_model_config']) + config_file = open(inputs['asr_model_config'], encoding='utf-8') root = yaml.full_load(config_file) config_file.close() frontend_conf = None diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py index 671a5b4c..6d395a46 100644 --- a/modelscope/pipelines/cv/animal_recognition_pipeline.py +++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py @@ -109,7 +109,7 @@ class AnimalRecognitionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: label_mapping_path = osp.join(self.local_path, 'label_mapping.txt') - with open(label_mapping_path, 'r') as f: + with open(label_mapping_path, 'r', encoding='utf-8') as f: label_mapping = f.readlines() score = torch.max(inputs['outputs']) inputs = { diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py index 80f6f88a..c1136882 100644 --- a/modelscope/pipelines/cv/general_recognition_pipeline.py +++ b/modelscope/pipelines/cv/general_recognition_pipeline.py @@ -110,7 +110,7 @@ class GeneralRecognitionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: label_mapping_path = osp.join(self.local_path, 'meta_info.txt') - with open(label_mapping_path, 'r') as f: + with open(label_mapping_path, 'r', encoding='utf-8') as f: label_mapping = f.readlines() score = torch.max(inputs['outputs']) inputs = { diff --git a/modelscope/pipelines/cv/ocr_recognition_pipeline.py b/modelscope/pipelines/cv/ocr_recognition_pipeline.py index e81467a1..d90f8db6 100644 --- a/modelscope/pipelines/cv/ocr_recognition_pipeline.py +++ b/modelscope/pipelines/cv/ocr_recognition_pipeline.py @@ -49,7 +49,7 @@ class OCRRecognitionPipeline(Pipeline): self.infer_model.load_state_dict( torch.load(model_path, map_location=self.device)) self.labelMapping = dict() - with open(label_path, 'r') as f: + with open(label_path, 'r', encoding='utf-8') as f: lines = f.readlines() cnt = 2 for line in lines: diff --git a/modelscope/pipelines/cv/tinynas_classification_pipeline.py b/modelscope/pipelines/cv/tinynas_classification_pipeline.py index a470e58b..4dfd5c51 100644 --- a/modelscope/pipelines/cv/tinynas_classification_pipeline.py +++ b/modelscope/pipelines/cv/tinynas_classification_pipeline.py @@ -82,7 +82,7 @@ class TinynasClassificationPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: label_mapping_path = osp.join(self.path, 'label_map.txt') - f = open(label_mapping_path) + f = open(label_mapping_path, encoding='utf-8') content = f.read() f.close() label_dict = eval(content) diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py index e4c73649..4c52205e 100644 --- a/modelscope/pipelines/cv/video_category_pipeline.py +++ b/modelscope/pipelines/cv/video_category_pipeline.py @@ -36,7 +36,7 @@ class VideoCategoryPipeline(Pipeline): super().__init__(model=model, **kwargs) config_path = osp.join(self.model, ModelFile.CONFIGURATION) logger.info(f'loading configuration from {config_path}') - with open(config_path, 'r') as f: + with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) self.frame_num = config['frame_num'] self.level_1_num = config['level_1_num'] diff --git a/modelscope/pipelines/science/protein_structure_pipeline.py b/modelscope/pipelines/science/protein_structure_pipeline.py index 3dc51c72..1ef9aa29 100644 --- a/modelscope/pipelines/science/protein_structure_pipeline.py +++ b/modelscope/pipelines/science/protein_structure_pipeline.py @@ -59,8 +59,9 @@ def load_feature_for_one_target( else: uniprot_msa_dir = data_folder - sequence_ids = open(os.path.join(data_folder, - 'chains.txt')).readline().split() + sequence_ids = open( + os.path.join(data_folder, 'chains.txt'), + encoding='utf-8').readline().split() if symmetry_group is None: batch, _ = load_and_process( diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py index 1e659218..f02381ad 100644 --- a/modelscope/preprocessors/audio.py +++ b/modelscope/preprocessors/audio.py @@ -15,7 +15,7 @@ from modelscope.utils.constant import Fields def load_kaldi_feature_transform(filename): - fp = open(filename, 'r') + fp = open(filename, 'r', encoding='utf-8') all_str = fp.read() pos1 = all_str.find('AddShift') pos2 = all_str.find('[', pos1) diff --git a/modelscope/preprocessors/kws.py b/modelscope/preprocessors/kws.py index 6f09d545..33847702 100644 --- a/modelscope/preprocessors/kws.py +++ b/modelscope/preprocessors/kws.py @@ -78,7 +78,7 @@ class WavToLists(Preprocessor): assert os.path.exists( inputs['config_path']), 'model config yaml file does not exist' - config_file = open(inputs['config_path']) + config_file = open(inputs['config_path'], encoding='utf-8') root = yaml.full_load(config_file) config_file.close() diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 3a3ae820..52cde61c 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -145,8 +145,9 @@ class CLIPPreprocessor(Preprocessor): self.image_resolution = kwargs['resolution'] else: self.image_resolution = json.load( - open('{}/vision_model_config.json'.format( - model_dir)))['image_resolution'] + open( + '{}/vision_model_config.json'.format(model_dir), + encoding='utf-8'))['image_resolution'] self.img_preprocess = self._build_image_transform() # key mapping # specify the input keys, compatible with training and inference whose key names may be different diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 45efc6e7..7fe28eb5 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -59,8 +59,10 @@ class NLPBasePreprocessor(Preprocessor, ABC): self.use_fast = False elif self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): - with open(os.path.join(model_dir, 'tokenizer_config.json'), - 'r') as f: + with open( + os.path.join(model_dir, 'tokenizer_config.json'), + 'r', + encoding='utf-8') as f: json_config = json.load(f) self.use_fast = json_config.get('use_fast') self.use_fast = False if self.use_fast is None else self.use_fast diff --git a/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py index 2923157e..5aa662fc 100644 --- a/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py @@ -35,7 +35,10 @@ class DialogIntentPredictionPreprocessor(Preprocessor): self.model_dir, config=self.config) self.categories = None - with open(os.path.join(self.model_dir, 'categories.json'), 'r') as f: + with open( + os.path.join(self.model_dir, 'categories.json'), + 'r', + encoding='utf-8') as f: self.categories = json.load(f) assert len(self.categories) == 77 diff --git a/modelscope/preprocessors/nlp/space/dst_processors.py b/modelscope/preprocessors/nlp/space/dst_processors.py index 1f9920a9..1b6159b5 100644 --- a/modelscope/preprocessors/nlp/space/dst_processors.py +++ b/modelscope/preprocessors/nlp/space/dst_processors.py @@ -184,7 +184,7 @@ class multiwoz22Processor(DSTProcessor): # Loads the dialogue_acts.json and returns a list # of slot-value pairs. def load_acts(self, input_file): - with open(input_file) as f: + with open(input_file, encoding='utf-8') as f: acts = json.load(f) s_dict = {} for d in acts: diff --git a/modelscope/preprocessors/nlp/space/fields/gen_field.py b/modelscope/preprocessors/nlp/space/fields/gen_field.py index 1d1879fe..20b2c48a 100644 --- a/modelscope/preprocessors/nlp/space/fields/gen_field.py +++ b/modelscope/preprocessors/nlp/space/fields/gen_field.py @@ -359,12 +359,14 @@ class MultiWOZBPETextField(BPETextField): test_list = [ line.strip().lower() for line in open( os.path.join(kwargs['data_dir'], 'testListFile.json'), - 'r').readlines() + 'r', + encoding='utf-8').readlines() ] dev_list = [ line.strip().lower() for line in open( os.path.join(kwargs['data_dir'], 'valListFile.json'), - 'r').readlines() + 'r', + encoding='utf-8').readlines() ] self.dev_files, self.test_files = {}, {} diff --git a/modelscope/preprocessors/nlp/space/tokenizer.py b/modelscope/preprocessors/nlp/space/tokenizer.py index 1bd0ce11..798ce3b7 100644 --- a/modelscope/preprocessors/nlp/space/tokenizer.py +++ b/modelscope/preprocessors/nlp/space/tokenizer.py @@ -531,7 +531,7 @@ class GPT2Tokenizer(object): special_tokens=None, max_len=None): self.max_len = max_len if max_len is not None else int(1e12) - self.encoder = json.load(open(vocab_file)) + self.encoder = json.load(open(vocab_file, encoding='utf-8')) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() diff --git a/modelscope/preprocessors/nlp/space_T_cn/fields/database.py b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py index 5ceb5c79..1300cc95 100644 --- a/modelscope/preprocessors/nlp/space_T_cn/fields/database.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py @@ -32,12 +32,12 @@ class Database: tables = {} lines = [] if type(table_file_path) == str: - with open(table_file_path, 'r') as fo: + with open(table_file_path, 'r', encoding='utf-8') as fo: for line in fo: lines.append(line) elif type(table_file_path) == list: for path in table_file_path: - with open(path, 'r') as fo: + with open(path, 'r', encoding='utf-8') as fo: for line in fo: lines.append(line) else: diff --git a/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py index 00c7bcd7..0ebd857e 100644 --- a/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py @@ -45,7 +45,7 @@ class ConversationalTextToSqlPreprocessor(Preprocessor): and torch.cuda.is_available() else 'cpu' self.processor = None self.table_path = os.path.join(self.model_dir, 'tables.json') - self.tables = json.load(open(self.table_path, 'r')) + self.tables = json.load(open(self.table_path, 'r', encoding='utf-8')) self.output_tables = None self.path_cache = [] self.graph_processor = GraphProcessor() @@ -89,7 +89,7 @@ class ConversationalTextToSqlPreprocessor(Preprocessor): 'local_db_path'] not in self.path_cache: self.path_cache.append(data['local_db_path']) path = os.path.join(data['local_db_path'], 'tables.json') - self.tables = json.load(open(path, 'r')) + self.tables = json.load(open(path, 'r', encoding='utf-8')) self.processor.db_dir = os.path.join(data['local_db_path'], 'db') self.output_tables = process_tables(self.processor, self.tables) Example.configuration( diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 55b3895d..e5c30ff8 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -76,7 +76,7 @@ class OfaBasePreprocessor: self.constraint_trie = None if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(model_dir, self.cfg.model.answer2label) - with open(ans2label_file, 'r') as reader: + with open(ans2label_file, 'r', encoding='utf-8') as reader: ans2label_dict = json.load(reader) self.ans2label = ans2label_dict self.label2ans = {v: k for k, v in self.ans2label.items()} diff --git a/modelscope/preprocessors/science/uni_fold.py b/modelscope/preprocessors/science/uni_fold.py index 2a44c885..ae72433c 100644 --- a/modelscope/preprocessors/science/uni_fold.py +++ b/modelscope/preprocessors/science/uni_fold.py @@ -201,7 +201,7 @@ def run_mmseqs2( a3m_lines = {} for a3m_file in a3m_files: update_M, M = True, None - with open(a3m_file, 'r') as f: + with open(a3m_file, 'r', encoding='utf-8') as f: lines = f.readlines() for line in lines: if len(line) > 0: diff --git a/modelscope/trainers/nlp/space/eval.py b/modelscope/trainers/nlp/space/eval.py index f315ff07..2db40cae 100644 --- a/modelscope/trainers/nlp/space/eval.py +++ b/modelscope/trainers/nlp/space/eval.py @@ -771,7 +771,8 @@ class CamRestEvaluator(GenericEvaluator): def get_entities(self, entity_path): entities_flat = [] entitiy_to_slot_dict = {} - raw_entities = json.loads(open(entity_path).read().lower()) + raw_entities = json.loads( + open(entity_path, encoding='utf-8').read().lower()) for s in raw_entities['informable']: entities_flat.extend(raw_entities['informable'][s]) for v in raw_entities['informable'][s]: diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 32e2fa54..1ae5c8d2 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -47,7 +47,7 @@ def update_conf(origin_config_file, new_config_file, conf_item: [str, str]): else: return None - with open(origin_config_file) as f: + with open(origin_config_file, encoding='utf-8') as f: lines = f.readlines() with open(new_config_file, 'w') as f: for line in lines: diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index e46da7df..b3512251 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -178,7 +178,7 @@ class Config: if cfg_text: text = cfg_text elif filename: - with open(filename, 'r') as f: + with open(filename, 'r', encoding='utf-8') as f: text = f.read() else: text = '' diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 105b3ffa..93cc20e2 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -124,7 +124,7 @@ def parse_label_mapping(model_dir): label2id = None label_path = os.path.join(model_dir, ModelFile.LABEL_MAPPING) if os.path.exists(label_path): - with open(label_path) as f: + with open(label_path, encoding='utf-8') as f: label_mapping = json.load(f) label2id = {name: idx for name, idx in label_mapping.items()} diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py index 2c971b10..cbd0ebde 100644 --- a/modelscope/utils/nlp/space/clean_dataset.py +++ b/modelscope/utils/nlp/space/clean_dataset.py @@ -59,7 +59,9 @@ def clean_text(data_dir, text): text) # 'abc.xyz' -> 'abc . xyz' text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text) # if 'abc. ' -> 'abc . ' - with open(os.path.join(data_dir, 'mapping.pair'), 'r') as fin: + with open( + os.path.join(data_dir, 'mapping.pair'), 'r', + encoding='utf-8') as fin: for line in fin.readlines(): fromx, tox = line.replace('\n', '').split('\t') text = ' ' + text + ' ' diff --git a/modelscope/utils/nlp/space/db_ops.py b/modelscope/utils/nlp/space/db_ops.py index d1d14ef9..27198b23 100644 --- a/modelscope/utils/nlp/space/db_ops.py +++ b/modelscope/utils/nlp/space/db_ops.py @@ -15,7 +15,9 @@ class MultiWozDB(object): self.dbs = {} self.sql_dbs = {} for domain in all_domains: - with open(os.path.join(db_dir, db_paths[domain]), 'r') as f: + with open( + os.path.join(db_dir, db_paths[domain]), 'r', + encoding='utf-8') as f: self.dbs[domain] = json.loads(f.read().lower()) def oneHotVector(self, domain, num): diff --git a/modelscope/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py index 56e67671..70cb03a0 100644 --- a/modelscope/utils/nlp/space/utils.py +++ b/modelscope/utils/nlp/space/utils.py @@ -146,9 +146,9 @@ class MultiWOZVocab(object): def load_vocab(self, vocab_path): self._freq_dict = json.loads( - open(vocab_path + '.freq.json', 'r').read()) + open(vocab_path + '.freq.json', 'r', encoding='utf-8').read()) self._word2idx = json.loads( - open(vocab_path + '.word2idx.json', 'r').read()) + open(vocab_path + '.word2idx.json', 'r', encoding='utf-8').read()) self._idx2word = {} for w, idx in self._word2idx.items(): self._idx2word[idx] = w diff --git a/setup.py b/setup.py index eff2f8ba..d709dadc 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ def get_hash(): def get_version(): - with open(version_file, 'r') as f: + with open(version_file, 'r', encoding='utf-8') as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] @@ -109,7 +109,7 @@ def parse_requirements(fname='requirements.txt', with_version=True): yield info def parse_require_file(fpath): - with open(fpath, 'r') as f: + with open(fpath, 'r', encoding='utf-8') as f: for line in f.readlines(): line = line.strip() if line.startswith('http'): diff --git a/tests/run.py b/tests/run.py index b286ecb5..0759379f 100644 --- a/tests/run.py +++ b/tests/run.py @@ -247,7 +247,7 @@ def run_in_subprocess(args): test_suite_env_map[test_suite_file] = 'default' if args.run_config is not None and Path(args.run_config).exists(): - with open(args.run_config) as f: + with open(args.run_config, encoding='utf-8') as f: run_config = yaml.load(f, Loader=yaml.FullLoader) if 'isolated' in run_config: isolated_cases = run_config['isolated'] diff --git a/tests/trainers/easycv/test_easycv_trainer.py b/tests/trainers/easycv/test_easycv_trainer.py index 4bd63c55..5d714097 100644 --- a/tests/trainers/easycv/test_easycv_trainer.py +++ b/tests/trainers/easycv/test_easycv_trainer.py @@ -109,7 +109,7 @@ class EasyCVTrainerTestSingleGpu(unittest.TestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( @@ -185,7 +185,7 @@ class EasyCVTrainerTestMultiGpus(DistributedTestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index c73a56a3..5d466ee0 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -248,7 +248,7 @@ class TrainerTest(unittest.TestCase): results_files = os.listdir(self.tmp_dir) json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json') - with open(json_file, 'r') as f: + with open(json_file, 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( { @@ -367,7 +367,7 @@ class TrainerTest(unittest.TestCase): trainer.train() results_files = os.listdir(self.tmp_dir) json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json') - with open(json_file, 'r') as f: + with open(json_file, 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( { diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py index 0176704a..c003f3c9 100644 --- a/tests/trainers/test_trainer_gpu.py +++ b/tests/trainers/test_trainer_gpu.py @@ -142,7 +142,7 @@ class TrainerTestSingleGpu(unittest.TestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( { @@ -236,7 +236,7 @@ class TrainerTestMultiGpus(DistributedTestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( @@ -320,7 +320,7 @@ class TrainerTestMultiGpus(DistributedTestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] print(results_files, lines)