Browse Source

Specify file encoding when opening text files for reading

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10708723
master
yingda.chen 2 years ago
parent
commit
4e4faa9a30
64 changed files with 123 additions and 93 deletions
  1. +1
    -1
      docs/source/conf.py
  2. +2
    -2
      modelscope/hub/api.py
  3. +1
    -1
      modelscope/models/audio/tts/models/datasets/kantts_data4fs.py
  4. +1
    -1
      modelscope/models/audio/tts/sambert_hifi.py
  5. +1
    -1
      modelscope/models/cv/tinynas_classfication/plain_net_utils.py
  6. +1
    -1
      modelscope/models/multi_modal/clip/bert_tokenizer.py
  7. +4
    -2
      modelscope/models/multi_modal/clip/model.py
  8. +4
    -2
      modelscope/models/multi_modal/diffusion/model.py
  9. +1
    -1
      modelscope/models/multi_modal/diffusion/structbert.py
  10. +1
    -1
      modelscope/models/multi_modal/diffusion/tokenizer.py
  11. +3
    -1
      modelscope/models/multi_modal/gemm/gemm_base.py
  12. +3
    -1
      modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
  13. +1
    -1
      modelscope/models/multi_modal/mplug/configuration_mplug.py
  14. +2
    -1
      modelscope/models/multi_modal/multi_stage_diffusion/model.py
  15. +1
    -1
      modelscope/models/multi_modal/ofa_for_all_tasks.py
  16. +1
    -1
      modelscope/models/nlp/mglm/arguments.py
  17. +1
    -1
      modelscope/models/nlp/mglm/data_utils/corpora.py
  18. +2
    -2
      modelscope/models/nlp/mglm/data_utils/datasets.py
  19. +3
    -1
      modelscope/models/nlp/mglm/data_utils/extraction.py
  20. +1
    -1
      modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py
  21. +1
    -1
      modelscope/models/nlp/mglm/process_grid.py
  22. +1
    -1
      modelscope/models/nlp/mglm/tasks/language_model/dataset.py
  23. +6
    -4
      modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py
  24. +4
    -4
      modelscope/models/nlp/mglm/tasks/superglue/dataset.py
  25. +1
    -1
      modelscope/models/nlp/mglm/tasks/superglue/pvp.py
  26. +2
    -2
      modelscope/models/nlp/mglm/utils.py
  27. +1
    -1
      modelscope/models/science/unifold/data/residue_constants.py
  28. +3
    -2
      modelscope/models/science/unifold/dataset.py
  29. +2
    -2
      modelscope/models/science/unifold/msa/pipeline.py
  30. +1
    -1
      modelscope/models/science/unifold/msa/templates.py
  31. +2
    -2
      modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
  32. +6
    -4
      modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py
  33. +1
    -1
      modelscope/msdatasets/task_datasets/video_summarization_dataset.py
  34. +1
    -1
      modelscope/pipelines/audio/asr_inference_pipeline.py
  35. +1
    -1
      modelscope/pipelines/cv/animal_recognition_pipeline.py
  36. +1
    -1
      modelscope/pipelines/cv/general_recognition_pipeline.py
  37. +1
    -1
      modelscope/pipelines/cv/ocr_recognition_pipeline.py
  38. +1
    -1
      modelscope/pipelines/cv/tinynas_classification_pipeline.py
  39. +1
    -1
      modelscope/pipelines/cv/video_category_pipeline.py
  40. +3
    -2
      modelscope/pipelines/science/protein_structure_pipeline.py
  41. +1
    -1
      modelscope/preprocessors/audio.py
  42. +1
    -1
      modelscope/preprocessors/kws.py
  43. +3
    -2
      modelscope/preprocessors/multi_modal.py
  44. +4
    -2
      modelscope/preprocessors/nlp/nlp_base.py
  45. +4
    -1
      modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py
  46. +1
    -1
      modelscope/preprocessors/nlp/space/dst_processors.py
  47. +4
    -2
      modelscope/preprocessors/nlp/space/fields/gen_field.py
  48. +1
    -1
      modelscope/preprocessors/nlp/space/tokenizer.py
  49. +2
    -2
      modelscope/preprocessors/nlp/space_T_cn/fields/database.py
  50. +2
    -2
      modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py
  51. +1
    -1
      modelscope/preprocessors/ofa/base.py
  52. +1
    -1
      modelscope/preprocessors/science/uni_fold.py
  53. +2
    -1
      modelscope/trainers/nlp/space/eval.py
  54. +1
    -1
      modelscope/utils/audio/audio_utils.py
  55. +1
    -1
      modelscope/utils/config.py
  56. +1
    -1
      modelscope/utils/hub.py
  57. +3
    -1
      modelscope/utils/nlp/space/clean_dataset.py
  58. +3
    -1
      modelscope/utils/nlp/space/db_ops.py
  59. +2
    -2
      modelscope/utils/nlp/space/utils.py
  60. +2
    -2
      setup.py
  61. +1
    -1
      tests/run.py
  62. +2
    -2
      tests/trainers/easycv/test_easycv_trainer.py
  63. +2
    -2
      tests/trainers/test_trainer.py
  64. +3
    -3
      tests/trainers/test_trainer_gpu.py

+ 1
- 1
docs/source/conf.py View File

@@ -25,7 +25,7 @@ version_file = '../../modelscope/version.py'


def get_version():
with open(version_file, 'r') as f:
with open(version_file, 'r', encoding='utf-8') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__version__']



+ 2
- 2
modelscope/hub/api.py View File

@@ -739,7 +739,7 @@ class ModelScopeConfig:
with open(
os.path.join(ModelScopeConfig.path_credential,
ModelScopeConfig.USER_INFO_FILE_NAME),
'r') as f:
'r', encoding='utf-8') as f:
info = f.read()
return info.split(':')[0], info.split(':')[1]
except FileNotFoundError:
@@ -760,7 +760,7 @@ class ModelScopeConfig:
with open(
os.path.join(ModelScopeConfig.path_credential,
ModelScopeConfig.GIT_TOKEN_FILE_NAME),
'r') as f:
'r', encoding='utf-8') as f:
token = f.read()
except FileNotFoundError:
pass


+ 1
- 1
modelscope/models/audio/tts/models/datasets/kantts_data4fs.py View File

@@ -21,7 +21,7 @@ class KanTtsText2MelDataset(Dataset):

self.cache = cache

with open(config_filename) as f:
with open(config_filename, encoding='utf-8') as f:
self._config = json.loads(f.read())

# Load metadata:


+ 1
- 1
modelscope/models/audio/tts/sambert_hifi.py View File

@@ -60,7 +60,7 @@ class SambertHifigan(Model):
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(model_dir)
voice_cfg_path = os.path.join(self.__voice_path, 'voices.json')
with open(voice_cfg_path, 'r') as f:
with open(voice_cfg_path, 'r', encoding='utf-8') as f:
voice_cfg = json.load(f)
if 'voices' not in voice_cfg:
raise TtsModelConfigurationException(


+ 1
- 1
modelscope/models/cv/tinynas_classfication/plain_net_utils.py View File

@@ -39,7 +39,7 @@ class PlainNet(nn.Module):
plainnet_struct_txt = self.module_opt.plainnet_struct_txt

if plainnet_struct_txt is not None:
with open(plainnet_struct_txt, 'r') as fid:
with open(plainnet_struct_txt, 'r', encoding='utf-8') as fid:
the_line = fid.readlines()[0].strip()
self.plainnet_struct = the_line
pass


+ 1
- 1
modelscope/models/multi_modal/clip/bert_tokenizer.py View File

@@ -120,7 +120,7 @@ def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, 'r') as reader:
with open(vocab_file, 'r', encoding='utf-8') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:


+ 4
- 2
modelscope/models/multi_modal/clip/model.py View File

@@ -523,8 +523,10 @@ class CLIPForMultiModalEmbedding(TorchModel):
logger.info(f'Loading text model config from {text_model_config_file}')
assert os.path.exists(text_model_config_file)

with open(vision_model_config_file,
'r') as fv, open(text_model_config_file, 'r') as ft:
with open(
vision_model_config_file, 'r',
encoding='utf-8') as fv,\
open(text_model_config_file, 'r', encoding='utf-8') as ft:
self.model_info = json.load(fv)
for k, v in json.load(ft).items():
self.model_info[k] = v


+ 4
- 2
modelscope/models/multi_modal/diffusion/model.py View File

@@ -76,7 +76,7 @@ class DiffusionModel(nn.Module):
super(DiffusionModel, self).__init__()
# including text and generator config
model_config = json.load(
open('{}/model_config.json'.format(model_dir)))
open('{}/model_config.json'.format(model_dir), encoding='utf-8'))

# text encoder
text_config = model_config['text_config']
@@ -142,7 +142,9 @@ class DiffusionForTextToImageSynthesis(Model):

# diffusion process
diffusion_params = json.load(
open('{}/diffusion_config.json'.format(model_dir)))
open(
'{}/diffusion_config.json'.format(model_dir),
encoding='utf-8'))
self.diffusion_generator = make_diffusion(
**diffusion_params['generator_config'])
self.diffusion_upsampler_256 = make_diffusion(


+ 1
- 1
modelscope/models/multi_modal/diffusion/structbert.py View File

@@ -130,7 +130,7 @@ class BertConfig(object):
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with open(json_file, 'r') as reader:
with open(json_file, 'r', encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))



+ 1
- 1
modelscope/models/multi_modal/diffusion/tokenizer.py View File

@@ -67,7 +67,7 @@ def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, 'r') as reader:
with open(vocab_file, 'r', encoding='utf-8') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:


+ 3
- 1
modelscope/models/multi_modal/gemm/gemm_base.py View File

@@ -522,7 +522,9 @@ class GEMMModel(nn.Module):

def __init__(self, model_dir):
super().__init__()
with open('{}/encoder_config.json'.format(model_dir), 'r') as f:
with open(
'{}/encoder_config.json'.format(model_dir), 'r',
encoding='utf-8') as f:
model_config = json.loads(f.read())
model_name = list(model_config.keys())[0]
config_args = model_config[model_name]


+ 3
- 1
modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py View File

@@ -35,7 +35,9 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
def __init__(self, model_dir, **kwargs):
super().__init__(model_dir=model_dir, **kwargs)
# model config parameters
with open(f'{model_dir}/{ModelFile.CONFIGURATION}', 'r') as json_file:
with open(
f'{model_dir}/{ModelFile.CONFIGURATION}', 'r',
encoding='utf-8') as json_file:
model_config = json.load(json_file)
model_config = model_config['paras']
model_config['model_dir'] = model_dir


+ 1
- 1
modelscope/models/multi_modal/mplug/configuration_mplug.py View File

@@ -111,6 +111,6 @@ class MPlugConfig(PretrainedConfig):
@classmethod
def from_yaml_file(cls, yaml_file: Union[str,
os.PathLike]) -> Dict[str, Any]:
with open(yaml_file, 'r') as reader:
with open(yaml_file, 'r', encoding='utf-8') as reader:
config_dict = yaml.load(reader, Loader=yaml.Loader)
return cls(**config_dict)

+ 2
- 1
modelscope/models/multi_modal/multi_stage_diffusion/model.py View File

@@ -50,7 +50,8 @@ class UnCLIP(nn.Module):
def __init__(self, model_dir):
super(UnCLIP, self).__init__()
self.model_dir = model_dir
self.config = json.load(open(f'{model_dir}/{ModelFile.CONFIGURATION}'))
self.config = json.load(
open(f'{model_dir}/{ModelFile.CONFIGURATION}', encoding='utf-8'))

# modules
self.clip = CLIP(**self.config['clip']).fp16()


+ 1
- 1
modelscope/models/multi_modal/ofa_for_all_tasks.py View File

@@ -312,7 +312,7 @@ class OfaForAllTasks(TorchModel):
if self.cfg.model.get('answer2label', None):
ans2label_file = osp.join(self.model_dir,
self.cfg.model.answer2label)
with open(ans2label_file, 'r') as reader:
with open(ans2label_file, 'r', encoding='utf-8') as reader:
self.ans2label_dict = json.load(reader)

def save_pretrained(self,


+ 1
- 1
modelscope/models/nlp/mglm/arguments.py View File

@@ -743,7 +743,7 @@ def get_args():

if hasattr(args, 'deepspeed'
) and args.deepspeed and args.deepspeed_config is not None:
with open(args.deepspeed_config) as file:
with open(args.deepspeed_config, encoding='utf-8') as file:
deepspeed_config = json.load(file)
if 'train_micro_batch_size_per_gpu' in deepspeed_config:
args.batch_size = deepspeed_config[


+ 1
- 1
modelscope/models/nlp/mglm/data_utils/corpora.py View File

@@ -156,7 +156,7 @@ class DataReader:
def read_input_to_queue():
for path in paths:
print_rank_0(f'Start reading {path}')
with open(path) as file:
with open(path, encoding='utf-8') as file:
items = json.load(file)
for item in items:
task_queue.put(item)


+ 2
- 2
modelscope/models/nlp/mglm/data_utils/datasets.py View File

@@ -511,12 +511,12 @@ class json_dataset(data.Dataset):

def load_json_stream(self, load_path):
if not self.loose_json:
jsons = json.load(open(load_path, 'r'))
jsons = json.load(open(load_path, 'r', encoding='utf-8'))
generator = iter(jsons)
else:

def gen_helper():
with open(load_path, 'r') as f:
with open(load_path, 'r', encoding='utf-8') as f:
for row in f:
yield json.loads(row)



+ 3
- 1
modelscope/models/nlp/mglm/data_utils/extraction.py View File

@@ -29,7 +29,9 @@ with open(output_path, 'w') as output:
print(filename)
article_lines = []
article_open = False
with open(filename, mode='r', newline='\n') as file:
with open(
filename, mode='r', newline='\n',
encoding='utf-8') as file:
for line in file:
line = line.rstrip()
if '<doc id=' in line:


+ 1
- 1
modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py View File

@@ -179,7 +179,7 @@ class GPT2Tokenizer(object):
special_tokens=None,
max_len=None):
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file))
self.encoder = json.load(open(vocab_file, encoding='utf-8'))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()


+ 1
- 1
modelscope/models/nlp/mglm/process_grid.py View File

@@ -19,7 +19,7 @@ for dir_path in glob.glob(path_pattern, recursive=True):
valid_path = os.path.join(dir_path, 'results.json')
if os.path.exists(valid_path):
print(entry)
with open(valid_path) as file:
with open(valid_path, encoding='utf-8') as file:
valid_result = json.load(file)
else:
print(f'{entry} no validation results')


+ 1
- 1
modelscope/models/nlp/mglm/tasks/language_model/dataset.py View File

@@ -121,7 +121,7 @@ class LambadaDataset(torch.utils.data.Dataset):

self.tokens = []
self.labels = []
with open(data_path, 'r') as f:
with open(data_path, 'r', encoding='utf-8') as f:
for line in f.readlines():
text = json.loads(line)['text']
tokens, labels = self.get_tokens(text)


+ 6
- 4
modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py View File

@@ -209,14 +209,16 @@ class XSumProcessor:
raise NotImplementedError(split)
print_rank_0(f'Creating XSUM-{split} dataset from {self.data_dir}')
with open(
os.path.join(
self.data_dir,
'XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json')) as file:
os.path.join(self.data_dir,
'XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json'),
encoding='utf-8') as file:
id_list = json.load(file)
id_list = id_list[key]
source_texts, target_texts = [], []
for i, idx in enumerate(id_list):
with open(os.path.join(self.data_dir, f'{idx}.summary')) as file:
with open(
os.path.join(self.data_dir, f'{idx}.summary'),
encoding='utf-8') as file:
key, sentences = None, []
source_text, target_text = None, None
for line in file:


+ 4
- 4
modelscope/models/nlp/mglm/tasks/superglue/dataset.py View File

@@ -841,7 +841,7 @@ class RaceProcessor(DataProcessor):
path, 'middle', '*.txt')) + glob.glob(
os.path.join(path, 'high', '*.txt'))
for filename in filenames:
with open(filename, 'r') as f:
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
data = json.loads(line)
idx = data['id']
@@ -1127,7 +1127,7 @@ class AgnewsProcessor(DataProcessor):
def _create_examples(path: str, set_type: str) -> List[InputExample]:
examples = []

with open(path) as f:
with open(path, encoding='utf-8') as f:
reader = csv.reader(f, delimiter=',')
for idx, row in enumerate(reader):
label, headline, body = row
@@ -1209,7 +1209,7 @@ class YelpPolarityProcessor(DataProcessor):
def _create_examples(path: str, set_type: str) -> List[InputExample]:
examples = []

with open(path) as f:
with open(path, encoding='utf-8') as f:
reader = csv.reader(f, delimiter=',')
for idx, row in enumerate(reader):
label, body = row
@@ -1419,7 +1419,7 @@ class SquadProcessor(DataProcessor):
@staticmethod
def _create_examples(path: str, set_type: str) -> List[InputExample]:
examples = []
with open(path) as f:
with open(path, encoding='utf-8') as f:
data = json.load(f)['data']

for idx, passage in enumerate(data):


+ 1
- 1
modelscope/models/nlp/mglm/tasks/superglue/pvp.py View File

@@ -538,7 +538,7 @@ class PVP(ABC):
dict) # type: Dict[int, Dict[str, List[str]]]
current_pattern_id = None

with open(path, 'r') as fh:
with open(path, 'r', encoding='utf-8') as fh:
for line in fh.read().splitlines():
if line.isdigit():
current_pattern_id = int(line)


+ 2
- 2
modelscope/models/nlp/mglm/utils.py View File

@@ -77,7 +77,7 @@ def print_and_save_args(args, verbose=True, log_dir=None):
with open(json_file, 'w') as output:
json.dump(vars(args), output, sort_keys=True)
if args.deepspeed and args.deepspeed_config is not None:
with open(args.deepspeed_config) as file:
with open(args.deepspeed_config, encoding='utf-8') as file:
deepspeed_config = json.load(file)
deepspeed_json_file = os.path.join(log_dir,
'config_gpt_large.json')
@@ -324,7 +324,7 @@ def get_checkpoint_iteration(load_path):
print_rank_0(' will not load any checkpoints and will start from '
'random')
return load_path, 0, False, False
with open(tracker_filename, 'r') as f:
with open(tracker_filename, 'r', encoding='utf-8') as f:
metastring = f.read().strip()
release = metastring == 'release'
# try:


+ 1
- 1
modelscope/models/science/unifold/data/residue_constants.py View File

@@ -443,7 +443,7 @@ def load_stereo_chemical_props():
stereo_chemical_props_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'stereo_chemical_props.txt')
with open(stereo_chemical_props_path, 'rt') as f:
with open(stereo_chemical_props_path, 'rt', encoding='utf-8') as f:
stereo_chemical_props = f.read()
lines_iter = iter(stereo_chemical_props.splitlines())
# Load bond lengths.


+ 3
- 2
modelscope/models/science/unifold/dataset.py View File

@@ -250,7 +250,7 @@ class UnifoldDataset(UnicoreDataset):
self.path = data_path

def load_json(filename):
return json.load(open(filename, 'r'))
return json.load(open(filename, 'r', encoding='utf-8'))

sample_weight = load_json(
os.path.join(self.path,
@@ -400,7 +400,8 @@ class UnifoldMultimerDataset(UnifoldDataset):
self.pdb_assembly = json.load(
open(
os.path.join(self.data_path,
json_prefix + 'pdb_assembly.json')))
json_prefix + 'pdb_assembly.json'),
encoding='utf-8'))
self.pdb_chains = self.get_chains(self.inverse_multi_label)
self.monomer_feature_path = os.path.join(self.data_path,
'pdb_features')


+ 2
- 2
modelscope/models/science/unifold/msa/pipeline.py View File

@@ -99,7 +99,7 @@ def run_msa_tool(
f.write(result[msa_format])
else:
logging.warning('Reading MSA from file %s', msa_out_path)
with open(msa_out_path, 'r') as f:
with open(msa_out_path, 'r', encoding='utf-8') as f:
result = {msa_format: f.read()}
return result

@@ -153,7 +153,7 @@ class DataPipeline:
def process(self, input_fasta_path: str,
msa_output_dir: str) -> FeatureDict:
"""Runs alignment tools on the input sequence and creates features."""
with open(input_fasta_path) as f:
with open(input_fasta_path, encoding='utf-8') as f:
input_fasta_str = f.read()
input_seqs, input_descs = parsers.parse_fasta(input_fasta_str)
if len(input_seqs) != 1:


+ 1
- 1
modelscope/models/science/unifold/msa/templates.py View File

@@ -155,7 +155,7 @@ def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]:
"""Parses release dates file, returns a mapping from PDBs to release dates."""
if path.endswith('txt'):
release_dates = {}
with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
pdb_id, date = line.split(':')
date = date.strip()


+ 2
- 2
modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py View File

@@ -106,14 +106,14 @@ class MovieSceneSegmentationDataset(TorchTaskDataset):
self.tmpl = '{}/shot_{}_img_{}.jpg' # video_id, shot_id, shot_num

if not self.test_mode:
with open(self.ann_file) as f:
with open(self.ann_file, encoding='utf-8') as f:
self.anno_data = json.load(f)
self.vidsid2label = {
f"{it['video_id']}_{it['shot_id']}": it['boundary_label']
for it in self.anno_data
}
else:
with open(self.ann_file) as f:
with open(self.ann_file, encoding='utf-8') as f:
self.anno_data = json.load(f)

def init_sampler(self, cfg):


+ 6
- 4
modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py View File

@@ -146,7 +146,7 @@ class ReferringVideoObjectSegmentationDataset(TorchTaskDataset):
saved_annotations_file_path = osp.join(
root_path, f'sentences_single_frame_{subset}_annotations.json')
if osp.exists(saved_annotations_file_path):
with open(saved_annotations_file_path, 'r') as f:
with open(saved_annotations_file_path, 'r', encoding='utf-8') as f:
text_annotations_by_frame = [tuple(a) for a in json.load(f)]
return text_annotations_by_frame
elif (distributed and dist.get_rank() == 0) or not distributed:
@@ -203,7 +203,7 @@ class ReferringVideoObjectSegmentationDataset(TorchTaskDataset):
json.dump(text_annotations_by_frame, f)
if distributed:
dist.barrier()
with open(saved_annotations_file_path, 'r') as f:
with open(saved_annotations_file_path, 'r', encoding='utf-8') as f:
text_annotations_by_frame = [tuple(a) for a in json.load(f)]
return text_annotations_by_frame

@@ -267,8 +267,10 @@ def get_text_annotations_gt(root_path, subset):
osp.join(root_path, 'Release/videoset.csv'), header=None)
# 'vid', 'label', 'start_time', 'end_time', 'height', 'width', 'total_frames', 'annotated_frames', 'subset'
a2d_data_info.columns = ['vid', '', '', '', '', '', '', '', 'subset']
with open(osp.join(root_path, 'text_annotations/missed_videos.txt'),
'r') as f:
with open(
osp.join(root_path, 'text_annotations/missed_videos.txt'),
'r',
encoding='utf-8') as f:
unused_videos = f.read().splitlines()
subsets = {'train': 0, 'test': 1}
# filter unused videos and videos which do not belong to our train/test subset:


+ 1
- 1
modelscope/msdatasets/task_datasets/video_summarization_dataset.py View File

@@ -26,7 +26,7 @@ class VideoSummarizationDataset(TorchTaskDataset):
self.list_n_frames = []
self.list_positions = []

with open(self.split_filename) as f:
with open(self.split_filename, encoding='utf-8') as f:
data = json.loads(f.read())
for i, split in enumerate(data):
if i == self.split_index:


+ 1
- 1
modelscope/pipelines/audio/asr_inference_pipeline.py View File

@@ -116,7 +116,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
}

if self.framework == Frameworks.torch:
config_file = open(inputs['asr_model_config'])
config_file = open(inputs['asr_model_config'], encoding='utf-8')
root = yaml.full_load(config_file)
config_file.close()
frontend_conf = None


+ 1
- 1
modelscope/pipelines/cv/animal_recognition_pipeline.py View File

@@ -109,7 +109,7 @@ class AnimalRecognitionPipeline(Pipeline):

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
label_mapping_path = osp.join(self.local_path, 'label_mapping.txt')
with open(label_mapping_path, 'r') as f:
with open(label_mapping_path, 'r', encoding='utf-8') as f:
label_mapping = f.readlines()
score = torch.max(inputs['outputs'])
inputs = {


+ 1
- 1
modelscope/pipelines/cv/general_recognition_pipeline.py View File

@@ -110,7 +110,7 @@ class GeneralRecognitionPipeline(Pipeline):

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
label_mapping_path = osp.join(self.local_path, 'meta_info.txt')
with open(label_mapping_path, 'r') as f:
with open(label_mapping_path, 'r', encoding='utf-8') as f:
label_mapping = f.readlines()
score = torch.max(inputs['outputs'])
inputs = {


+ 1
- 1
modelscope/pipelines/cv/ocr_recognition_pipeline.py View File

@@ -49,7 +49,7 @@ class OCRRecognitionPipeline(Pipeline):
self.infer_model.load_state_dict(
torch.load(model_path, map_location=self.device))
self.labelMapping = dict()
with open(label_path, 'r') as f:
with open(label_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
cnt = 2
for line in lines:


+ 1
- 1
modelscope/pipelines/cv/tinynas_classification_pipeline.py View File

@@ -82,7 +82,7 @@ class TinynasClassificationPipeline(Pipeline):

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
label_mapping_path = osp.join(self.path, 'label_map.txt')
f = open(label_mapping_path)
f = open(label_mapping_path, encoding='utf-8')
content = f.read()
f.close()
label_dict = eval(content)


+ 1
- 1
modelscope/pipelines/cv/video_category_pipeline.py View File

@@ -36,7 +36,7 @@ class VideoCategoryPipeline(Pipeline):
super().__init__(model=model, **kwargs)
config_path = osp.join(self.model, ModelFile.CONFIGURATION)
logger.info(f'loading configuration from {config_path}')
with open(config_path, 'r') as f:
with open(config_path, 'r', encoding='utf-8') as f:
config = json.load(f)
self.frame_num = config['frame_num']
self.level_1_num = config['level_1_num']


+ 3
- 2
modelscope/pipelines/science/protein_structure_pipeline.py View File

@@ -59,8 +59,9 @@ def load_feature_for_one_target(

else:
uniprot_msa_dir = data_folder
sequence_ids = open(os.path.join(data_folder,
'chains.txt')).readline().split()
sequence_ids = open(
os.path.join(data_folder, 'chains.txt'),
encoding='utf-8').readline().split()

if symmetry_group is None:
batch, _ = load_and_process(


+ 1
- 1
modelscope/preprocessors/audio.py View File

@@ -15,7 +15,7 @@ from modelscope.utils.constant import Fields


def load_kaldi_feature_transform(filename):
fp = open(filename, 'r')
fp = open(filename, 'r', encoding='utf-8')
all_str = fp.read()
pos1 = all_str.find('AddShift')
pos2 = all_str.find('[', pos1)


+ 1
- 1
modelscope/preprocessors/kws.py View File

@@ -78,7 +78,7 @@ class WavToLists(Preprocessor):
assert os.path.exists(
inputs['config_path']), 'model config yaml file does not exist'

config_file = open(inputs['config_path'])
config_file = open(inputs['config_path'], encoding='utf-8')
root = yaml.full_load(config_file)
config_file.close()



+ 3
- 2
modelscope/preprocessors/multi_modal.py View File

@@ -145,8 +145,9 @@ class CLIPPreprocessor(Preprocessor):
self.image_resolution = kwargs['resolution']
else:
self.image_resolution = json.load(
open('{}/vision_model_config.json'.format(
model_dir)))['image_resolution']
open(
'{}/vision_model_config.json'.format(model_dir),
encoding='utf-8'))['image_resolution']
self.img_preprocess = self._build_image_transform()
# key mapping
# specify the input keys, compatible with training and inference whose key names may be different


+ 4
- 2
modelscope/preprocessors/nlp/nlp_base.py View File

@@ -59,8 +59,10 @@ class NLPBasePreprocessor(Preprocessor, ABC):
self.use_fast = False
elif self.use_fast is None and os.path.isfile(
os.path.join(model_dir, 'tokenizer_config.json')):
with open(os.path.join(model_dir, 'tokenizer_config.json'),
'r') as f:
with open(
os.path.join(model_dir, 'tokenizer_config.json'),
'r',
encoding='utf-8') as f:
json_config = json.load(f)
self.use_fast = json_config.get('use_fast')
self.use_fast = False if self.use_fast is None else self.use_fast


+ 4
- 1
modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py View File

@@ -35,7 +35,10 @@ class DialogIntentPredictionPreprocessor(Preprocessor):
self.model_dir, config=self.config)

self.categories = None
with open(os.path.join(self.model_dir, 'categories.json'), 'r') as f:
with open(
os.path.join(self.model_dir, 'categories.json'),
'r',
encoding='utf-8') as f:
self.categories = json.load(f)
assert len(self.categories) == 77



+ 1
- 1
modelscope/preprocessors/nlp/space/dst_processors.py View File

@@ -184,7 +184,7 @@ class multiwoz22Processor(DSTProcessor):
# Loads the dialogue_acts.json and returns a list
# of slot-value pairs.
def load_acts(self, input_file):
with open(input_file) as f:
with open(input_file, encoding='utf-8') as f:
acts = json.load(f)
s_dict = {}
for d in acts:


+ 4
- 2
modelscope/preprocessors/nlp/space/fields/gen_field.py View File

@@ -359,12 +359,14 @@ class MultiWOZBPETextField(BPETextField):
test_list = [
line.strip().lower() for line in open(
os.path.join(kwargs['data_dir'], 'testListFile.json'),
'r').readlines()
'r',
encoding='utf-8').readlines()
]
dev_list = [
line.strip().lower() for line in open(
os.path.join(kwargs['data_dir'], 'valListFile.json'),
'r').readlines()
'r',
encoding='utf-8').readlines()
]

self.dev_files, self.test_files = {}, {}


+ 1
- 1
modelscope/preprocessors/nlp/space/tokenizer.py View File

@@ -531,7 +531,7 @@ class GPT2Tokenizer(object):
special_tokens=None,
max_len=None):
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file))
self.encoder = json.load(open(vocab_file, encoding='utf-8'))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()


+ 2
- 2
modelscope/preprocessors/nlp/space_T_cn/fields/database.py View File

@@ -32,12 +32,12 @@ class Database:
tables = {}
lines = []
if type(table_file_path) == str:
with open(table_file_path, 'r') as fo:
with open(table_file_path, 'r', encoding='utf-8') as fo:
for line in fo:
lines.append(line)
elif type(table_file_path) == list:
for path in table_file_path:
with open(path, 'r') as fo:
with open(path, 'r', encoding='utf-8') as fo:
for line in fo:
lines.append(line)
else:


+ 2
- 2
modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py View File

@@ -45,7 +45,7 @@ class ConversationalTextToSqlPreprocessor(Preprocessor):
and torch.cuda.is_available() else 'cpu'
self.processor = None
self.table_path = os.path.join(self.model_dir, 'tables.json')
self.tables = json.load(open(self.table_path, 'r'))
self.tables = json.load(open(self.table_path, 'r', encoding='utf-8'))
self.output_tables = None
self.path_cache = []
self.graph_processor = GraphProcessor()
@@ -89,7 +89,7 @@ class ConversationalTextToSqlPreprocessor(Preprocessor):
'local_db_path'] not in self.path_cache:
self.path_cache.append(data['local_db_path'])
path = os.path.join(data['local_db_path'], 'tables.json')
self.tables = json.load(open(path, 'r'))
self.tables = json.load(open(path, 'r', encoding='utf-8'))
self.processor.db_dir = os.path.join(data['local_db_path'], 'db')
self.output_tables = process_tables(self.processor, self.tables)
Example.configuration(


+ 1
- 1
modelscope/preprocessors/ofa/base.py View File

@@ -76,7 +76,7 @@ class OfaBasePreprocessor:
self.constraint_trie = None
if self.cfg.model.get('answer2label', None):
ans2label_file = osp.join(model_dir, self.cfg.model.answer2label)
with open(ans2label_file, 'r') as reader:
with open(ans2label_file, 'r', encoding='utf-8') as reader:
ans2label_dict = json.load(reader)
self.ans2label = ans2label_dict
self.label2ans = {v: k for k, v in self.ans2label.items()}


+ 1
- 1
modelscope/preprocessors/science/uni_fold.py View File

@@ -201,7 +201,7 @@ def run_mmseqs2(
a3m_lines = {}
for a3m_file in a3m_files:
update_M, M = True, None
with open(a3m_file, 'r') as f:
with open(a3m_file, 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
if len(line) > 0:


+ 2
- 1
modelscope/trainers/nlp/space/eval.py View File

@@ -771,7 +771,8 @@ class CamRestEvaluator(GenericEvaluator):
def get_entities(self, entity_path):
entities_flat = []
entitiy_to_slot_dict = {}
raw_entities = json.loads(open(entity_path).read().lower())
raw_entities = json.loads(
open(entity_path, encoding='utf-8').read().lower())
for s in raw_entities['informable']:
entities_flat.extend(raw_entities['informable'][s])
for v in raw_entities['informable'][s]:


+ 1
- 1
modelscope/utils/audio/audio_utils.py View File

@@ -47,7 +47,7 @@ def update_conf(origin_config_file, new_config_file, conf_item: [str, str]):
else:
return None

with open(origin_config_file) as f:
with open(origin_config_file, encoding='utf-8') as f:
lines = f.readlines()
with open(new_config_file, 'w') as f:
for line in lines:


+ 1
- 1
modelscope/utils/config.py View File

@@ -178,7 +178,7 @@ class Config:
if cfg_text:
text = cfg_text
elif filename:
with open(filename, 'r') as f:
with open(filename, 'r', encoding='utf-8') as f:
text = f.read()
else:
text = ''


+ 1
- 1
modelscope/utils/hub.py View File

@@ -124,7 +124,7 @@ def parse_label_mapping(model_dir):
label2id = None
label_path = os.path.join(model_dir, ModelFile.LABEL_MAPPING)
if os.path.exists(label_path):
with open(label_path) as f:
with open(label_path, encoding='utf-8') as f:
label_mapping = json.load(f)
label2id = {name: idx for name, idx in label_mapping.items()}



+ 3
- 1
modelscope/utils/nlp/space/clean_dataset.py View File

@@ -59,7 +59,9 @@ def clean_text(data_dir, text):
text) # 'abc.xyz' -> 'abc . xyz'
text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text) # if 'abc. ' -> 'abc . '

with open(os.path.join(data_dir, 'mapping.pair'), 'r') as fin:
with open(
os.path.join(data_dir, 'mapping.pair'), 'r',
encoding='utf-8') as fin:
for line in fin.readlines():
fromx, tox = line.replace('\n', '').split('\t')
text = ' ' + text + ' '


+ 3
- 1
modelscope/utils/nlp/space/db_ops.py View File

@@ -15,7 +15,9 @@ class MultiWozDB(object):
self.dbs = {}
self.sql_dbs = {}
for domain in all_domains:
with open(os.path.join(db_dir, db_paths[domain]), 'r') as f:
with open(
os.path.join(db_dir, db_paths[domain]), 'r',
encoding='utf-8') as f:
self.dbs[domain] = json.loads(f.read().lower())

def oneHotVector(self, domain, num):


+ 2
- 2
modelscope/utils/nlp/space/utils.py View File

@@ -146,9 +146,9 @@ class MultiWOZVocab(object):

def load_vocab(self, vocab_path):
self._freq_dict = json.loads(
open(vocab_path + '.freq.json', 'r').read())
open(vocab_path + '.freq.json', 'r', encoding='utf-8').read())
self._word2idx = json.loads(
open(vocab_path + '.word2idx.json', 'r').read())
open(vocab_path + '.word2idx.json', 'r', encoding='utf-8').read())
self._idx2word = {}
for w, idx in self._word2idx.items():
self._idx2word[idx] = w


+ 2
- 2
setup.py View File

@@ -50,7 +50,7 @@ def get_hash():


def get_version():
with open(version_file, 'r') as f:
with open(version_file, 'r', encoding='utf-8') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__version__']

@@ -109,7 +109,7 @@ def parse_requirements(fname='requirements.txt', with_version=True):
yield info

def parse_require_file(fpath):
with open(fpath, 'r') as f:
with open(fpath, 'r', encoding='utf-8') as f:
for line in f.readlines():
line = line.strip()
if line.startswith('http'):


+ 1
- 1
tests/run.py View File

@@ -247,7 +247,7 @@ def run_in_subprocess(args):
test_suite_env_map[test_suite_file] = 'default'

if args.run_config is not None and Path(args.run_config).exists():
with open(args.run_config) as f:
with open(args.run_config, encoding='utf-8') as f:
run_config = yaml.load(f, Loader=yaml.FullLoader)
if 'isolated' in run_config:
isolated_cases = run_config['isolated']


+ 2
- 2
tests/trainers/easycv/test_easycv_trainer.py View File

@@ -109,7 +109,7 @@ class EasyCVTrainerTestSingleGpu(unittest.TestCase):
json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
self.assertEqual(len(json_files), 1)

with open(json_files[0], 'r') as f:
with open(json_files[0], 'r', encoding='utf-8') as f:
lines = [i.strip() for i in f.readlines()]

self.assertDictContainsSubset(
@@ -185,7 +185,7 @@ class EasyCVTrainerTestMultiGpus(DistributedTestCase):
json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
self.assertEqual(len(json_files), 1)

with open(json_files[0], 'r') as f:
with open(json_files[0], 'r', encoding='utf-8') as f:
lines = [i.strip() for i in f.readlines()]

self.assertDictContainsSubset(


+ 2
- 2
tests/trainers/test_trainer.py View File

@@ -248,7 +248,7 @@ class TrainerTest(unittest.TestCase):
results_files = os.listdir(self.tmp_dir)

json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json')
with open(json_file, 'r') as f:
with open(json_file, 'r', encoding='utf-8') as f:
lines = [i.strip() for i in f.readlines()]
self.assertDictContainsSubset(
{
@@ -367,7 +367,7 @@ class TrainerTest(unittest.TestCase):
trainer.train()
results_files = os.listdir(self.tmp_dir)
json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json')
with open(json_file, 'r') as f:
with open(json_file, 'r', encoding='utf-8') as f:
lines = [i.strip() for i in f.readlines()]
self.assertDictContainsSubset(
{


+ 3
- 3
tests/trainers/test_trainer_gpu.py View File

@@ -142,7 +142,7 @@ class TrainerTestSingleGpu(unittest.TestCase):
json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
self.assertEqual(len(json_files), 1)

with open(json_files[0], 'r') as f:
with open(json_files[0], 'r', encoding='utf-8') as f:
lines = [i.strip() for i in f.readlines()]
self.assertDictContainsSubset(
{
@@ -236,7 +236,7 @@ class TrainerTestMultiGpus(DistributedTestCase):
json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
self.assertEqual(len(json_files), 1)

with open(json_files[0], 'r') as f:
with open(json_files[0], 'r', encoding='utf-8') as f:
lines = [i.strip() for i in f.readlines()]

self.assertDictContainsSubset(
@@ -320,7 +320,7 @@ class TrainerTestMultiGpus(DistributedTestCase):
json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
self.assertEqual(len(json_files), 1)

with open(json_files[0], 'r') as f:
with open(json_files[0], 'r', encoding='utf-8') as f:
lines = [i.strip() for i in f.readlines()]

print(results_files, lines)


Loading…
Cancel
Save