
[to #46106568]feat: parallel run ci case

master · mulin.lyh · 2 years ago · parent commit 90a5efa1c2
11 changed files with 178 additions and 28 deletions
1. .dev_scripts/ci_container_test.sh (+1, -4)
2. .dev_scripts/dockerci.sh (+14, -14)
3. modelscope/models/cv/face_detection/mogface/models/detectors.py (+0, -1)
4. modelscope/models/cv/face_detection/mtcnn/models/detector.py (+0, -1)
5. modelscope/models/cv/face_detection/retinaface/detection.py (+0, -1)
6. modelscope/models/cv/face_detection/ulfd_slim/detection.py (+0, -1)
7. modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py (+0, -1)
8. modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py (+0, -1)
9. tests/run.py (+134, -2)
10. tests/run_config.yaml (+28, -1)
11. tests/trainers/test_dialog_intent_trainer.py (+1, -1)

.dev_scripts/ci_container_test.sh (+1, -4)

@@ -1,6 +1,3 @@
-echo "Testing envs"
-printenv
-echo "ENV END"
 if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
     pip install -r requirements/tests.txt
     git config --global --add safe.directory /Maas-lib
@@ -23,7 +20,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
     awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
     awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
     awk -F: '/^[^#]/ { print $1 }' requirements/science.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r requirements/tests.txt
     # test with install
     python setup.py install
 else

.dev_scripts/dockerci.sh (+14, -14)

@@ -3,30 +3,32 @@ MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
 CODE_DIR=$PWD
 CODE_DIR_IN_CONTAINER=/Maas-lib
 echo "$USER"
-gpus='7 6 5 4 3 2 1 0'
-cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
+gpus='0,1 2,3 4,5 6,7'
+cpu_sets='45-58 31-44 16-30 0-15'
 cpu_sets_arr=($cpu_sets)
 is_get_file_lock=false
-# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
-CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
+CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml'
 echo "ci command: $CI_COMMAND"
+idx=0
 for gpu in $gpus
 do
   exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
-  flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; continue; }
+  flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
   echo "get gpu lock $gpu"
-  CONTAINER_NAME="modelscope-ci-$gpu"
+  CONTAINER_NAME="modelscope-ci-$idx"
   let is_get_file_lock=true

   # pull image if there are update
   docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
   if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
+    echo 'debugging'
     docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
-      --cpuset-cpus=${cpu_sets_arr[$gpu]} \
-      --gpus="device=$gpu" \
+      --cpuset-cpus=${cpu_sets_arr[$idx]} \
+      --gpus='"'"device=$gpu"'"' \
       -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
       -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
-      -v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+      -v $MODELSCOPE_HOME_CACHE/$idx:/root \
       -v /home/admin/pre-commit:/home/admin/pre-commit \
       -e CI_TEST=True \
      -e TEST_LEVEL=$TEST_LEVEL \
@@ -41,16 +43,15 @@ do
       -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
       -e MODEL_TAG_URL=$MODEL_TAG_URL \
       --workdir=$CODE_DIR_IN_CONTAINER \
-      --net host \
       ${IMAGE_NAME}:${IMAGE_VERSION} \
       $CI_COMMAND
   else
     docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
-      --cpuset-cpus=${cpu_sets_arr[$gpu]} \
-      --gpus="device=$gpu" \
+      --cpuset-cpus=${cpu_sets_arr[$idx]} \
+      --gpus='"'"device=$gpu"'"' \
       -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
       -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
-      -v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+      -v $MODELSCOPE_HOME_CACHE/$idx:/root \
       -v /home/admin/pre-commit:/home/admin/pre-commit \
       -e CI_TEST=True \
       -e TEST_LEVEL=$TEST_LEVEL \
@@ -64,7 +65,6 @@ do
       -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
       -e MODEL_TAG_URL=$MODEL_TAG_URL \
       --workdir=$CODE_DIR_IN_CONTAINER \
-      --net host \
       ${IMAGE_NAME}:${IMAGE_VERSION} \
       $CI_COMMAND
   fi
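
Note: the loop above serializes access to each GPU pair with a non-blocking flock on a per-device lock file, so concurrent CI jobs skip devices that are already taken instead of queueing behind them. Below is a minimal Python sketch of the same pattern, assuming Linux and the standard fcntl module (the lock-file path mirrors the script; everything else is illustrative):

    import fcntl
    import os

    def try_lock_gpu(gpu):
        # Mirrors `exec {lock_fd}>"/tmp/gpu$gpu"` + `flock -n`: take a
        # non-blocking exclusive lock on a per-GPU file; on failure the
        # caller moves on to the next device instead of waiting.
        fd = os.open('/tmp/gpu%s' % gpu, os.O_CREAT | os.O_WRONLY, 0o644)
        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return fd  # keep the fd open for the lifetime of the job
        except BlockingIOError:
            os.close(fd)
            return None

    for gpu in ['0,1', '2,3', '4,5', '6,7']:
        fd = try_lock_gpu(gpu)
        if fd is not None:
            print('got gpu lock %s' % gpu)
            break

The lock is released automatically when the holding process exits, which matches the script's use of a shell-held file descriptor.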


modelscope/models/cv/face_detection/mogface/models/detectors.py (+0, -1)

@@ -20,7 +20,6 @@ class MogFaceDetector(TorchModel):

     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device


modelscope/models/cv/face_detection/mtcnn/models/detector.py (+0, -1)

@@ -21,7 +21,6 @@ class MtcnnFaceDetector(TorchModel):

     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device


modelscope/models/cv/face_detection/retinaface/detection.py (+0, -1)

@@ -18,7 +18,6 @@ class RetinaFaceDetection(TorchModel):

     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.cfg = Config.from_file(


modelscope/models/cv/face_detection/ulfd_slim/detection.py (+0, -1)

@@ -24,7 +24,6 @@ class UlfdFaceDetector(TorchModel):

     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device


modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py (+0, -1)

@@ -24,7 +24,6 @@ class FacialExpressionRecognition(TorchModel):

     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device


modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py (+0, -1)

@@ -31,7 +31,6 @@ cfg_re50 = {
 class RetinaFaceDetection(object):

     def __init__(self, model_path, device='cuda'):
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device
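
Note: the six removals above all drop the same call. `torch.set_grad_enabled(False)` flips a process-wide autograd switch, so any model or trainer test constructed later in the same worker process would silently lose gradients; that conflict is presumably why it is removed now that cases share worker processes. A short sketch of the global toggle versus the scoped alternative (plain PyTorch, not modelscope code):

    import torch

    x = torch.ones(2, requires_grad=True)

    torch.set_grad_enabled(False)      # global switch: everything below is affected
    print((x * 2).requires_grad)       # False
    torch.set_grad_enabled(True)       # must be undone by hand

    with torch.no_grad():              # scoped alternative: ends with the block
        print((x * 2).requires_grad)   # False
    print((x * 2).requires_grad)       # True again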


tests/run.py (+134, -2)

@@ -3,11 +3,13 @@
 import argparse
 import datetime
+import math
 import multiprocessing
 import os
 import subprocess
 import sys
 import tempfile
+import time
 import unittest
 from fnmatch import fnmatch
 from multiprocessing.managers import BaseManager
@@ -158,6 +160,21 @@ def run_command_with_popen(cmd):
         sys.stdout.write(line)


+def async_run_command_with_popen(cmd, device_id):
+    logger.info('Worker id: %s args: %s' % (device_id, cmd))
+    env = os.environ.copy()
+    env['CUDA_VISIBLE_DEVICES'] = '%s' % device_id
+    sub_process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        bufsize=1,
+        universal_newlines=True,
+        env=env,
+        encoding='utf8')
+    return sub_process
+
+
 def save_test_result(df, args):
     if args.result_dir is not None:
         file_name = str(int(datetime.datetime.now().timestamp() * 1000))
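
Note: `async_run_command_with_popen` pins a worker to a device purely through the environment: with CUDA_VISIBLE_DEVICES set before the spawn, the child enumerates only its assigned GPU, and `cuda:0` inside the worker maps to that physical device. A small self-contained sketch of the mechanism (the probe script is illustrative):

    import os
    import subprocess
    import sys

    probe = 'import os; print(os.environ.get("CUDA_VISIBLE_DEVICES"))'
    for device_id in (0, 1):
        env = os.environ.copy()
        env['CUDA_VISIBLE_DEVICES'] = str(device_id)  # as in the diff
        out = subprocess.run([sys.executable, '-c', probe],
                             env=env, capture_output=True, text=True)
        print('worker %s sees devices: %s' % (device_id, out.stdout.strip()))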
@@ -199,6 +216,108 @@ def install_requirements(requirements):
         run_command(cmd)


+def wait_for_free_worker(workers):
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                logger.info('return free worker: %s' % (idx))
+                return idx
+            if worker.poll() is None:  # running, get output
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+            else:  # worker process completed.
+                logger.info('Process end: %s' % (idx))
+                workers[idx] = None
+                return idx
+        time.sleep(0.001)
+
+
+def wait_for_workers(workers):
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                continue
+            # check worker is completed.
+            if worker.poll() is None:
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+            else:
+                logger.info('Process idx: %s end!' % (idx))
+                workers[idx] = None
+
+        is_all_completed = True
+        for idx, worker in enumerate(workers):
+            if worker is not None:
+                is_all_completed = False
+                break
+
+        if is_all_completed:
+            logger.info('All sub processes completed!')
+            break
+        time.sleep(0.001)
+
+
+def parallel_run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
+                             result_dir, parallel):
+    logger.info('Running case in env: %s' % env_name)
+    # install requirements and deps  # run_config['envs'][env]
+    if 'requirements' in env:
+        install_requirements(env['requirements'])
+    if 'dependencies' in env:
+        install_packages(env['dependencies'])
+    # case worker processes
+    worker_processes = [None] * parallel
+    for test_suite_file in isolated_cases:  # run case in subprocess
+        if test_suite_file in test_suite_env_map and test_suite_env_map[
+                test_suite_file] == env_name:
+            cmd = [
+                'python',
+                'tests/run.py',
+                '--pattern',
+                test_suite_file,
+                '--result_dir',
+                result_dir,
+            ]
+            worker_idx = wait_for_free_worker(worker_processes)
+            worker_process = async_run_command_with_popen(cmd, worker_idx)
+            os.set_blocking(worker_process.stdout.fileno(), False)
+            worker_processes[worker_idx] = worker_process
+        else:
+            pass  # case not in run list.
+
+    # run remaining cases in a process.
+    remain_suite_files = []
+    for k, v in test_suite_env_map.items():
+        if k not in isolated_cases and v == env_name:
+            remain_suite_files.append(k)
+    if len(remain_suite_files) == 0:
+        return
+    # roughly split cases across workers
+    part_count = math.ceil(len(remain_suite_files) / parallel)
+    suites_chunks = [
+        remain_suite_files[x:x + part_count]
+        for x in range(0, len(remain_suite_files), part_count)
+    ]
+    for suites_chunk in suites_chunks:
+        worker_idx = wait_for_free_worker(worker_processes)
+        cmd = [
+            'python', 'tests/run.py', '--result_dir', result_dir, '--suites'
+        ]
+        for suite in suites_chunk:
+            cmd.append(suite)
+        worker_process = async_run_command_with_popen(cmd, worker_idx)
+        os.set_blocking(worker_process.stdout.fileno(), False)
+        worker_processes[worker_idx] = worker_process
+
+    wait_for_workers(worker_processes)
+
+
 def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
                     result_dir):
     # install requirements and deps  # run_config['envs'][env]
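
Note: `wait_for_free_worker` drains worker stdout inside a polling loop, so each pipe is switched to non-blocking with `os.set_blocking`; a blocking `readline()` on a quiet worker would otherwise stall scheduling for every other worker. A minimal demonstration of the non-blocking primitive, plus the chunk arithmetic used for the remaining suites (the sizes are illustrative):

    import math
    import os

    # Non-blocking read: fails fast instead of waiting for data.
    r, w = os.pipe()
    os.set_blocking(r, False)
    try:
        os.read(r, 1024)            # nothing written yet
    except BlockingIOError:
        print('no data yet, but the scheduler keeps moving')
    os.write(w, b'done\n')
    print(os.read(r, 1024))         # data present -> returned immediately

    # Chunking: 9 remaining suites at --parallel 2 -> ceil(9 / 2) = 5,
    # i.e. chunks of 5 and 4 suites handed to free workers in turn.
    suites = ['suite_%d.py' % i for i in range(9)]
    part_count = math.ceil(len(suites) / 2)
    print([suites[i:i + part_count] for i in range(0, len(suites), part_count)])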
@@ -264,8 +383,9 @@ def run_in_subprocess(args):

     with tempfile.TemporaryDirectory() as temp_result_dir:
         for env in set(test_suite_env_map.values()):
-            run_case_in_env(env, run_config['envs'][env], test_suite_env_map,
-                            isolated_cases, temp_result_dir)
+            parallel_run_case_in_env(env, run_config['envs'][env],
+                                     test_suite_env_map, isolated_cases,
+                                     temp_result_dir, args.parallel)

     result_dfs = []
     result_path = Path(temp_result_dir)
@@ -312,6 +432,10 @@ class TimeCostTextTestResult(TextTestResult):
         self.stream.writeln(
             'Test case: %s stop at: %s, cost time: %s(seconds)' %
             (test.test_full_name, test.stop_time, test.time_cost))
+        if torch.cuda.is_available(
+        ) and test.time_cost > 5.0:  # print nvidia-smi
+            cmd = ['nvidia-smi']
+            run_command_with_popen(cmd)
         super(TimeCostTextTestResult, self).stopTest(test)

     def addSuccess(self, test):
@@ -383,6 +507,8 @@ def main(args):
         os.path.abspath(args.test_dir), args.pattern, args.list_tests)
     if not args.list_tests:
         result = runner.run(test_suite)
+        logger.info('Running case completed, pid: %s, suites: %s' %
+                    (os.getpid(), args.suites))
         result = collect_test_results(result)
         df = test_cases_result_to_df(result)
         if args.result_dir is not None:
@@ -417,6 +543,12 @@ if __name__ == '__main__':
         '--result_dir',
         default=None,
         help='Save result to directory, internal use only')
+    parser.add_argument(
+        '--parallel',
+        default=1,
+        type=int,
+        help='Set case parallelism, default is a single process; set to the gpu count.'
+    )
     parser.add_argument(
         '--suites',
         nargs='*',
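
With the new flag wired in, the CI entrypoint hard-coded in dockerci.sh above becomes:

    python tests/run.py --parallel 2 --run_config tests/run_config.yaml

Isolated suites each get a dedicated worker process; all remaining suites for an environment are chunked across the two workers.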


tests/run_config.yaml (+28, -1)

@@ -1,5 +1,5 @@
 # isolate cases in env, we can install different dependencies in each env.
-isolated: # test cases that may require an excessive amount of GPU memory, which will be executed in a dedicated process.
+isolated: # test cases that may require an excessive amount of GPU memory or a long run time, which will be executed in a dedicated process.
 - test_text_to_speech.py
 - test_multi_modal_embedding.py
 - test_ofa_tasks.py
@@ -12,6 +12,33 @@ isolated:
 - test_segmentation_pipeline.py
 - test_image_inpainting.py
 - test_mglm_text_summarization.py
+- test_team_transfer_trainer.py
+- test_image_denoise_trainer.py
+- test_dialog_intent_trainer.py
+- test_finetune_mplug.py
+- test_image_instance_segmentation_trainer.py
+- test_image_portrait_enhancement_trainer.py
+- test_translation_trainer.py
+- test_unifold.py
+- test_automatic_post_editing.py
+- test_mplug_tasks.py
+- test_movie_scene_segmentation.py
+- test_body_3d_keypoints.py
+- test_finetune_text_generation.py
+- test_clip_trainer.py
+- test_ofa_trainer.py
+- test_fill_mask.py
+- test_hand_2d_keypoints.py
+- test_referring_video_object_segmentation.py
+- test_easycv_trainer_hand_2d_keypoints.py
+- test_card_detection_scrfd_trainer.py
+- test_referring_video_object_segmentation_trainer.py
+- test_person_image_cartoon.py
+- test_image_style_transfer.py
+- test_ocr_detection.py
+- test_automatic_speech_recognition.py
+- test_image_matting.py
+- test_skin_retouching.py

 envs:
   default: # default env; cases not assigned to another env run here (pytorch).
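
Note: a hedged sketch of how tests/run.py consumes this file, assuming the config is parsed with PyYAML (the diff shows the lookups `run_config['envs'][env]` and the isolated list, not the parsing itself):

    import yaml  # PyYAML; an assumption, the loader is not shown in the diff

    with open('tests/run_config.yaml') as f:
        run_config = yaml.safe_load(f)

    isolated_cases = run_config.get('isolated', [])  # one dedicated worker each
    envs = run_config['envs']                        # per-env requirements/deps
    print('%d isolated suites, envs: %s' % (len(isolated_cases), list(envs)))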


tests/trainers/test_dialog_intent_trainer.py (+1, -1)

@@ -94,7 +94,7 @@ class TestDialogIntentTrainer(unittest.TestCase):
             cfg.Model.update(config['Model'])
             if self.debugging:
                 cfg.Trainer.save_checkpoint = False
-                cfg.Trainer.num_epochs = 5
+                cfg.Trainer.num_epochs = 1
                 cfg.Trainer.batch_size_label = 64
         return cfg



