Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10328357 (target branch: master)
@@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel):
                 mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         ])
-        self.infer_result = {'vid': [], 'sid': [], 'pred': []}
         sampling_method = self.cfg.dataset.sampling_method.name
         self.neighbor_size = self.cfg.dataset.sampling_method.params[
             sampling_method].neighbor_size
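Note: dropping self.infer_result here is more than a rename. A dict stored on the model instance keeps growing across calls, so running inference twice on the same object would mix predictions from two videos. A minimal sketch of the hazard (hypothetical classes, not code from this repo):

class Accumulating:
    def __init__(self):
        self.result = {'pred': []}            # shared across calls

    def infer(self, probs):
        self.result['pred'].extend(probs)     # stale entries survive
        return self.result

class Local:
    def infer(self, probs):
        result = {'pred': []}                 # fresh dict per call
        result['pred'].extend(probs)
        return result

m = Accumulating()
assert len(m.infer([0.1, 0.9])['pred']) == 2
assert len(m.infer([0.3])['pred']) == 3       # leftover state: 3, not 1

The hunks below switch the inference path to the Local pattern.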
@@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel):
         shot_num = len(sids)
         cnt = shot_num // bs + 1
+        infer_sid, infer_pred = [], []
+        infer_result = {}
         for i in range(cnt):
             start = i * bs
             end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
@@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel):
             input_ = torch.stack(input_)
             outputs = self.shared_step(input_)  # shape [b,2]
             prob = F.softmax(outputs, dim=1)
-            self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
-            self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
-        self.infer_result['pred'] = np.stack(self.infer_result['pred'])
+            infer_sid.extend(sid_.cpu().detach().numpy())
+            infer_pred.extend(prob[:, 1].cpu().detach().numpy())
+        infer_result.update({'pred': np.stack(infer_pred)})
+        infer_result.update({'sid': infer_sid})
-        assert len(self.infer_result['sid']) == len(sids)
-        assert len(self.infer_result['pred']) == len(inputs)
-        return self.infer_result
+        assert len(infer_result['sid']) == len(sids)
+        assert len(infer_result['pred']) == len(inputs)
+        return infer_result
     def shared_step(self, inputs):
         with torch.no_grad():
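Note: stitched together, the batched inference loop after this change reads roughly as follows. The slicing of sid_ and input_ between the two hunks is elided in the diff, so those two lines are an assumption; shared_step, bs, sids and inputs come from the surrounding file, which imports numpy as np and torch.nn.functional as F:

        infer_sid, infer_pred = [], []
        infer_result = {}
        for i in range(cnt):
            start = i * bs
            end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
            sid_ = sids[start:end]                  # assumed slicing
            input_ = torch.stack(inputs[start:end])  # assumed slicing
            outputs = self.shared_step(input_)      # shape [b,2]
            prob = F.softmax(outputs, dim=1)        # column 1 = boundary probability
            infer_sid.extend(sid_.cpu().detach().numpy())
            infer_pred.extend(prob[:, 1].cpu().detach().numpy())
        infer_result.update({'pred': np.stack(infer_pred)})
        infer_result.update({'sid': infer_sid})

One nit worth flagging: cnt = shot_num // bs + 1 yields an empty final batch whenever shot_num is an exact multiple of bs, and torch.stack on an empty batch raises; if the elided lines do not guard against that, math.ceil(shot_num / bs) would be safer.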
@@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel):
         thres = self.cfg.pipeline.save_threshold
         anno_dict = get_pred_boundary(pred_dict, thres)
-        scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict)
+        scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
+            self.shot2keyf, anno_dict)
         if self.cfg.pipeline.save_split_scene:
             re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
             print(f'Split scene video saved to {re_dir}')
-        return len(scene_list), scene_dict_lst
+        return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
     def preprocess(self, inputs):
         logger.info('Begin shot detect......')
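Note: get_pred_boundary is not touched by this review, but its role is visible at the call site above: it converts the per-shot boundary probabilities assembled by the inference loop into binary boundary decisions at cfg.pipeline.save_threshold. A hypothetical sketch of that step (function body and dict layout assumed, not taken from the repo):

import numpy as np

def get_pred_boundary_sketch(pred_dict, thres):
    # pred_dict carries 'sid' and 'pred' as built by the inference loop;
    # a shot whose boundary probability exceeds thres closes a scene.
    preds = np.asarray(pred_dict['pred'])
    return {int(sid): int(p > thres)
            for sid, p in zip(pred_dict['sid'], preds)}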
@@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict):
     scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)
     scene_dict_lst = []
+    shot_num = len(shot2keyf)
+    shot_dict_lst = []
+    for item in shot2keyf:
+        tmp = item.split(' ')
+        shot_dict_lst.append({
+            'frame': [tmp[0], tmp[1]],
+            'timestamps': [tmp[-2], tmp[-1]]
+        })
     assert len(scene_list) == len(pair_list)
     for scene_ind, scene_item in enumerate(scene_list):
         scene_dict_lst.append({
             'shot': pair_list[scene_ind],
             'frame': scene_item[0],
-            'timestamp': scene_item[1]
+            'timestamps': scene_item[1]
         })
-    return scene_dict_lst, scene_list
+    return scene_dict_lst, scene_list, shot_num, shot_dict_lst
 def scene2video(source_movie_fn, scene_list, thres):
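Note: the new shot_dict_lst loop indexes each shot2keyf record purely by position: the first two space-separated fields are the start/end frames, the last two are the start/end timestamps. An illustrative record (four fields here; real records may carry extra fields in between, which the indexing skips):

item = '0 120 00:00:00.000 00:00:05.005'   # hypothetical shot record
tmp = item.split(' ')
shot = {'frame': [tmp[0], tmp[1]], 'timestamps': [tmp[-2], tmp[-1]]}
# {'frame': ['0', '120'], 'timestamps': ['00:00:00.000', '00:00:05.005']}

All values stay strings after split; consumers that want numeric frames or seconds must convert downstream.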
@@ -38,8 +38,10 @@ class OutputKeys(object):
     KWS_LIST = 'kws_list'
     HISTORY = 'history'
     TIMESTAMPS = 'timestamps'
-    SPLIT_VIDEO_NUM = 'split_video_num'
-    SPLIT_META_LIST = 'split_meta_list'
+    SHOT_NUM = 'shot_num'
+    SCENE_NUM = 'scene_num'
+    SCENE_META_LIST = 'scene_meta_list'
+    SHOT_META_LIST = 'shot_meta_list'
@@ -309,19 +311,30 @@ TASK_OUTPUTS = {
     Tasks.shop_segmentation: [OutputKeys.MASKS],

     # movie scene segmentation result for a single video
     # {
-    #       "split_video_num":3,
-    #       "split_meta_list":
+    #       "shot_num":15,
+    #       "shot_meta_list":
     #       [
     #          {
+    #               "frame": [start_frame, end_frame],
+    #               "timestamps": [start_timestamp, end_timestamp]  # ['00:00:01.133', '00:00:02.245']
+    #
+    #          }
+    #       ]
+    #       "scene_num":3,
+    #       "scene_meta_list":
+    #       [
+    #          {
     #               "shot": [0,1,2],
     #               "frame": [start_frame, end_frame],
-    #               "timestamp": [start_timestamp, end_timestamp]  # ['00:00:01.133', '00:00:02.245']
+    #               "timestamps": [start_timestamp, end_timestamp]  # ['00:00:01.133', '00:00:02.245']
     #          }
     #       ]
     #
     # }
-    Tasks.movie_scene_segmentation:
-    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],
+    Tasks.movie_scene_segmentation: [
+        OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM,
+        OutputKeys.SCENE_META_LIST
+    ],
     # ============ nlp tasks ===================
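Note: under the new schema a movie_scene_segmentation result looks like the literal below (values invented for illustration; the key names and nesting follow the comment block above):

sample_output = {
    'shot_num': 15,
    'shot_meta_list': [
        {'frame': ['0', '120'],
         'timestamps': ['00:00:00.000', '00:00:05.005']},
        # ... one entry per detected shot
    ],
    'scene_num': 3,
    'scene_meta_list': [
        {'shot': [0, 1, 2],
         'frame': [0, 360],
         'timestamps': ['00:00:00.000', '00:00:15.015']},
        # ... one entry per predicted scene
    ],
}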
@@ -60,9 +60,12 @@ class MovieSceneSegmentationPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         data = {'input_video_pth': self.input_video_pth, 'feat': inputs}
-        video_num, meta_lst = self.model.postprocess(data)
+        scene_num, scene_meta_lst, shot_num, shot_meta_lst = self.model.postprocess(
+            data)
         result = {
-            OutputKeys.SPLIT_VIDEO_NUM: video_num,
-            OutputKeys.SPLIT_META_LIST: meta_lst
+            OutputKeys.SHOT_NUM: shot_num,
+            OutputKeys.SHOT_META_LIST: shot_meta_lst,
+            OutputKeys.SCENE_NUM: scene_num,
+            OutputKeys.SCENE_META_LIST: scene_meta_lst
         }
         return result
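Note: end to end, callers pick up the new keys roughly like this (a sketch; the model id is a placeholder for whatever movie-scene-segmentation model is published on ModelScope, and 'movie.mp4' is a local test file):

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

segmenter = pipeline(
    Tasks.movie_scene_segmentation,
    model='<movie-scene-segmentation-model-id>')  # placeholder id
result = segmenter('movie.mp4')

print(result[OutputKeys.SHOT_NUM], 'shots,',
      result[OutputKeys.SCENE_NUM], 'scenes')
for scene in result[OutputKeys.SCENE_META_LIST]:
    print(scene['shot'], scene['timestamps'])

Since SPLIT_VIDEO_NUM and SPLIT_META_LIST are deleted outright, any downstream code still reading them breaks: the rename of this task's output contract is intentionally backwards-incompatible.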