- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import cv2
- import matplotlib.pyplot as plt
- import numpy as np
-
- from modelscope.outputs import OutputKeys
- from modelscope.preprocessors.image import load_image
-
-
- def numpy_to_cv2img(img_array):
- """to convert a np.array with shape(h, w) to cv2 img
-
- Args:
- img_array (np.array): input data
-
- Returns:
- cv2 img
- """
- img_array = (img_array - img_array.min()) / (
- img_array.max() - img_array.min() + 1e-5)
- img_array = (img_array * 255).astype(np.uint8)
- img_array = cv2.applyColorMap(img_array, cv2.COLORMAP_JET)
- return img_array
-
-
- def draw_joints(image, np_kps, score, threshold=0.2):
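- """Draw a colour-coded skeleton on the image in place.
-
- Supports 15- and 17-keypoint layouts. Links between left-side keypoints are
- drawn in green, right-side links in blue and the remaining links in yellow
- (BGR); keypoints are drawn as red dots. Points whose score is below
- threshold are skipped.
- """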
- lst_parent_ids_17 = [0, 0, 0, 1, 2, 0, 0, 5, 6, 7, 8, 5, 6, 11, 12, 13, 14]
- lst_left_ids_17 = [1, 3, 5, 7, 9, 11, 13, 15]
- lst_right_ids_17 = [2, 4, 6, 8, 10, 12, 14, 16]
-
- lst_parent_ids_15 = [0, 0, 1, 2, 3, 1, 5, 6, 14, 8, 9, 14, 11, 12, 1]
- lst_left_ids_15 = [2, 3, 4, 8, 9, 10]
- lst_right_ids_15 = [5, 6, 7, 11, 12, 13]
-
- if np_kps.shape[0] == 17:
- lst_parent_ids = lst_parent_ids_17
- lst_left_ids = lst_left_ids_17
- lst_right_ids = lst_right_ids_17
-
- elif np_kps.shape[0] == 15:
- lst_parent_ids = lst_parent_ids_15
- lst_left_ids = lst_left_ids_15
- lst_right_ids = lst_right_ids_15
- else:
- # guard against unexpected keypoint layouts instead of failing with a NameError
- raise ValueError(
- f'Unsupported number of keypoints: {np_kps.shape[0]}, expected 15 or 17')
-
- for i in range(len(lst_parent_ids)):
- pid = lst_parent_ids[i]
- if i == pid:
- continue
-
- if score[i] < threshold or score[pid] < threshold:
- continue
-
- if i in lst_left_ids and pid in lst_left_ids:
- color = (0, 255, 0)
- elif i in lst_right_ids and pid in lst_right_ids:
- color = (255, 0, 0)
- else:
- color = (0, 255, 255)
-
- cv2.line(image, (int(np_kps[i, 0]), int(np_kps[i, 1])),
- (int(np_kps[pid, 0]), int(np_kps[pid, 1])), color, 3)
-
- for i in range(np_kps.shape[0]):
- if score[i] < threshold:
- continue
- cv2.circle(image, (int(np_kps[i, 0]), int(np_kps[i, 1])), 5,
- (0, 0, 255), -1)
-
-
- def draw_box(image, box):
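- """Draw a red (BGR) [x1, y1, x2, y2] bounding box on the image in place."""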
- cv2.rectangle(image, (int(box[0]), int(box[1])),
- (int(box[2]), int(box[3])), (0, 0, 255), 2)
-
-
- def realtime_object_detection_bbox_vis(image, bboxes):
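- """Draw every detection box ([x1, y1, x2, y2] each) in blue and return the image."""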
- for bbox in bboxes:
- cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
- (255, 0, 0), 2)
- return image
-
-
- def draw_keypoints(output, original_image):
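- """Visualize a body keypoint pipeline output.
-
- Reads the image from the given path and, for every detected person, draws
- the bounding box and the skeleton found under OutputKeys.KEYPOINTS,
- OutputKeys.SCORES and OutputKeys.BOXES.
- """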
- poses = np.array(output[OutputKeys.KEYPOINTS])
- scores = np.array(output[OutputKeys.SCORES])
- boxes = np.array(output[OutputKeys.BOXES])
- assert len(poses) == len(scores) and len(poses) == len(boxes)
- image = cv2.imread(original_image, -1)
- for i in range(len(poses)):
- draw_box(image, np.array(boxes[i]))
- draw_joints(image, np.array(poses[i]), np.array(scores[i]))
- return image
-
-
- def draw_106face_keypoints(in_path,
- keypoints,
- boxes,
- scale=4.0,
- save_path=None):
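- """Draw 106-point facial landmarks on the image at in_path.
-
- Detection boxes are drawn first, the image is then upscaled by scale, and the
- face contour, eyebrow, eye, nose and mouth polylines are drawn together with
- the index of every landmark. The result is optionally written to save_path
- and returned.
- """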
- face_contour_point_index = [
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
- ]
- left_eye_brow_point_index = [33, 34, 35, 36, 37, 38, 39, 40, 41, 33]
- right_eye_brow_point_index = [42, 43, 44, 45, 46, 47, 48, 49, 50, 42]
- left_eye_point_index = [66, 67, 68, 69, 70, 71, 72, 73, 66]
- right_eye_point_index = [75, 76, 77, 78, 79, 80, 81, 82, 75]
- nose_bridge_point_index = [51, 52, 53, 54]
- nose_contour_point_index = [55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]
- mouth_outer_point_index = [
- 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 84
- ]
- mouth_inter_point_index = [96, 97, 98, 99, 100, 101, 102, 103, 96]
-
- img = cv2.imread(in_path)
-
- for i in range(len(boxes)):
- draw_box(img, np.array(boxes[i]))
-
- image = cv2.resize(img, dsize=None, fx=scale, fy=scale)
-
- def draw_line(point_index, image, point):
- for i in range(len(point_index) - 1):
- cur_index = point_index[i]
- next_index = point_index[i + 1]
- cur_pt = (int(point[cur_index][0] * scale),
- int(point[cur_index][1] * scale))
- next_pt = (int(point[next_index][0] * scale),
- int(point[next_index][1] * scale))
- cv2.line(image, cur_pt, next_pt, (0, 0, 255), thickness=2)
-
- for i in range(len(keypoints)):
- points = keypoints[i]
-
- draw_line(face_contour_point_index, image, points)
- draw_line(left_eye_brow_point_index, image, points)
- draw_line(right_eye_brow_point_index, image, points)
- draw_line(left_eye_point_index, image, points)
- draw_line(right_eye_point_index, image, points)
- draw_line(nose_bridge_point_index, image, points)
- draw_line(nose_contour_point_index, image, points)
- draw_line(mouth_outer_point_index, image, points)
- draw_line(mouth_inter_point_index, image, points)
-
- size = len(points)
- for j in range(size):
- x = int(points[j][0])
- y = int(points[j][1])
- cv2.putText(image, str(j), (int(x * scale), int(y * scale)),
- cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
- cv2.circle(image, (int(x * scale), int(y * scale)), 2, (0, 255, 0),
- cv2.FILLED)
-
- if save_path is not None:
- cv2.imwrite(save_path, image)
-
- return image
-
-
- def draw_face_detection_no_lm_result(img_path, detection_result):
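- """Draw face detection boxes and scores (no landmarks) and return the image."""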
- bboxes = np.array(detection_result[OutputKeys.BOXES])
- scores = np.array(detection_result[OutputKeys.SCORES])
- img = cv2.imread(img_path)
- assert img is not None, f"Can't read img: {img_path}"
- for i in range(len(scores)):
- bbox = bboxes[i].astype(np.int32)
- x1, y1, x2, y2 = bbox
- score = scores[i]
- cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
- cv2.putText(
- img,
- f'{score:.2f}', (x1, y2),
- 1,
- 1.0, (0, 255, 0),
- thickness=1,
- lineType=8)
- print(f'Found {len(scores)} faces')
- return img
-
-
- def draw_facial_expression_result(img_path, facial_expression_result):
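- """Annotate the image with the facial expression label that has the highest score."""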
- scores = facial_expression_result[OutputKeys.SCORES]
- labels = facial_expression_result[OutputKeys.LABELS]
- label = labels[np.argmax(scores)]
- img = cv2.imread(img_path)
- assert img is not None, f"Can't read img: {img_path}"
- cv2.putText(
- img,
- 'facial expression: {}'.format(label), (10, 10),
- 1,
- 1.0, (0, 255, 0),
- thickness=1,
- lineType=8)
- print('facial expression: {}'.format(label))
- return img
-
-
- def draw_face_detection_result(img_path, detection_result):
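- """Draw face boxes, facial landmarks and detection scores and return the image."""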
- bboxes = np.array(detection_result[OutputKeys.BOXES])
- kpss = np.array(detection_result[OutputKeys.KEYPOINTS])
- scores = np.array(detection_result[OutputKeys.SCORES])
- img = cv2.imread(img_path)
- assert img is not None, f"Can't read img: {img_path}"
- for i in range(len(scores)):
- bbox = bboxes[i].astype(np.int32)
- kps = kpss[i].reshape(-1, 2).astype(np.int32)
- score = scores[i]
- x1, y1, x2, y2 = bbox
- cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
- for kp in kps:
- cv2.circle(img, tuple(kp), 1, (0, 0, 255), 1)
- cv2.putText(
- img,
- f'{score:.2f}', (x1, y2),
- 1,
- 1.0, (0, 255, 0),
- thickness=1,
- lineType=8)
- print(f'Found {len(scores)} faces')
- return img
-
-
- def draw_card_detection_result(img_path, detection_result):
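- """Visualize a card detection result.
-
- Draws the box, the four corner points and the score for every detected card,
- and additionally warps each card to a fronto-parallel view. The 1.59 aspect
- ratio presumably corresponds to a standard ID-1 card. Returns a list with
- the annotated image followed by the warped card crops.
- """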
-
- def warp_img(src_img, kps, ratio):
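- # Perspective-warp the quadrilateral given by the four corner points to an
- # axis-aligned rectangle with the requested aspect ratio.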
- short_size = 500
- if ratio > 1:
- obj_h = short_size
- obj_w = int(obj_h * ratio)
- else:
- obj_w = short_size
- obj_h = int(obj_w / ratio)
- input_pts = np.float32([kps[0], kps[1], kps[2], kps[3]])
- output_pts = np.float32([[0, obj_h - 1], [0, 0], [obj_w - 1, 0],
- [obj_w - 1, obj_h - 1]])
- M = cv2.getPerspectiveTransform(input_pts, output_pts)
- obj_img = cv2.warpPerspective(src_img, M, (obj_w, obj_h))
- return obj_img
-
- bboxes = np.array(detection_result[OutputKeys.BOXES])
- kpss = np.array(detection_result[OutputKeys.KEYPOINTS])
- scores = np.array(detection_result[OutputKeys.SCORES])
- img_list = []
- ver_col = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (0, 255, 255)]
- img = cv2.imread(img_path)
- assert img is not None, f"Can't read img: {img_path}"
- img_list += [img]
- for i in range(len(scores)):
- bbox = bboxes[i].astype(np.int32)
- kps = kpss[i].reshape(-1, 2).astype(np.int32)
- _w = (kps[0][0] - kps[3][0])**2 + (kps[0][1] - kps[3][1])**2
- _h = (kps[0][0] - kps[1][0])**2 + (kps[0][1] - kps[1][1])**2
- ratio = 1.59 if _w >= _h else 1 / 1.59
- card_img = warp_img(img, kps, ratio)
- img_list += [card_img]
- score = scores[i]
- x1, y1, x2, y2 = bbox
- cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 4)
- for k, kp in enumerate(kps):
- cv2.circle(img, tuple(kp), 1, color=ver_col[k], thickness=10)
- cv2.putText(
- img,
- f'{score:.2f}', (x1, y2),
- 1,
- 1.0, (0, 255, 0),
- thickness=1,
- lineType=8)
- return img_list
-
-
- def created_boxed_image(image_in, box):
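- """Load an image, convert it to BGR and draw a green [x1, y1, x2, y2] box."""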
- image = load_image(image_in)
- img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
- cv2.rectangle(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
- (0, 255, 0), 3)
- return img
-
-
- def show_video_tracking_result(video_in_path, bboxes, video_save_path):
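- """Overlay the per-frame tracking boxes on the input video and save it as an MJPG video."""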
- cap = cv2.VideoCapture(video_in_path)
- for i in range(len(bboxes)):
- box = bboxes[i]
- success, frame = cap.read()
- if success is False:
- raise Exception(video_in_path,
- ' can not be correctly decoded by OpenCV.')
- if i == 0:
- size = (frame.shape[1], frame.shape[0])
- fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
- video_writer = cv2.VideoWriter(video_save_path, fourcc,
- cap.get(cv2.CAP_PROP_FPS), size,
- True)
- cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0),
- 5)
- video_writer.write(frame)
- video_writer.release()
- cap.release()
-
-
- def show_video_object_detection_result(video_in_path, bboxes_list, labels_list,
- video_save_path):
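- """Overlay per-frame detection boxes and labels on the input video.
-
- Each box is drawn in its class colour from PALETTE together with the label
- text and a semi-transparent fill, and the annotated frames are written to
- video_save_path as an MJPG video.
- """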
-
- PALETTE = {
- 'person': [128, 0, 0],
- 'bicycle': [128, 128, 0],
- 'car': [64, 0, 0],
- 'motorcycle': [0, 128, 128],
- 'bus': [64, 128, 0],
- 'truck': [192, 128, 0],
- 'traffic light': [64, 0, 128],
- 'stop sign': [192, 0, 128],
- }
- from tqdm import tqdm
- import math
- cap = cv2.VideoCapture(video_in_path)
- with tqdm(total=len(bboxes_list)) as pbar:
- pbar.set_description(
- 'Writing results to video: {}'.format(video_save_path))
- for i in range(len(bboxes_list)):
- bboxes = bboxes_list[i].astype(int)
- labels = labels_list[i]
- success, frame = cap.read()
- if success is False:
- raise Exception(video_in_path,
- ' can not be correctly decoded by OpenCV.')
- if i == 0:
- size = (frame.shape[1], frame.shape[0])
- fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
- video_writer = cv2.VideoWriter(video_save_path, fourcc,
- cap.get(cv2.CAP_PROP_FPS), size,
- True)
-
- FONT_SCALE = 1e-3 # Adjust for larger font size in all images
- THICKNESS_SCALE = 1e-3 # Adjust for larger thickness in all images
- TEXT_Y_OFFSET_SCALE = 1e-2 # Adjust for larger Y-offset of text and bounding box
- H, W, _ = frame.shape
- zeros_mask = np.zeros((frame.shape)).astype(np.uint8)
- for bbox, l in zip(bboxes, labels):
- cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
- PALETTE[l], 1)
- cv2.putText(
- frame,
- l, (bbox[0], bbox[1] - int(TEXT_Y_OFFSET_SCALE * H)),
- fontFace=cv2.FONT_HERSHEY_TRIPLEX,
- fontScale=min(H, W) * FONT_SCALE,
- thickness=math.ceil(min(H, W) * THICKNESS_SCALE),
- color=PALETTE[l])
- zeros_mask = cv2.rectangle(
- zeros_mask, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
- color=PALETTE[l],
- thickness=-1)
-
- frame = cv2.addWeighted(frame, 1., zeros_mask, .65, 0)
- video_writer.write(frame)
- pbar.update(1)
- video_writer.release()
- cap.release()
-
-
- def panoptic_seg_masks_to_image(masks):
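- """Render panoptic instance masks to a single colour image.
-
- Each mask is filled with a distinct colour from the mmdet COCO palette.
- """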
- draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
- from mmdet.core.visualization.palette import get_palette
- mask_palette = get_palette('coco', 133)
-
- from mmdet.core.visualization.image import _get_bias_color
- taken_colors = {(0, 0, 0)}  # store colours as tuples so the membership check below works
- for i, mask in enumerate(masks):
- color_mask = mask_palette[i]
- while tuple(color_mask) in taken_colors:
- color_mask = _get_bias_color(color_mask)
- taken_colors.add(tuple(color_mask))
-
- mask = mask.astype(bool)
- draw_img[mask] = color_mask
-
- return draw_img
-
-
- def semantic_seg_masks_to_image(masks):
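- """Render semantic segmentation masks to a single colour image using the mmdet COCO palette."""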
- from mmdet.core.visualization.palette import get_palette
- mask_palette = get_palette('coco', 133)
-
- draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
-
- for i, mask in enumerate(masks):
- color_mask = mask_palette[i]
- mask = mask.astype(bool)
- draw_img[mask] = color_mask
- return draw_img
-
-
- def show_video_summarization_result(video_in_path, result, video_save_path):
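- """Write only the frames selected by the summarization result (flag == 1) to a new MJPG video."""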
- frame_indexes = result[OutputKeys.OUTPUT]
- cap = cv2.VideoCapture(video_in_path)
- for i in range(len(frame_indexes)):
- idx = frame_indexes[i]
- success, frame = cap.read()
- if success is False:
- raise Exception(video_in_path,
- ' can not be correctly decoded by OpenCV.')
- if i == 0:
- size = (frame.shape[1], frame.shape[0])
- fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
- video_writer = cv2.VideoWriter(video_save_path, fourcc,
- cap.get(cv2.CAP_PROP_FPS), size,
- True)
- if idx == 1:
- video_writer.write(frame)
- video_writer.release()
- cap.release()
-
-
- def show_image_object_detection_auto_result(img_path,
- detection_result,
- save_path=None):
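- """Draw detection boxes, scores and labels on the image; optionally save it to save_path."""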
- scores = detection_result[OutputKeys.SCORES]
- labels = detection_result[OutputKeys.LABELS]
- bboxes = detection_result[OutputKeys.BOXES]
- img = cv2.imread(img_path)
- assert img is not None, f"Can't read img: {img_path}"
-
- for (score, label, box) in zip(scores, labels, bboxes):
- cv2.rectangle(img, (int(box[0]), int(box[1])),
- (int(box[2]), int(box[3])), (0, 0, 255), 2)
- cv2.putText(
- img,
- f'{score:.2f}', (int(box[0]), int(box[1])),
- 1,
- 1.0, (0, 255, 0),
- thickness=1,
- lineType=8)
- cv2.putText(
- img,
- label, (int((box[0] + box[2]) * 0.5), int(box[1])),
- 1,
- 1.0, (0, 255, 0),
- thickness=1,
- lineType=8)
-
- if save_path is not None:
- cv2.imwrite(save_path, img)
- return img
-
-
- def depth_to_color(depth):
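- """Map a depth array to a BGR image with the plasma colormap (nearer pixels appear brighter)."""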
- colormap = plt.get_cmap('plasma')
- depth_color = (colormap(
- (depth.max() - depth) / depth.max()) * 2**8).astype(np.uint8)[:, :, :3]
- depth_color = cv2.cvtColor(depth_color, cv2.COLOR_RGB2BGR)
- return depth_color