diff --git a/modelscope/models/cv/face_emotion/emotion_infer.py b/modelscope/models/cv/face_emotion/emotion_infer.py
index e3398592..618822ff 100644
--- a/modelscope/models/cv/face_emotion/emotion_infer.py
+++ b/modelscope/models/cv/face_emotion/emotion_infer.py
@@ -25,9 +25,9 @@ emotion_list = [
 ]
 
 
-def inference(image_path, model, face_model, score_thre=0.5, GPU=0):
-    image = Image.open(image_path).convert('RGB')
-
+def inference(image, model, face_model, score_thre=0.5, GPU=0):
+    image = image.cpu().numpy()
+    image = Image.fromarray(image)
     face, bbox = face_detection_PIL_v2(image, face_model)
     if bbox is None:
         logger.warn('no face detected!')
diff --git a/modelscope/models/cv/face_human_hand_detection/det_infer.py b/modelscope/models/cv/face_human_hand_detection/det_infer.py
index 7a7225ee..6822bd9f 100644
--- a/modelscope/models/cv/face_human_hand_detection/det_infer.py
+++ b/modelscope/models/cv/face_human_hand_detection/det_infer.py
@@ -115,9 +115,9 @@ std = [57.375, 57.12, 58.395]
 class_names = ['person', 'face', 'hand']
 
 
-def inference(model, device, img_path):
+def inference(model, device, img):
+    img = img.cpu().numpy()
     img_info = {'id': 0}
-    img = cv2.imread(img_path)
     height, width = img.shape[:2]
     img_info['height'] = height
     img_info['width'] = width
@@ -130,4 +130,9 @@ def inference(model, device, img_path):
     with torch.no_grad():
         res = model(meta)
     result = overlay_bbox_cv(res[0], class_names, score_thresh=0.35)
-    return result
+    cls_list, bbox_list, score_list = [], [], []
+    for pred in result:
+        cls_list.append(pred[0])
+        bbox_list.append([pred[1], pred[2], pred[3], pred[4]])
+        score_list.append(pred[5])
+    return cls_list, bbox_list, score_list
diff --git a/modelscope/models/cv/hand_static/hand_model.py b/modelscope/models/cv/hand_static/hand_model.py
index 38517307..7a8a323e 100644
--- a/modelscope/models/cv/hand_static/hand_model.py
+++ b/modelscope/models/cv/hand_static/hand_model.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn.functional as F
 from PIL import Image
 from torch import nn
-from torchvision.transforms import transforms
+from torchvision import transforms
 
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
@@ -80,9 +80,9 @@ class HandStatic(TorchModel):
         return pred_result
 
 
-def infer(img_path, model, device):
-
-    img = Image.open(img_path)
+def infer(img, model, device):
+    img = img.cpu().numpy()
+    img = Image.fromarray(img)
     clip = spatial_transform(img)
     clip = clip.unsqueeze(0).to(device).float()
     outputs = model(clip)
diff --git a/modelscope/models/cv/product_segmentation/seg_infer.py b/modelscope/models/cv/product_segmentation/seg_infer.py
index 876fac66..8814d619 100644
--- a/modelscope/models/cv/product_segmentation/seg_infer.py
+++ b/modelscope/models/cv/product_segmentation/seg_infer.py
@@ -59,9 +59,8 @@ mean, std = np.array([[[124.55, 118.90,
                         102.94]]]), np.array([[[56.77, 55.97, 57.50]]])
 
 
-def inference(model, device, input_path):
-    img = Image.open(input_path)
-    img = np.array(img.convert('RGB')).astype(np.float32)
+def inference(model, device, img):
+    img = img.cpu().numpy()
     img = (img - mean) / std
     img = cv2.resize(img, dsize=(448, 448), interpolation=cv2.INTER_LINEAR)
     img = torch.from_numpy(img)
diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py
index 721fb271..cbdeede4 100644
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -762,12 +762,13 @@ TASK_OUTPUTS = {
     # }
     Tasks.hand_static: [OutputKeys.OUTPUT],
 
-    # 'output': [
-    #     [2, 75, 287, 240, 510, 0.8335018754005432],
-    #     [1, 127, 83, 332, 366, 0.9175254702568054],
-    #     [0, 0, 0, 367, 639, 0.9693422317504883]]
+    # {'labels': [2, 1, 0],
+    #  'boxes': [[78, 282, 240, 504], [127, 87, 332, 370], [0, 0, 367, 639]],
+    #  'scores': [0.8202137351036072, 0.8987470269203186, 0.9679114818572998]
     # }
-    Tasks.face_human_hand_detection: [OutputKeys.OUTPUT],
+    Tasks.face_human_hand_detection: [
+        OutputKeys.LABELS, OutputKeys.BOXES, OutputKeys.SCORES
+    ],
 
     # {
     # {'output': 'Happiness', 'boxes': (203, 104, 663, 564)}
diff --git a/modelscope/pipelines/cv/face_emotion_pipeline.py b/modelscope/pipelines/cv/face_emotion_pipeline.py
index 249493b6..9d9aa6ee 100644
--- a/modelscope/pipelines/cv/face_emotion_pipeline.py
+++ b/modelscope/pipelines/cv/face_emotion_pipeline.py
@@ -1,11 +1,14 @@
 # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import Any, Dict
 
+import numpy as np
+
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.face_emotion import emotion_infer
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 
@@ -28,10 +31,11 @@ class FaceEmotionPipeline(Pipeline):
         logger.info('load model done')
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
-        return input
+        img = LoadImage.convert_to_ndarray(input['img_path'])
+        return img
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        result, bbox = emotion_infer.inference(input['img_path'], self.model,
+        result, bbox = emotion_infer.inference(input, self.model,
                                                self.face_model)
         return {OutputKeys.OUTPUT: result, OutputKeys.BOXES: bbox}
 
diff --git a/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
index d9f214c9..d41a14dd 100644
--- a/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
+++ b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
@@ -2,11 +2,14 @@
 
 from typing import Any, Dict
 
+import numpy as np
+
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.face_human_hand_detection import det_infer
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
@@ -29,14 +32,19 @@ class NanoDettForFaceHumanHandDetectionPipeline(Pipeline):
         logger.info('load model done')
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
-        return input
+        img = LoadImage.convert_to_ndarray(input['input_path'])
+        return img
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        result = det_infer.inference(self.model, self.device,
-                                     input['input_path'])
-        logger.info(result)
-        return {OutputKeys.OUTPUT: result}
+        cls_list, bbox_list, score_list = det_infer.inference(
+            self.model, self.device, input)
+        logger.info('%s %s %s', cls_list, bbox_list, score_list)
+        return {
+            OutputKeys.LABELS: cls_list,
+            OutputKeys.BOXES: bbox_list,
+            OutputKeys.SCORES: score_list
+        }
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         return inputs
diff --git a/modelscope/pipelines/cv/hand_static_pipeline.py b/modelscope/pipelines/cv/hand_static_pipeline.py
index 1219c873..c020b7aa 100644
--- a/modelscope/pipelines/cv/hand_static_pipeline.py
+++ b/modelscope/pipelines/cv/hand_static_pipeline.py
@@ -1,11 +1,14 @@
 # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import Any, Dict
 
+import numpy as np
+
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.hand_static import hand_model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
@@ -27,10 +30,11 @@ class HandStaticPipeline(Pipeline):
         logger.info('load model done')
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
-        return input
+        img = LoadImage.convert_to_ndarray(input['img_path'])
+        return img
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        result = hand_model.infer(input['img_path'], self.model, self.device)
+        result = hand_model.infer(input, self.model, self.device)
         return {OutputKeys.OUTPUT: result}
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/pipelines/cv/product_segmentation_pipeline.py b/modelscope/pipelines/cv/product_segmentation_pipeline.py
index 244b01d7..3b1b2381 100644
--- a/modelscope/pipelines/cv/product_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/product_segmentation_pipeline.py
@@ -2,11 +2,14 @@
 
 from typing import Any, Dict
 
+import numpy as np
+
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.product_segmentation import seg_infer
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
@@ -28,12 +31,13 @@ class F3NetForProductSegmentationPipeline(Pipeline):
         logger.info('load model done')
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
-        return input
+        img = LoadImage.convert_to_ndarray(input['input_path'])
+        img = img.astype(np.float32)
+        return img
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        mask = seg_infer.inference(self.model, self.device,
-                                   input['input_path'])
+        mask = seg_infer.inference(self.model, self.device, input)
        return {OutputKeys.MASKS: mask}
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
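Usage note (not part of the patch): a minimal sketch of how the face-human-hand detection pipeline could be called after this change. The model id and the local image path are placeholders; the `'input_path'` key mirrors what the new `preprocess` reads, and the output keys are the `LABELS`/`BOXES`/`SCORES` introduced above.

```python
# Minimal sketch, assuming a registered face-human-hand detection model.
# The model id and image path below are placeholders, not values from this PR.
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

detector = pipeline(
    Tasks.face_human_hand_detection,
    model='damo/cv_nanodet_face-human-hand-detection')  # hypothetical model id

result = detector({'input_path': 'data/test/images/test.jpg'})  # placeholder path

# class_names = ['person', 'face', 'hand'], so e.g. labels [2, 1, 0]
# correspond to hand, face, person.
print(result[OutputKeys.LABELS])   # per-detection class ids
print(result[OutputKeys.BOXES])    # [[x1, y1, x2, y2], ...]
print(result[OutputKeys.SCORES])   # per-detection confidence
```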