- import os.path as osp
- from typing import Any, Dict
-
- import cv2
- import numpy as np
- import PIL
- import tensorflow as tf
-
- from modelscope.metainfo import Pipelines
- from modelscope.outputs import OutputKeys
- from modelscope.pipelines.base import Input
- from modelscope.preprocessors import load_image
- from modelscope.utils.constant import ModelFile, Tasks
- from modelscope.utils.logger import get_logger
- from ..base import Pipeline
- from ..builder import PIPELINES
- from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils
-
- if tf.__version__ >= '2.0':
-     import tf_slim as slim
-     tf = tf.compat.v1
-     tf.disable_eager_execution()
- else:
-     from tensorflow.contrib import slim
-
- logger = get_logger()
-
- # constants
- RBOX_DIM = 5
- OFFSET_DIM = 6
- WORD_POLYGON_DIM = 8
- OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
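- # a rotated box (rbox) is parameterized by 5 values and a word polygon by
- # 8 (four x/y corner pairs); OFFSET_DIM matches the 6-element
- # OFFSET_VARIANCE applied to the regression maps below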
-
- FLAGS = tf.app.flags.FLAGS
- tf.app.flags.DEFINE_float('node_threshold', 0.4,
-                           'Confidence threshold for nodes')
- tf.app.flags.DEFINE_float('link_threshold', 0.6,
-                           'Confidence threshold for links')
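- # the thresholds are read through FLAGS, presumably inside
- # ops.decode_segments_links_python when filtering node/link probabilities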
-
-
- @PIPELINES.register_module(
-     Tasks.ocr_detection, module_name=Pipelines.ocr_detection)
- class OCRDetectionPipeline(Pipeline):
-
-     def __init__(self, model: str):
-         """Use `model` to create an OCR detection pipeline for prediction.
-
-         Args:
-             model: model id on modelscope hub.
-         """
-         super().__init__(model=model)
-         tf.reset_default_graph()
-         model_path = osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER,
-                               'checkpoint-80000')
-
-         config = tf.ConfigProto(allow_soft_placement=True)
-         config.gpu_options.allow_growth = True
-         self._session = tf.Session(config=config)
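-         # the graph is built for a single fixed 1024x1024 RGB image;
-         # preprocess() pads and resizes every input to exactly this shape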
-         self.input_images = tf.placeholder(
-             tf.float32, shape=[1, 1024, 1024, 3], name='input_images')
-         self.output = {}
-
-         with tf.variable_scope('', reuse=tf.AUTO_REUSE):
-             global_step = tf.get_variable(
-                 'global_step', [],
-                 initializer=tf.constant_initializer(0),
-                 dtype=tf.int64,
-                 trainable=False)
-             variable_averages = tf.train.ExponentialMovingAverage(
-                 0.997, global_step)
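-             # the EMA object is created only to build the shadow-variable
-             # name map; variables_to_restore() lets the Saver below load the
-             # averaged weights from the checkpoint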
-
-             # detector: builds per-layer (cls, link, reg) prediction maps
-             detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector()
-             all_maps = detector.build_model(
-                 self.input_images, is_training=False)
-
-             # decode local predictions
-             all_nodes, all_links, all_reg = [], [], []
-             for maps in all_maps:
-                 cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2]
-                 reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE)
-
-                 cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2]))
-
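-                 # the 4 link channels form two independent 2-way softmaxes,
-                 # giving separate 'pos' and 'mut' (mutex) link probabilities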
-                 lnk_prob_pos = tf.nn.softmax(
-                     tf.reshape(lnk_maps, [-1, 4])[:, :2])
-                 lnk_prob_mut = tf.nn.softmax(
-                     tf.reshape(lnk_maps, [-1, 4])[:, 2:])
-                 lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1)
-
-                 all_nodes.append(cls_prob)
-                 all_links.append(lnk_prob)
-                 all_reg.append(reg_maps)
-
-             # decode segments and links
-             image_size = tf.shape(self.input_images)[1:3]
-             segments, group_indices, segment_counts, _ = ops.decode_segments_links_python(
-                 image_size,
-                 all_nodes,
-                 all_links,
-                 all_reg,
-                 anchor_sizes=list(detector.anchor_sizes))
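-             # linked segments share a group index, so they can be merged
-             # into complete text lines below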
-
-             # combine grouped segments into whole-line rotated boxes
-             combined_rboxes, combined_counts = ops.combine_segments_python(
-                 segments, group_indices, segment_counts)
-             self.output['combined_rboxes'] = combined_rboxes
-             self.output['combined_counts'] = combined_counts
-
-         with self._session.as_default() as sess:
-             logger.info(f'loading model from {model_path}')
-             # restore the EMA shadow weights from the checkpoint
-             model_loader = tf.train.Saver(
-                 variable_averages.variables_to_restore())
-             model_loader.restore(sess, model_path)
-
-     def preprocess(self, input: Input) -> Dict[str, Any]:
-         if isinstance(input, str):
-             img = np.array(load_image(input))
-         elif isinstance(input, PIL.Image.Image):
-             img = np.array(input.convert('RGB'))
-         elif isinstance(input, np.ndarray):
-             if len(input.shape) == 2:
-                 input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
-             img = input[:, :, ::-1]  # ndarray input is assumed BGR; flip to RGB
-         else:
-             raise TypeError(f'input should be either str, PIL.Image or'
-                             f' np.ndarray, but got {type(input)}')
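-         # pad to a square canvas so the fixed-size resize below preserves
-         # the aspect ratio of the original content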
-         h, w, c = img.shape
-         img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
-         img_pad[:h, :w, :] = img
-
-         resize_size = 1024
-         img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
-         img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
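-         # subtract per-channel means (the standard ImageNet/VGG values),
-         # presumably matching the normalization used at training time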
-         img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94],
-                                                    dtype=np.float32)
-
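-         # the sizes are stored as graph constants so that forward() fetches
-         # them together with the detection outputs in a single session.run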
-         resize_size = tf.stack([resize_size, resize_size])
-         orig_size = tf.stack([max(h, w), max(h, w)])
-         self.output['orig_size'] = orig_size
-         self.output['resize_size'] = resize_size
-
-         result = {'img': np.expand_dims(img_pad_resize, axis=0)}
-         return result
-
-     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-         with self._session.as_default():
-             feed_dict = {self.input_images: input['img']}
-             sess_outputs = self._session.run(self.output, feed_dict=feed_dict)
-             return sess_outputs
-
-     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-         rboxes = inputs['combined_rboxes'][0]
-         count = inputs['combined_counts'][0]
-         rboxes = rboxes[:count, :]
-
-         # convert rboxes to polygons and map them back onto the original image
-         orig_h, orig_w = inputs['orig_size']
-         resize_h, resize_w = inputs['resize_size']
-         polygons = utils.rboxes_to_polygons(rboxes)
-         scale_y = float(orig_h) / float(resize_h)
-         scale_x = float(orig_w) / float(resize_w)
-
-         # clip polygon coordinates to the original image bounds
-         polygons[:, ::2] = np.maximum(
-             0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
-         polygons[:, 1::2] = np.maximum(
-             0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
-         polygons = np.round(polygons).astype(np.int32)
-
-         # nms: append each polygon's width (utils.cal_width) as a 9th value,
-         # run suppression, then keep only the 8 corner coordinates
-         dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()]
-         dt_nms = utils.nms_python(dt_n9)
-         dt_polygons = np.array([o[:8] for o in dt_nms])
-
-         result = {OutputKeys.POLYGONS: dt_polygons}
-         return result