You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

nn_detect_ops.h 16 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef GE_OP_NN_DETECT_OPS_H_
  17. #define GE_OP_NN_DETECT_OPS_H_
  18. #include "graph/operator_reg.h"
  19. #include "graph/operator.h"
  20. namespace ge {
  21. /**
  22. *@brief Generates bounding boxes based on "rois" and "deltas". It is a customized FasterRcnn operator.
  23. *@par Inputs:
  24. * Two inputs, including: \n
  25. *@li rois: Regions of interest (ROIs) generated by the region proposal network (RPN). A 2D Tensor of type float16 or float32 with shape (N, 4). "N" indicates the number of ROIs, and the value "4" refers to "x0", "x1", "y0", and "y1".
  26. *@li deltas: Absolute variation between the ROIs generated by the RPN and ground truth boxes. A 2D Tensor of type float16 or float32 with shape (N, 4). "N" indicates the number of errors, and 4 indicates "dx", "dy", "dw", and "dh".
  27. *@par Attributes:
  28. *@li means: An optional list of floats. Defaults to [0.0, 0.0, 0.0, 0.0]. "deltas" = "deltas" x "stds" + "means".
  29. *@li stds: An optional list of floats. Defaults to [1.0, 1.0, 1.0, 1.0]. "deltas" = "deltas" x "stds" + "means".
  30. *@li max_shape: A required list of ints with shape [h, w], specifying the size of the image transferred to the network. Used to ensure that the bbox shape after conversion does not exceed "max_shape".
  31. *@li wh_ratio_clip: An optional float. Defaults to "0.016" (that is, 16/1000). The values of "dw" and "dh" fall within (-wh_ratio_clip, wh_ratio_clip).
  32. *@par Outputs:
  33. *bboxes: Bboxes generated based on "rois" and "deltas". Have the same format and type as "rois".
  34. */
  35. REG_OP(BoundingBoxDecode)
  36. .INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
  37. .INPUT(deltas, TensorType({DT_FLOAT16, DT_FLOAT}))
  38. .OUTPUT(bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
  39. .ATTR(means, ListFloat, {0.0, 0.0, 0.0, 0.0})
  40. .ATTR(stds, ListFloat, {1.0, 1.0, 1.0, 1.0})
  41. .REQUIRED_ATTR(max_shape, ListInt)
  42. .ATTR(wh_ratio_clip, Float, 0.016)
  43. .OP_END_FACTORY_REG(BoundingBoxDecode)
  44. /**
  45. *@brief Computes the coordinate variations between bboxes and ground truth boxes. It is a customized FasterRcnn operator.
  46. *@par Inputs:
  47. * Two inputs, including: \n
  48. *@li anchor_box: Anchor boxes. A 2D Tensor of type float16 or float32 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
  49. *@li ground_truth_box: Ground truth boxes. A 2D Tensor of type float16 or float32 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
  50. *@par Attributes:
  51. *@li means: An optional list of floats. Defaults to [0.0, 0.0, 0.0, 0.0]. "deltas" = "deltas" x "stds" + "means". NOTE(review): this formula is worded identically to the one on BoundingBoxDecode; confirm it is the intended relation for encoding.
  52. *@li stds: An optional list of floats. Defaults to [1.0, 1.0, 1.0, 1.0]. "deltas" = "deltas" x "stds" + "means".
  53. *@par Outputs:
  54. *delats: A 2D Tensor of type float16 or float32 with shape (N, 4), specifying the variations between all anchor boxes and ground truth boxes. The output is registered under the spelling "delats" (sic), so callers must use that exact name.
  55. */
  56. REG_OP(BoundingBoxEncode)
  57. .INPUT(anchor_box, TensorType({DT_FLOAT16, DT_FLOAT}))
  58. .INPUT(ground_truth_box, TensorType({DT_FLOAT16, DT_FLOAT}))
  59. .OUTPUT(delats, TensorType({DT_FLOAT16, DT_FLOAT}))
  60. .ATTR(means, ListFloat, {0.0, 0.0, 0.0, 0.0})
  61. .ATTR(stds, ListFloat, {1.0, 1.0, 1.0, 1.0})
  62. .OP_END_FACTORY_REG(BoundingBoxEncode)
  63. /**
  64. *@brief Judges whether the bounding box is valid. It is a customized FasterRcnn operator.
  65. *@par Inputs:
  66. * Two inputs, including: \n
  67. *@li bbox_tensor: Bounding box. A 2D Tensor of type float16 with shape (N, 4). "N" indicates the number of bounding boxes, the value "4" indicates "x0", "x1", "y0", and "y1".
  68. *@li img_metas: Valid boundary value of the image. A 1D Tensor of type float16 with shape (16,).
  69. *@par Outputs:
  70. *valid_tensor: A Tensor of type int8 with shape (N, 1), used as a boolean flag specifying whether an input anchor is in an image. "1" indicates valid, while "0" indicates invalid.
  71. *@attention Constraints:
  72. * 16 "img_metas" are input. The first three numbers (height, width, ratio) are valid, specifying the valid boundary (height x ratio, width x ratio).
  73. */
  74. REG_OP(CheckValid)
  75. .INPUT(bbox_tensor, TensorType({DT_FLOAT16}))
  76. .INPUT(img_metas, TensorType({DT_FLOAT16}))
  77. .OUTPUT(valid_tensor, TensorType({DT_INT8}))
  78. .OP_END_FACTORY_REG(CheckValid)
  79. /**
  80. *@brief Computes the intersection over union (iou) or the intersection over foreground (iof) based on the ground-truth and predicted regions.
  81. *@par Inputs:
  82. * Two inputs, including: \n
  83. *@li bboxes: Bounding boxes, a 2D Tensor with shape (N, 4); the registered types are float16 and float32. "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
  84. *@li gtboxes: Ground-truth boxes, a 2D Tensor with shape (M, 4); the registered types are float16 and float32. "M" indicates the number of ground truth boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
  85. *@par Attributes:
  86. *mode: Computation mode, an optional character string with the value range of [iou, iof]. Defaults to "iou".
  87. *@par Outputs:
  88. *overlap: A 2D Tensor with shape [M, N], specifying the IoU or IoF ratio; the registered types are float16 and float32.
  89. *@attention Constraints:
  90. * Only computation of float16 data is supported. To avoid overflow, the input length and width are scaled by 0.2 internally. NOTE(review): the registration below also accepts float32; confirm whether the float16-only constraint still applies.
  91. */
  92. REG_OP(Iou)
  93. .INPUT(bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
  94. .INPUT(gtboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
  95. .OUTPUT(overlap, TensorType({DT_FLOAT16, DT_FLOAT}))
  96. .ATTR(mode, String, "iou")
  97. .OP_END_FACTORY_REG(Iou)
  98. /**
  99. *@brief Performs the backpropagation of ROIAlign for training scenarios.
  100. *@par Inputs:
  101. * Three inputs, including: \n
  102. *@li ydiff: A 5HD gradient input of type float32.
  103. *@li rois: ROI position. A 2D Tensor of type float32 with shape (N, 5). "N" indicates the number of ROIs, the value "5" indicates the indexes of images where the ROIs are located, "x0", "x1", "y0", and "y1".
  104. *@li rois_n: An optional input of type int32, specifying the number of valid ROIs. This parameter is reserved.
  105. *@par Attributes:
  106. *@li xdiff_shape: A required list of 4 ints, obtained based on the shape of "features" of ROIAlign.
  107. *@li pooled_width: A required attribute of type int, specifying the W dimension.
  108. *@li pooled_height: A required attribute of type int, specifying the H dimension.
  109. *@li spatial_scale: A required attribute of type float, specifying the scaling ratio of "features" to the original image.
  110. *@li sample_num: An optional attribute of type int, specifying the horizontal and vertical sampling frequency of each output. If this attribute is set to "0", the sampling frequency is equal to the rounded up value of "rois", which is a floating point number. Defaults to "2".
  111. *@par Outputs:
  112. *xdiff: Gradient added to input "features", of type float32. Has the same 5HD shape as input "features".
  113. */
  114. REG_OP(ROIAlignGrad)
  115. .INPUT(ydiff, TensorType({DT_FLOAT}))
  116. .INPUT(rois, TensorType({DT_FLOAT}))
  117. .OPTIONAL_INPUT(rois_n, TensorType({DT_INT32}))
  118. .OUTPUT(xdiff, TensorType({DT_FLOAT}))
  119. .REQUIRED_ATTR(xdiff_shape, ListInt)
  120. .REQUIRED_ATTR(pooled_width, Int)
  121. .REQUIRED_ATTR(pooled_height, Int)
  122. .REQUIRED_ATTR(spatial_scale, Float)
  123. .ATTR(sample_num, Int, 2)
  124. .OP_END_FACTORY_REG(ROIAlignGrad)
  125. /**
  126. *@brief Obtains the ROI feature matrix from the feature map. It is a customized FasterRcnn operator.
  127. *@par Inputs:
  128. * Three inputs, including: \n
  129. *@li features: A 5HD Tensor of type float16 or float32.
  130. *@li rois: ROI position. A 2D Tensor of type float16 or float32 with shape (N, 5). "N" indicates the number of ROIs, the value "5" indicates the indexes of images where the ROIs are located, "x0", "x1", "y0", and "y1".
  131. *@li rois_n: An optional input of type int32, specifying the number of valid ROIs. This parameter is reserved.
  132. *@par Attributes:
  133. *@li spatial_scale: A required attribute of type float, specifying the scaling ratio of "features" to the original image.
  134. *@li pooled_height: A required attribute of type int, specifying the H dimension.
  135. *@li pooled_width: A required attribute of type int, specifying the W dimension.
  136. *@li sample_num: An optional attribute of type int, specifying the horizontal and vertical sampling frequency of each output. If this attribute is set to "0", the sampling frequency is equal to the rounded up value of "rois", which is a floating point number. Defaults to "2". NOTE(review): an additional optional int attribute "roi_end_mode" (default "1") is registered below but its meaning is not documented here; confirm and document it.
  137. *@par Outputs:
  138. *y: Outputs the feature sample of each ROI position, of type float16 or float32. The format is 5HD. The axis N is the number of input ROIs. Axes H, W, and C are consistent with the values of "pooled_height", "pooled_width", and "features", respectively.
  139. */
  140. REG_OP(ROIAlign)
  141. .INPUT(features, TensorType({DT_FLOAT16, DT_FLOAT}))
  142. .INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
  143. .OPTIONAL_INPUT(rois_n, TensorType({DT_INT32}))
  144. .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
  145. .REQUIRED_ATTR(spatial_scale, Float)
  146. .REQUIRED_ATTR(pooled_height, Int)
  147. .REQUIRED_ATTR(pooled_width, Int)
  148. .ATTR(sample_num, Int, 2)
  149. .ATTR(roi_end_mode, Int, 1)
  150. .OP_END_FACTORY_REG(ROIAlign)
  151. /**
  152. *@brief Performs SSD prior box detection.
  153. *@par Inputs:
  154. * Two inputs, including:
  155. *@li x: An NC1HWC0 or NCHW feature map of type float32 or float16.
  156. *@li img: Source image. Has the same type and format as "x".
  157. *@par Attributes:
  158. *@li min_size: A required list of floats, specifying the minimum edge length of a square prior box.
  159. *@li max_size: A required list of floats, specifying the maximum edge length of a square prior box: sqrt(min_size * max_size)
  160. *@li aspect_ratio: A required list of floats, specifying the aspect ratio for generated rectangle boxes. The height is min_size/sqrt(aspect_ratio), the width is min_size*sqrt(aspect_ratio). The attribute is registered as required, so no default value applies.
  161. *@li img_h: An optional int32, specifying the source image height. Defaults to "0".
  162. *@li img_w: An optional int32, specifying the source image width. Defaults to "0".
  163. *@li step_h: An optional float32, specifying the height step for mapping the center point from the feature map to the source image. Defaults to "0.0".
  164. *@li step_w: An optional float32, specifying the width step for mapping the center point from the feature map to the source image. Defaults to "0.0".
  165. *@li flip: An optional bool. If "True", "aspect_ratio" will be flipped. Defaults to "True".
  166. *@li clip: An optional bool. If "True", a prior box is clipped to within [0, 1]. Defaults to "False".
  167. *@li offset: An optional float32, specifying the offset. Defaults to "0.5".
  168. *@li variance: An optional list of floats, specifying the variance of a prior box, either one or four variances. Defaults to [0.1] (one value).
  169. *@par Outputs:
  170. *y: An ND tensor of type float32 or float16, specifying the prior box information, including its coordinates and variance.
  171. *@attention Constraints:\n
  172. * This operator applies only to SSD networks.
  173. *@see SSDDetectionOutput()
  174. */
  175. REG_OP(PriorBox)
  176. .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
  177. .INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
  178. .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
  179. .REQUIRED_ATTR(min_size, ListFloat)
  180. .REQUIRED_ATTR(max_size, ListFloat)
  181. .REQUIRED_ATTR(aspect_ratio, ListFloat)
  182. .ATTR(img_h, Int, 0)
  183. .ATTR(img_w, Int, 0)
  184. .ATTR(step_h, Float, 0.0)
  185. .ATTR(step_w, Float, 0.0)
  186. .ATTR(flip, Bool, true)
  187. .ATTR(clip, Bool, false)
  188. .ATTR(offset, Float, 0.5)
  189. .ATTR(variance, ListFloat, {0.1})
  190. .OP_END_FACTORY_REG(PriorBox);
  191. /**
  192. *@brief Performs SSD prior box detection, with four additional matrices and the "aspect_ratio" attribute deleted compared to PriorBox.
  193. *@par Inputs:
  194. * Six inputs, including:
  195. *@li x: An NC1HWC0 or NCHW feature map of type float32 or float16.
  196. *@li img: Source image. Has the same type and format as "x".
  197. *@li data_h: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map height.
  198. *@li data_w: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map width.
  199. *@li box_height: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the height of each prior box.
  200. *@li box_width: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the width of each prior box.
  201. *@par Attributes:
  202. *@li min_size: A required list of floats, specifying the minimum edge length of a square prior box.
  203. *@li max_size: A required list of floats, specifying the maximum edge length of a square prior box: sqrt(min_size * max_size)
  204. *@li img_h: An optional int32, specifying the height of the source image. Defaults to "0".
  205. *@li img_w: An optional int32, specifying the width of the source image. Defaults to "0".
  206. *@li step_h: An optional float32, specifying the height step for mapping the center point from the feature map to the source image. Defaults to "0.0".
  207. *@li step_w: An optional float32, specifying the width step for mapping the center point from the feature map to the source image. Defaults to "0.0".
  208. *@li flip: An optional bool. If "True", "aspect_ratio" will be flipped. Defaults to "True". NOTE(review): this operator registers no "aspect_ratio" attribute; confirm what "flip" applies to here.
  209. *@li clip: An optional bool. If "True", a prior box is clipped to within [0, 1]. Defaults to "False".
  210. *@li offset: An optional float32, specifying the offset. Defaults to "0.5".
  211. *@li variance: An optional list of floats, specifying the variance of a prior box, either one or four variances. Defaults to [0.1] (one value).
  212. *@par Outputs:
  213. *y: An ND tensor of type float32 or float16, specifying the prior box information, including its coordinates and variance.
  214. *@attention Constraints:\n
  215. * This operator applies only to SSD networks.
  216. *@see SSDDetectionOutput()
  217. */
  218. REG_OP(PriorBoxD)
  219. .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
  220. .INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
  221. .INPUT(data_h, TensorType({DT_FLOAT16, DT_FLOAT}))
  222. .INPUT(data_w, TensorType({DT_FLOAT16, DT_FLOAT}))
  223. .INPUT(box_height, TensorType({DT_FLOAT16, DT_FLOAT}))
  224. .INPUT(box_width, TensorType({DT_FLOAT16, DT_FLOAT}))
  225. .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
  226. .REQUIRED_ATTR(min_size, ListFloat)
  227. .REQUIRED_ATTR(max_size, ListFloat)
  228. .ATTR(img_h, Int, 0)
  229. .ATTR(img_w, Int, 0)
  230. .ATTR(step_h, Float, 0.0)
  231. .ATTR(step_w, Float, 0.0)
  232. .ATTR(flip, Bool, true)
  233. .ATTR(clip, Bool, false)
  234. .ATTR(offset, Float, 0.5)
  235. .ATTR(variance, ListFloat, {0.1})
  236. .OP_END_FACTORY_REG(PriorBoxD);
  237. /**
  238. *@brief Performs Position Sensitive ROI Pooling.
  239. *@par Inputs:
  240. * Two inputs, including:
  241. *@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature
  242. * map, dimension C1 must be equal to
  243. * (int(output_dim+15)/C0))*group_size*group_size.
  244. *@li rois: A tensor of type float16 or float32, with shape
  245. * [batch, 5, rois_num], describing the ROIs. Each ROI consists of five
  246. * elements: "batch_id", "x1", "y1", "x2", and "y2", where "batch_id" indicates
  247. * the index of the input feature map, and "x1", "y1", "x2", and "y2" must be
  248. * greater than or equal to "0.0".
  249. *@par Attributes:
  250. *@li output_dim: An int32 specifying the number of output channels; must be
  251. * greater than 0. Registered with default "0", so it has to be set explicitly.
  252. *@li group_size: An int32 specifying the number of groups to encode position-
  253. * sensitive score maps, within the range (0, 128). Registered with default "0".
  254. *@li spatial_scale: A float scaling factor for mapping the input
  255. * coordinates to the ROI coordinates. Registered with default "0.0625".
  256. *@par Outputs:
  257. *y: An NC1HWC0 tensor of type float16 or float32, describing the result
  258. * feature map.
  259. *@attention Constraints:
  260. * NC1HWC0: channel must be Group_size squared, rois_num is a multiple of 16
  261. */
  262. REG_OP(PSROIPooling)
  263. .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
  264. .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16}))
  265. .ATTR(output_dim, Int, 0)
  266. .ATTR(group_size, Int, 0)
  267. .ATTR(spatial_scale, Float, 0.0625)
  268. .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
  269. .OP_END_FACTORY_REG(PSROIPooling)
  270. } // namespace ge
  271. #endif // GE_OP_NN_DETECT_OPS_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示。