You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

string_ops.h 15 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef GE_OP_STRING_OPS_H_
  17. #define GE_OP_STRING_OPS_H_
  18. #include <sstream>
  19. #include "graph/operator_reg.h"
  20. namespace ge {
  21. /**
  22. *@brief Split elements of input based on delimiter into a SparseTensor.
  23. *@par Inputs:
  24. *Two inputs, including: \n
  25. *@li input:1-D string tensor. Strings to split.
  26. *@li delimiter:0-D string tensor. Delimiter characters (bytes), or empty string.
  27. *@par Attributes:
  28. *skip_empty:A bool. If True, skip the empty strings from the result. Defaults to true.
  29. *@par Outputs:
  30. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  31. *@li values:A vector of strings corresponding to the split values.
  32. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,\n
  33. *where the first value is N and the second value is the maximum number of tokens\n
  34. *in a single input entry.
  35. *@see StringSplit()
  36. */
  37. REG_OP(StringSplit)
  38. .INPUT(input, TensorType({DT_STRING}))
  39. .INPUT(delimiter, TensorType({DT_STRING}))
  40. .OUTPUT(indices, TensorType({DT_INT64}))
  41. .OUTPUT(values, TensorType({DT_STRING}))
  42. .OUTPUT(shape, TensorType({DT_INT64}))
  43. .ATTR(skip_empty, Bool, true)
  44. .OP_END_FACTORY_REG(StringSplit)
  45. /**
  46. *@brief Split elements of input based on sep into a SparseTensor.
  47. *@par Inputs:
  48. *Two inputs, including: \n
  49. *@li input:1-D string tensor. Strings to split.
  50. *@li sep:0-D string Tensor, the delimiter character.
  51. *@par Attributes:
  52. *maxsplit:An int. If maxsplit > 0, it limits the number of splits in the result. Defaults to -1.
  53. *@par Outputs:
  54. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  55. *@li values:A vector of strings corresponding to the split values.
  56. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,\n
  57. *where the first value is N and the second value is the maximum number of tokens\n
  58. *in a single input entry.
  59. *@see StringSplitV2()
  60. */
  61. REG_OP(StringSplitV2)
  62. .INPUT(input, TensorType({DT_STRING}))
  63. .INPUT(sep, TensorType({DT_STRING}))
  64. .OUTPUT(indices, TensorType({DT_INT64}))
  65. .OUTPUT(values, TensorType({DT_STRING}))
  66. .OUTPUT(shape, TensorType({DT_INT64}))
  67. .ATTR(maxsplit, Int, -1)
  68. .OP_END_FACTORY_REG(StringSplitV2)
  69. /**
  70. *@brief Determine the script codes of a given tensor of Unicode integer code points.
  71. *@par Inputs:
  72. *One input, including: \n
  73. *x:A Tensor of int32 Unicode code points.
  74. *@par Outputs:
  75. *y:A Tensor of int32 script codes corresponding to each input code point.
  76. *@attention Constraints:\n
  77. *This operation converts Unicode code points to script codes corresponding to\n
  78. *each code point. Script codes correspond to International Components for\n
  79. *Unicode (ICU) UScriptCode values.\n
  80. *See http://icu-project.org/apiref/icu4c/uscript_8h.html.\n
  81. *Returns -1 (USCRIPT_INVALID_CODE) for invalid code points.\n
  82. *Output shape will match input shape.
  83. *@see UnicodeScript()
  84. */
  85. REG_OP(UnicodeScript)
  86. .INPUT(x, TensorType({DT_INT32}))
  87. .OUTPUT(y, TensorType({DT_INT32}))
  88. .OP_END_FACTORY_REG(UnicodeScript)
  89. /**
  90. *@brief Return substrings from Tensor of strings.
  91. *@par Inputs:
  92. *Three inputs, including: \n
  93. *@li input:Tensor of strings.
  94. *@li pos:Scalar defining the position of first character in each substring.
  95. *@li len:Scalar defining the number of characters to include in each substring.
  96. *@par Outputs:
  97. *output:Tensor of substrings.
  98. *@attention Constraints:\n
  99. *pos and len each accept int32 or int64 values.\n
  100. *NOTE(review): this section previously held text about a hash function that\n
  101. *appears to have been copied from StringToHashBucketFast and does not describe\n
  102. *Substr. Further constraints (for example the valid ranges of pos and len, or\n
  103. *whether pos and len must share a type) are not visible in this declaration;\n
  104. *confirm them against the operator implementation.
  105. *@see Substr()
  106. */
  107. REG_OP(Substr)
  108. .INPUT(input, TensorType({DT_STRING}))
  109. .INPUT(pos, TensorType({DT_INT32, DT_INT64}))
  110. .INPUT(len, TensorType({DT_INT32, DT_INT64}))
  111. .OUTPUT(output, TensorType({DT_STRING}))
  112. .OP_END_FACTORY_REG(Substr)
  113. /**
  114. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
  115. *@par Inputs:
  116. *x:The strings to assign a hash bucket.
  117. *@par Attributes:
  118. *num_buckets:The number of buckets. Defaults to 1.
  119. *@par Outputs:
  120. *y:An int64 Tensor of the same shape as the input x.
  121. *@attention Constraints:\n
  122. *The hash function is deterministic on the content of the string within the process and will\n
  123. *never change. However, it is not suitable for cryptography. This function may be used when CPU\n
  124. *time is scarce and inputs are trusted or unimportant. There is a risk of adversaries constructing\n
  125. *inputs that all hash to the same bucket. To prevent this problem, use StringToHashBucketStrong.
  126. *@see StringToHashBucketFast()
  127. */
  128. REG_OP(StringToHashBucketFast)
  129. .INPUT(x, TensorType({DT_STRING}))
  130. .OUTPUT(y, TensorType({DT_INT64}))
  131. .ATTR(num_buckets, Int, 1)
  132. .OP_END_FACTORY_REG(StringToHashBucketFast)
  133. /**
  134. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
  135. *@par Inputs:
  136. *x:The strings to assign a hash bucket.
  137. *@par Attributes:
  138. *@li num_buckets:The number of buckets. Defaults to 1.
  139. *@li key:A required list of ints. Presumably the key that seeds the hash function (see Constraints); confirm against the implementation.
  140. *@par Outputs:
  141. *y:An int64 Tensor of the same shape as the input x.
  142. *@attention Constraints:\n
  143. *@li A strong hash is important when inputs may be malicious, e.g. URLs with\n
  144. *additional components. Adversaries could try to make their inputs hash to\n
  145. *the same bucket for a denial-of-service attack or to skew the results.\n
  146. *A strong hash can be used to make it difficult to find inputs with a skewed\n
  147. *hash value distribution over buckets. This requires that the hash function\n
  148. *is seeded by a high-entropy (random) "key" unknown to the adversary.
  149. *@li The additional robustness comes at a cost of roughly 4x higher compute time, presumably relative to StringToHashBucketFast (the original sentence was truncated after "higher"; TODO confirm).
  150. *@see StringToHashBucketStrong()
  151. */
  152. REG_OP(StringToHashBucketStrong)
  153. .INPUT(x, TensorType({DT_STRING}))
  154. .OUTPUT(y, TensorType({DT_INT64}))
  155. .ATTR(num_buckets, Int, 1)
  156. .REQUIRED_ATTR(key, ListInt)
  157. .OP_END_FACTORY_REG(StringToHashBucketStrong)
  158. /**
  159. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
  160. *@par Inputs:
  161. *One input, including: \n
  162. *string_tensor:The strings to assign a hash bucket.
  163. *@par Attributes:
  164. *num_buckets:The number of buckets. Defaults to 1.
  165. *@par Outputs:
  166. *y:An int64 Tensor of the same shape as the input string_tensor.
  167. *@see StringToHashBucket()
  168. */
  169. REG_OP(StringToHashBucket)
  170. .INPUT(string_tensor, TensorType({DT_STRING}))
  171. .OUTPUT(y, TensorType({DT_INT64}))
  172. .ATTR(num_buckets, Int, 1)
  173. .OP_END_FACTORY_REG(StringToHashBucket)
  174. /**
  175. *@brief Strip leading and trailing whitespace from the Tensor.
  176. *@par Inputs:
  177. *One input, including: \n
  178. *x:A string Tensor of any shape.
  179. *@par Outputs:
  180. *y:A string Tensor of the same shape as the input.
  181. *@see StringStrip()
  182. */
  183. REG_OP(StringStrip)
  184. .INPUT(x, TensorType({DT_STRING}))
  185. .OUTPUT(y, TensorType({DT_STRING}))
  186. .OP_END_FACTORY_REG(StringStrip)
  187. /**
  188. *@brief Computes the length of each string given in the input tensor.
  189. *@par Inputs:
  190. *One input, including: \n
  191. *x:The string for which to compute the length.
  192. *@par Attributes:
  193. *unit:The unit that is counted to compute string length. Defaults to "BYTE".\n
  194. *One of: "BYTE" (for the number of bytes in each string) or\n
  195. *"UTF8_CHAR" (for the number of UTF-8 encoded Unicode code points in each string).\n
  196. *Results are undefined if unit=UTF8_CHAR and the input strings do not contain\n
  197. *structurally valid UTF-8.
  198. *@par Outputs:
  199. *y:An int32 tensor that has the same shape as input.\n
  200. *The output contains the element-wise string lengths of input.
  201. *@see StringLength()
  202. */
  203. REG_OP(StringLength)
  204. .INPUT(x, TensorType({DT_STRING}))
  205. .OUTPUT(y, TensorType({DT_INT32}))
  206. .ATTR(unit, String, "BYTE")
  207. .OP_END_FACTORY_REG(StringLength)
  208. /**
  209. *@brief Joins the strings in the given list of string tensors into one tensor,\n
  210. *using the separator attribute as the join separator.
  211. *@par Inputs:
  212. *One dynamic input, including: \n
  213. *x:A list of string tensors. The tensors must all have the same shape,\n
  214. *or be scalars. Scalars may be mixed in; these will be broadcast to the shape\n
  215. *of non-scalar inputs.
  216. *@par Attributes:
  217. *@li N:A required int. The length of input x.
  218. *@li separator:A string, an optional join separator. Defaults to the empty\n
  219. *string.
  220. *@par Outputs:
  221. *y:The output string tensor.
  222. *@attention Constraints:\n
  223. *NOTE(review): this declaration does not show whether N is checked against the\n
  224. *number of tensors actually supplied in x; confirm against the implementation.
  225. *@see StringJoin()
  226. */
  227. REG_OP(StringJoin)
  228. .DYNAMIC_INPUT(x, TensorType({DT_STRING}))
  229. .OUTPUT(y, TensorType({DT_STRING}))
  230. .REQUIRED_ATTR(N, Int)
  231. .ATTR(separator, String, "")
  232. .OP_END_FACTORY_REG(StringJoin)
  233. /**
  234. *@brief Formats a string template using a list of tensors.
  235. *@par Inputs:
  236. *One dynamic input, including: \n
  237. *x:The tensors to format into the placeholder string. Supported element types\n
  238. *are int8, uint8, int16, uint16, int32, int64, uint32, uint64, string,\n
  239. *float16, float, double and bool.
  240. *@par Attributes:
  241. *@li template:A string, the template to format tensor summaries into.\n
  242. *Defaults to "%s".
  243. *@li placeholder:A string, at each placeholder in the template a subsequent\n
  244. *tensor summary will be inserted. Defaults to "%s".
  245. *@li summarize:An int. When formatting the tensor summaries, print the first\n
  246. *and last summarize entries of each tensor dimension. Defaults to 3.
  247. *@par Outputs:
  248. *y:The resulting string scalar.
  249. *@see StringFormat()
  250. */
  251. REG_OP(StringFormat)
  252. .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
  253. DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_STRING, DT_FLOAT16, \
  254. DT_FLOAT, DT_DOUBLE, DT_BOOL}))
  255. .OUTPUT(y, TensorType({DT_STRING}))
  256. .ATTR(template, String, "%s")
  257. .ATTR(placeholder, String, "%s")
  258. .ATTR(summarize, Int, 3)
  259. .OP_END_FACTORY_REG(StringFormat)
  260. /**
  261. *@brief Check if the input matches the regex pattern.
  262. *@par Inputs:
  263. *The input is a string tensor of any shape. The pattern is a scalar string tensor\n
  264. *which is applied to every element of the input tensor. The boolean values\n
  265. *(True or False) of the output tensor indicate if the input matches the regex\n
  266. *pattern provided. The pattern follows the re2 syntax\n
  267. *(https://github.com/google/re2/wiki/Syntax).\n
  268. *Two inputs, including: \n
  269. *@li x:A string tensor of the text to be processed.
  270. *@li pattern:A scalar string tensor containing the regular expression to match the input.
  271. *@par Outputs:
  272. *y:A bool tensor with the same shape as input.
  273. *@see RegexFullMatch()
  274. */
  275. REG_OP(RegexFullMatch)
  276. .INPUT(x, TensorType({DT_STRING}))
  277. .INPUT(pattern, TensorType({DT_STRING}))
  278. .OUTPUT(y, TensorType({DT_BOOL}))
  279. .OP_END_FACTORY_REG(RegexFullMatch)
  280. /**
  281. *@brief Replaces matches of the pattern regular expression in input with the\n
  282. *replacement string provided in rewrite.
  283. *@par Inputs:
  284. *The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax).\n
  285. *Three inputs, including: \n
  286. *@li x:The text to be processed.
  287. *@li pattern:The regular expression to be matched in the input strings.
  288. *@li rewrite:The rewrite string to be substituted for the pattern expression\n
  289. *where it is matched in the input strings.
  290. *@par Attributes:
  291. *replace_global:A bool. If True, the replacement is global\n
  292. *(that is, all matches of the pattern regular expression in each input string\n
  293. *are rewritten), otherwise the rewrite substitution is only made for the first\n
  294. *pattern match. Defaults to true.
  295. *@par Outputs:
  296. *y:The text after applying pattern match and rewrite substitution.
  297. *@see RegexReplace()
  298. */
  299. REG_OP(RegexReplace)
  300. .INPUT(x, TensorType({DT_STRING}))
  301. .INPUT(pattern, TensorType({DT_STRING}))
  302. .INPUT(rewrite, TensorType({DT_STRING}))
  303. .OUTPUT(y, TensorType({DT_STRING}))
  304. .ATTR(replace_global, Bool, true)
  305. .OP_END_FACTORY_REG(RegexReplace)
  306. /**
  307. *@brief Converts each entry in the given tensor to strings.
  308. *@par Inputs:
  309. *Supports int8, int16, int32, int64, float, double and bool entries.\n
  310. *One input, including: \n
  311. *x:A tensor whose entries are converted to strings.
  312. *@par Attributes:
  313. *@li precision:The post-decimal precision to use for floating point numbers.\n
  314. *Only used if precision > -1. Defaults to -1.
  315. *@li scientific:Use scientific notation for floating point numbers. Defaults to false.
  316. *@li shortest:Use shortest representation (either scientific or standard)\n
  317. *for floating point numbers. Defaults to false.
  318. *@li width:Pad pre-decimal numbers to this width. Applies to both floating\n
  319. *point and integer numbers. Only used if width > -1. Defaults to -1.
  320. *@li fill:The value to pad if width > -1. If empty, pads with spaces.\n
  321. *Another typical value is '0'. String cannot be longer than 1 character. Defaults to the empty string.
  322. *@par Outputs:
  323. *y:The output string tensor.
  324. *@see AsString()
  325. */
  326. REG_OP(AsString)
  327. .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
  328. DT_DOUBLE, DT_BOOL}))
  329. .OUTPUT(y, TensorType({DT_STRING}))
  330. .ATTR(precision, Int, -1)
  331. .ATTR(scientific, Bool, false)
  332. .ATTR(shortest, Bool, false)
  333. .ATTR(width, Int, -1)
  334. .ATTR(fill, String, "")
  335. .OP_END_FACTORY_REG(AsString)
  336. /**
  337. *@brief Encode strings into web-safe base64 format.
  338. *@par Inputs:
  339. *One input, including: \n
  340. *x:Strings to be encoded.
  341. *@par Attributes:
  342. *pad:A bool, whether padding is applied at the ends of the encoded strings.\n
  343. *Padding uses '=' so that the encoded length is a multiple of 4.\n
  344. *Defaults to false.
  345. *@par Outputs:
  346. *y:Input strings encoded in base64.
  347. *@attention Constraints:\n
  348. *Refer to the following article for more information on base64 format:\n
  349. *en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '='\n
  350. *at the end so that the encoded string has a length that is a multiple of 4.\n
  351. *See Padding section of the link above. Web-safe means that the encoder\n
  352. *uses - and _ instead of + and /.
  353. *@see EncodeBase64()
  354. */
  355. REG_OP(EncodeBase64)
  356. .INPUT(x, TensorType({DT_STRING}))
  357. .OUTPUT(y, TensorType({DT_STRING}))
  358. .ATTR(pad, Bool, false)
  359. .OP_END_FACTORY_REG(EncodeBase64)
  360. /**
  361. *@brief Decode web-safe base64-encoded strings.
  362. *@par Inputs:
  363. *Input may or may not have padding at the end. See EncodeBase64 for padding.\n
  364. *Web-safe means that input must use - and _ instead of + and /.\n
  365. *One input, including: \n
  366. *x:Base64 strings to decode.
  367. *@par Outputs:
  368. *y:Decoded strings.
  369. *@see DecodeBase64()
  370. */
  371. REG_OP(DecodeBase64)
  372. .INPUT(x, TensorType({DT_STRING}))
  373. .OUTPUT(y, TensorType({DT_STRING}))
  374. .OP_END_FACTORY_REG(DecodeBase64)
  375. } // namespace ge
  376. #endif // GE_OP_STRING_OPS_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示