You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

string_ops.h 15 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef GE_OP_STRING_OPS_H_
  17. #define GE_OP_STRING_OPS_H_
  18. #include <sstream>
  19. #include "graph/operator_reg.h"
  20. namespace ge {
  21. /**
  22. *@brief Split elements of input based on delimiter into a SparseTensor.
  23. *@par Inputs:
  24. *include: \n
  25. *@li input:1-D. Strings to split.
  26. *@li delimiter:0-D. Delimiter characters (bytes), or empty string.
  27. *@par Attributes:
  28. *skip_empty:A bool. If True, skip the empty strings from the result. Defaults to true.
  29. *@par Outputs:
  30. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  31. *@li values:A vector of strings corresponding to the split values.
  32. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,\n
  33. *where the first value is N and the second value is the maximum number of tokens\n
  34. *in a single input entry.
  35. *@see StringSplit()
  36. */
  37. REG_OP(StringSplit)
  38. .INPUT(input, TensorType({DT_STRING}))
  39. .INPUT(delimiter, TensorType({DT_STRING}))
  40. .OUTPUT(indices, TensorType({DT_INT64}))
  41. .OUTPUT(values, TensorType({DT_STRING}))
  42. .OUTPUT(shape, TensorType({DT_INT64}))
  43. .ATTR(skip_empty, Bool, true)
  44. .OP_END_FACTORY_REG(StringSplit)
  45. /**
  46. *@brief Split elements of input based on sep into a SparseTensor.
  47. *@par Inputs:
  48. *include: \n
  49. *@li input:1-D. Strings to split.
  50. *@li sep:0-D string Tensor, the delimiter character.
  51. *@par Attributes:
  52. *maxsplit:An int. If maxsplit > 0, it limits the number of splits in the result. Defaults to -1.
  53. *@par Outputs:
  54. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  55. *@li values:A vector of strings corresponding to the split values.
  56. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,\n
  57. *where the first value is N and the second value is the maximum number of tokens\n
  58. *in a single input entry.
  59. *@see StringSplitV2()
  60. */
  61. REG_OP(StringSplitV2)
  62. .INPUT(input, TensorType({DT_STRING}))
  63. .INPUT(sep, TensorType({DT_STRING}))
  64. .OUTPUT(indices, TensorType({DT_INT64}))
  65. .OUTPUT(values, TensorType({DT_STRING}))
  66. .OUTPUT(shape, TensorType({DT_INT64}))
  67. .ATTR(maxsplit, Int, -1)
  68. .OP_END_FACTORY_REG(StringSplitV2)
  69. /**
  70. *@brief Determine the script codes of a given tensor of Unicode integer code points.
  71. *@par Inputs:
  72. *include: \n
  73. *x:A Tensor of int32 Unicode code points.
  74. *@par Outputs:
  75. *y:A Tensor of int32 script codes corresponding to each input code point.
  76. *@attention Constraints:\n
  77. *This operation converts Unicode code points to script codes corresponding to\n
  78. *each code point. Script codes correspond to International Components for\n
  79. *Unicode (ICU) UScriptCode values.\n
  80. *See http://icu-project.org/apiref/icu4c/uscript_8h.html.\n
  81. *Returns -1 (USCRIPT_INVALID_CODE) for invalid code points.\n
  82. *Output shape will match input shape.
  83. *@see UnicodeScript()
  84. */
  85. REG_OP(UnicodeScript)
  86. .INPUT(x, TensorType({DT_INT32}))
  87. .OUTPUT(y, TensorType({DT_INT32}))
  88. .OP_END_FACTORY_REG(UnicodeScript)
  89. /**
  90. *@brief Return substrings from Tensor of strings.
  91. *@par Inputs:
  92. *include: \n
  93. *@li input:Tensor of strings.
  94. *@li pos:Scalar of type int32 or int64 defining the position of first character in each substring.
  95. *@li len:Scalar of type int32 or int64 defining the number of characters to include in each substring.
  96. *@par Outputs:
  97. *output:Tensor of substrings.
  106. *@see Substr()
  107. */
  108. REG_OP(Substr)
  109. .INPUT(input, TensorType({DT_STRING}))
  110. .INPUT(pos, TensorType({DT_INT32, DT_INT64}))
  111. .INPUT(len, TensorType({DT_INT32, DT_INT64}))
  112. .OUTPUT(output, TensorType({DT_STRING}))
  113. .OP_END_FACTORY_REG(Substr)
  114. /**
  115. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
  116. *@par Inputs:
  117. *include: \n
  118. *x:The strings to assign a hash bucket.
  119. *@par Attributes:
  120. *num_buckets:An int. The number of buckets. Defaults to 1.
  121. *@par Outputs:
  122. *y:An int64 Tensor of the same shape as the input x.
  123. *@attention Constraints:\n
  124. *The hash function is deterministic on the content of the string within the process and will never\n
  125. *change. However, it is not suitable for cryptography. This function may be used when CPU time is\n
  126. *scarce and inputs are trusted or unimportant. There is a risk of adversaries constructing inputs that\n
  127. *all hash to the same bucket. To prevent this problem, use a strong hash function with tf.string_to_hash_bucket_strong.
  128. *@see StringToHashBucketFast()
  129. */
  130. REG_OP(StringToHashBucketFast)
  131. .INPUT(x, TensorType({DT_STRING}))
  132. .OUTPUT(y, TensorType({DT_INT64}))
  133. .ATTR(num_buckets, Int, 1)
  134. .OP_END_FACTORY_REG(StringToHashBucketFast)
  135. /**
  136. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
  137. *@par Inputs:
  138. *include: \n
  139. *x:The strings to assign a hash bucket.
  140. *@par Attributes:
  141. *@li num_buckets:An int. The number of buckets. Defaults to 1.
  142. *@li key:A required list of ints. Presumably the "key" that seeds the hash function (see Constraints) - TODO confirm.
  143. *@par Outputs:
  144. *y:An int64 Tensor of the same shape as the input x.
  145. *@attention Constraints:\n
  146. *@li A strong hash is important when inputs may be malicious, e.g. URLs with additional\n
  147. *components. Adversaries could try to make their inputs hash to the same bucket for a\n
  148. *denial-of-service attack or to skew the results. A strong hash can be used to make it\n
  149. *difficult to find inputs with a skewed hash value distribution over buckets. This requires\n
  150. *that the hash function is seeded by a high-entropy (random) "key" unknown to the adversary.
  151. *@li The additional robustness comes at a cost of roughly 4x higher\n
  152. *compute time than tf.string_to_hash_bucket_fast.
  153. *@see StringToHashBucketStrong()
  154. */
  155. REG_OP(StringToHashBucketStrong)
  156. .INPUT(x, TensorType({DT_STRING}))
  157. .OUTPUT(y, TensorType({DT_INT64}))
  158. .ATTR(num_buckets, Int, 1)
  159. .REQUIRED_ATTR(key, ListInt)
  160. .OP_END_FACTORY_REG(StringToHashBucketStrong)
  161. /**
  162. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
  163. *@par Inputs:
  164. *include: \n
  165. *string_tensor:The strings to assign a hash bucket.
  166. *@par Attributes:
  167. *num_buckets:An int. The number of buckets. Defaults to 1.
  168. *@par Outputs:
  169. *y:An int64 Tensor of the same shape as the input string_tensor.
  170. *@see StringToHashBucket()
  171. */
  172. REG_OP(StringToHashBucket)
  173. .INPUT(string_tensor, TensorType({DT_STRING}))
  174. .OUTPUT(y, TensorType({DT_INT64}))
  175. .ATTR(num_buckets, Int, 1)
  176. .OP_END_FACTORY_REG(StringToHashBucket)
  177. /**
  178. *@brief Strip leading and trailing whitespace from the Tensor.
  179. *@par Inputs:
  180. *include: \n
  181. *x:A string Tensor of any shape.
  182. *@par Outputs:
  183. *y:A string Tensor of the same shape as the input.
  184. *@see StringStrip()
  185. */
  186. REG_OP(StringStrip)
  187. .INPUT(x, TensorType({DT_STRING}))
  188. .OUTPUT(y, TensorType({DT_STRING}))
  189. .OP_END_FACTORY_REG(StringStrip)
  190. /**
  191. *@brief Computes the length of each string given in the input tensor.
  192. *@par Inputs:
  193. *include: \n
  194. *x:The string for which to compute the length.
  195. *@par Attributes:
  196. *unit:The unit that is counted to compute string length. Defaults to "BYTE".\n
  197. *One of: "BYTE" (for the number of bytes in each string) or\n
  198. *"UTF8_CHAR" (for the number of UTF-8 encoded Unicode code points in each string).\n
  199. *Results are undefined if unit=UTF8_CHAR and the input strings do not contain\n
  200. *structurally valid UTF-8.
  201. *@par Outputs:
  202. *y:An int32 tensor that has the same shape as input.\n
  203. *The output contains the element-wise string lengths of input.
  204. *@see StringLength()
  205. */
  206. REG_OP(StringLength)
  207. .INPUT(x, TensorType({DT_STRING}))
  208. .OUTPUT(y, TensorType({DT_INT32}))
  209. .ATTR(unit, String, "BYTE")
  210. .OP_END_FACTORY_REG(StringLength)
  211. /**
  212. *@brief Joins the strings in the given list of string tensors into one tensor.
  213. *@par Inputs:
  219. *include: \n
  220. *x:A list of string tensors. The tensors must all have the same shape,\n
  221. *or be scalars. Scalars may be mixed in; these will be broadcast to the shape\n
  222. *of non-scalar inputs. It's a dynamic input.
  223. *@par Attributes:
  224. *@li N:A required int. The length of input x.
  225. *@li separator:A string, an optional join separator. Defaults to "".
  226. *@par Outputs:
  227. *y:A string tensor. The output tensor.
  228. *@see StringJoin()
  229. */
  230. REG_OP(StringJoin)
  231. .DYNAMIC_INPUT(x, TensorType({DT_STRING}))
  232. .OUTPUT(y, TensorType({DT_STRING}))
  233. .REQUIRED_ATTR(N, Int)
  234. .ATTR(separator, String, "")
  235. .OP_END_FACTORY_REG(StringJoin)
  236. /**
  237. *@brief Formats a string template using a list of tensors.
  238. *@par Inputs:
  244. *include: \n
  245. *x:The tensors to format into the placeholder string. It's a dynamic input.
  246. *@par Attributes:
  247. *@li template:A string, the template to format tensor summaries into. Defaults to "%s".
  248. *@li placeholder:A string, at each placeholder in the template a subsequent tensor summary will be inserted. Defaults to "%s".
  249. *@li summarize:An int. When formatting the tensor summaries print the first and last summarize entries of each tensor dimension. Defaults to 3.
  250. *@par Outputs:
  251. *y:The resulting string scalar.
  252. *@see StringFormat()
  253. */
  254. REG_OP(StringFormat)
  255. .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
  256. DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_STRING, DT_FLOAT16, \
  257. DT_FLOAT, DT_DOUBLE, DT_BOOL}))
  258. .OUTPUT(y, TensorType({DT_STRING}))
  259. .ATTR(template, String, "%s")
  260. .ATTR(placeholder, String, "%s")
  261. .ATTR(summarize, Int, 3)
  262. .OP_END_FACTORY_REG(StringFormat)
  263. /**
  264. *@brief Check if the input matches the regex pattern.
  265. *@par Inputs:
  266. *The input is a string tensor of any shape. The pattern is a scalar string tensor\n
  267. *which is applied to every element of the input tensor. The boolean values\n
  268. *(True or False) of the output tensor indicate if the input matches the regex\n
  269. *pattern provided. The pattern follows the re2 syntax\n
  270. *(https://github.com/google/re2/wiki/Syntax). \n
  271. *include: \n
  272. *@li x:A string tensor of the text to be processed.
  273. *@li pattern:A scalar string tensor containing the regular expression to match the input.
  274. *@par Outputs:
  275. *y:A bool tensor with the same shape as input.
  276. *@see RegexFullMatch()
  277. */
  278. REG_OP(RegexFullMatch)
  279. .INPUT(x, TensorType({DT_STRING}))
  280. .INPUT(pattern, TensorType({DT_STRING}))
  281. .OUTPUT(y, TensorType({DT_BOOL}))
  282. .OP_END_FACTORY_REG(RegexFullMatch)
  283. /**
  284. *@brief Replaces matches of the pattern regular expression in input with the\n
  285. *replacement string provided in rewrite.
  286. *@par Inputs:
  287. *The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax). \n
  288. *include: \n
  289. *@li x:The text to be processed.
  290. *@li pattern:The regular expression to be matched in the input strings.
  291. *@li rewrite:The rewrite string to be substituted for the pattern expression\n
  292. *where it is matched in the input strings.
  293. *@par Attributes:
  294. *replace_global:A bool. Defaults to true. If True, the replacement is global\n
  295. *(that is, all matches of the pattern regular expression in each input string\n
  296. *are rewritten), otherwise the rewrite substitution is only made for the first\n
  297. *pattern match.
  298. *@par Outputs:
  299. *y:The text after applying pattern match and rewrite substitution.
  300. *@see RegexReplace()
  301. */
  302. REG_OP(RegexReplace)
  303. .INPUT(x, TensorType({DT_STRING}))
  304. .INPUT(pattern, TensorType({DT_STRING}))
  305. .INPUT(rewrite, TensorType({DT_STRING}))
  306. .OUTPUT(y, TensorType({DT_STRING}))
  307. .ATTR(replace_global, Bool, true)
  308. .OP_END_FACTORY_REG(RegexReplace)
  309. /**
  310. *@brief Converts each entry in the given tensor to strings.
  311. *@par Inputs:
  312. *Supports the types int8, int16, int32, int64, float, double and bool. \n
  313. *include: \n
  314. *x:A tensor to be converted to strings.
  315. *@par Attributes:
  316. *@li precision:The post-decimal precision to use for floating point numbers.\n
  317. *Only used if precision > -1. Defaults to -1.
  318. *@li scientific:Use scientific notation for floating point numbers. Defaults to false.
  319. *@li shortest:Use shortest representation (either scientific or standard)\n
  320. *for floating point numbers. Defaults to false.
  321. *@li width:Pad pre-decimal numbers to this width. Applies to both floating\n
  322. *point and integer numbers. Only used if width > -1. Defaults to -1.
  323. *@li fill:The value to pad if width > -1. If empty, pads with spaces.\n
  324. *Another typical value is '0'. String cannot be longer than 1 character. Defaults to "".
  325. *@par Outputs:
  326. *y:The output string tensor.
  327. *@see AsString()
  328. */
  329. REG_OP(AsString)
  330. .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
  331. DT_DOUBLE, DT_BOOL}))
  332. .OUTPUT(y, TensorType({DT_STRING}))
  333. .ATTR(precision, Int, -1)
  334. .ATTR(scientific, Bool, false)
  335. .ATTR(shortest, Bool, false)
  336. .ATTR(width, Int, -1)
  337. .ATTR(fill, String, "")
  338. .OP_END_FACTORY_REG(AsString)
  339. /**
  340. *@brief Encode strings into web-safe base64 format.
  341. *@par Inputs:
  344. *include: \n
  345. *x:Strings to be encoded.
  346. *@par Attributes:
  347. *pad:Bool whether padding is applied at the ends. Defaults to false.
  348. *@par Outputs:
  349. *y:Input strings encoded in base64.
  350. *@attention Constraints:\n
  351. *Refer to the following article for more information on base64 format:\n
  352. *en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '='\n
  353. *at the end so that the encoded string has a length that is a multiple of 4.\n
  354. *See Padding section of the link above. Web-safe means that the encoder\n
  355. *uses - and _ instead of + and /.
  356. *@see EncodeBase64()
  357. */
  358. REG_OP(EncodeBase64)
  359. .INPUT(x, TensorType({DT_STRING}))
  360. .OUTPUT(y, TensorType({DT_STRING}))
  361. .ATTR(pad, Bool, false)
  362. .OP_END_FACTORY_REG(EncodeBase64)
  363. /**
  364. *@brief Decode web-safe base64-encoded strings.
  365. *@par Inputs:
  366. *Input may or may not have padding at the end. See EncodeBase64 for padding.\n
  367. *Web-safe means that input must use - and _ instead of + and /. \n
  368. *include: \n
  369. *x:Base64 strings to decode.
  370. *@par Outputs:
  371. *y:Decoded strings.
  372. *@see DecodeBase64()
  373. */
  374. REG_OP(DecodeBase64)
  375. .INPUT(x, TensorType({DT_STRING}))
  376. .OUTPUT(y, TensorType({DT_STRING}))
  377. .OP_END_FACTORY_REG(DecodeBase64)
  378. } // namespace ge
  379. #endif // GE_OP_STRING_OPS_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示