You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

string_ops.h 16 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef GE_OP_STRING_OPS_H_
  17. #define GE_OP_STRING_OPS_H_
  18. #include <sstream>
  19. #include "graph/operator_reg.h"
  20. namespace ge {
  21. /**
  22. *@brief Splits elements of input based on delimiter into a SparseTensor.
  23. *@par Inputs:
  24. include: \n
  25. *@li input:1-D. Strings to split.
  26. *@li delimiter:0-D. Delimiter characters (bytes), or empty string.
  27. *@par Attributes:
  28. *skip_empty:A bool. If True, skip the empty strings from the result. Defaults to true.
  29. *@par Outputs:
  30. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  31. *@li values:A vector of strings corresponding to the split values.
  32. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,\n
  33. *where the first value is N and the second value is the maximum number of tokens\n
  34. *in a single input entry.
  35. *@see StringSplit()
  36. *@par Third-party framework compatibility
  37. *Compatible with the StringSplit op of TensorFlow.
  38. */
  39. REG_OP(StringSplit)
  40. .INPUT(input, TensorType({DT_STRING}))
  41. .INPUT(delimiter, TensorType({DT_STRING}))
  42. .OUTPUT(indices, TensorType({DT_INT64}))
  43. .OUTPUT(values, TensorType({DT_STRING}))
  44. .OUTPUT(shape, TensorType({DT_INT64}))
  45. .ATTR(skip_empty, Bool, true)
  46. .OP_END_FACTORY_REG(StringSplit)
  47. /**
  48. *@brief Splits elements of input based on sep into a SparseTensor.
  49. *@par Inputs:
  50. include: \n
  51. *@li input:1-D. Strings to split.
  52. *@li sep:0-D string Tensor, the delimiter character.
  53. *@par Attributes:
  54. *maxsplit:An int. If maxsplit > 0, it limits the number of splits of the result. Defaults to -1.
  55. *@par Outputs:
  56. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  57. *@li values:A vector of strings corresponding to the split values.
  58. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,\n
  59. *where the first value is N and the second value is the maximum number of tokens\n
  60. *in a single input entry.
  61. *@see StringSplitV2()
  62. *@par Third-party framework compatibility
  63. *Compatible with the StringSplitV2 op of TensorFlow.
  64. */
  65. REG_OP(StringSplitV2)
  66. .INPUT(input, TensorType({DT_STRING}))
  67. .INPUT(sep, TensorType({DT_STRING}))
  68. .OUTPUT(indices, TensorType({DT_INT64}))
  69. .OUTPUT(values, TensorType({DT_STRING}))
  70. .OUTPUT(shape, TensorType({DT_INT64}))
  71. .ATTR(maxsplit, Int, -1)
  72. .OP_END_FACTORY_REG(StringSplitV2)
  73. /**
  74. *@brief Determines the script codes of a given tensor of Unicode integer code points.
  75. *@par Inputs:
  76. include: \n
  77. *x:A Tensor of int32 Unicode code points.
  78. *@par Outputs:
  79. *y:A Tensor of int32 script codes corresponding to each input code point.
  80. *@attention Constraints:\n
  81. *This operation converts Unicode code points to script codes corresponding to\n
  82. *each code point.\n Script codes correspond to International Components for\n
  83. *Unicode (ICU) UScriptCode values.\n
  84. *See http://icu-project.org/apiref/icu4c/uscript_8h.html.\n
  85. *Returns -1 (USCRIPT_INVALID_CODE) for invalid code points.\n
  86. *The output shape matches the input shape.
  87. *@see UnicodeScript()
  88. *@par Third-party framework compatibility
  89. *Compatible with the UnicodeScript op of TensorFlow.
  90. */
  91. REG_OP(UnicodeScript)
  92. .INPUT(x, TensorType({DT_INT32}))
  93. .OUTPUT(y, TensorType({DT_INT32}))
  94. .OP_END_FACTORY_REG(UnicodeScript)
  95. /**
  96. *@brief Returns substrings from a Tensor of strings.
  97. *@par Inputs:
  98. include: \n
  99. *@li input:Tensor of strings.
  100. *@li pos:Scalar (int32 or int64) defining the position of the first character in each substring.
  101. *@li len:Scalar (int32 or int64) defining the number of characters to include in each substring.
  102. *@par Outputs:
  103. *output:Tensor of substrings.
  104. *@attention Constraints:\n
  105. *NOTE(review): the text that previously appeared here described the\n
  106. *string-to-hash-bucket ops (hash determinism, unsuitability for\n
  107. *cryptography, tf.string_to_hash_bucket_strong) and does not apply to\n
  108. *Substr; it looks like a copy-paste error.\n
  109. *TODO: document the actual Substr constraints (for example the\n
  110. *behaviour for out-of-range pos or len) after confirming them against\n
  111. *the TensorFlow Substr op.
  112. *@see Substr()
  113. *@par Third-party framework compatibility
  114. *Compatible with the Substr op of TensorFlow.
  115. */
  116. REG_OP(Substr)
  117. .INPUT(input, TensorType({DT_STRING}))
  118. .INPUT(pos, TensorType({DT_INT32, DT_INT64}))
  119. .INPUT(len, TensorType({DT_INT32, DT_INT64}))
  120. .OUTPUT(output, TensorType({DT_STRING}))
  121. .OP_END_FACTORY_REG(Substr)
  122. /**
  123. *@brief Converts each string in the input Tensor to its hash modulo a number of buckets.
  124. *@par Inputs:
  125. include: \n
  126. *x:The strings to assign a hash bucket.
  127. *@par Outputs:
  128. *y:An int64 Tensor of the same shape as the input x. The number of buckets is set by the num_buckets attribute.
  129. *@attention Constraints:\n
  130. *The hash function is deterministic on the content of the string within\n
  131. *the process and will never change. However, it is not suitable for cryptography.\n
  132. *This function may be used when CPU time is scarce and inputs are trusted or\n
  133. *unimportant. There is a risk of adversaries constructing inputs that all hash\n
  134. *to the same bucket. To prevent this problem, use a strong hash function with\n
  135. *tf.string_to_hash_bucket_strong.
  136. *@see StringToHashBucketFast()
  137. *@par Third-party framework compatibility
  138. *Compatible with the StringToHashBucketFast op of TensorFlow.
  139. */
  140. REG_OP(StringToHashBucketFast)
  141. .INPUT(x, TensorType({DT_STRING}))
  142. .OUTPUT(y, TensorType({DT_INT64}))
  143. .ATTR(num_buckets, Int, 1)
  144. .OP_END_FACTORY_REG(StringToHashBucketFast)
  145. /**
  146. *@brief Converts each string in the input Tensor to its hash modulo a number of buckets.
  147. *@par Inputs:
  148. include: \n
  149. *x:The strings to assign a hash bucket.
  150. *@par Attributes:
  151. *num_buckets:The number of buckets. key:A required list of ints; presumably the seed "key" described under Constraints (TODO confirm).
  152. *@par Outputs:
  153. *y:An int64 Tensor of the same shape as the input x.
  154. *@attention Constraints:\n
  155. *@li A strong hash is important when inputs may be malicious, e.g. URLs with\n
  156. *additional components. Adversaries could try to make their inputs hash to\n
  157. *the same bucket for a denial-of-service attack or to skew the results.\n
  158. *A strong hash can be used to make it difficult to find inputs with a skewed\n
  159. *hash value distribution over buckets. This requires that the hash function\n
  160. *is seeded by a high-entropy (random) "key" unknown to the adversary.
  161. *@li The additional robustness comes at a cost of roughly 4x higher\n
  162. *compute time than tf.string_to_hash_bucket_fast.
  163. *@see StringToHashBucketStrong()
  164. *@par Third-party framework compatibility
  165. *Compatible with the StringToHashBucketStrong op of TensorFlow.
  166. */
  167. REG_OP(StringToHashBucketStrong)
  168. .INPUT(x, TensorType({DT_STRING}))
  169. .OUTPUT(y, TensorType({DT_INT64}))
  170. .ATTR(num_buckets, Int, 1)
  171. .REQUIRED_ATTR(key, ListInt)
  172. .OP_END_FACTORY_REG(StringToHashBucketStrong)
  173. /**
  174. *@brief Converts each string in the input Tensor to its hash modulo a number of buckets.
  175. *@par Inputs:
  176. include: \n
  177. *string_tensor:The strings to assign a hash bucket.
  178. *@par Attributes:
  179. *num_buckets:The number of buckets. Defaults to 1.
  180. *@par Outputs:
  181. *y:An int64 Tensor of the same shape as the input string_tensor.
  182. *@see StringToHashBucket()
  183. *@par Third-party framework compatibility
  184. *Compatible with the StringToHashBucket op of TensorFlow.
  185. */
  186. REG_OP(StringToHashBucket)
  187. .INPUT(string_tensor, TensorType({DT_STRING}))
  188. .OUTPUT(y, TensorType({DT_INT64}))
  189. .ATTR(num_buckets, Int, 1)
  190. .OP_END_FACTORY_REG(StringToHashBucket)
  191. /**
  192. *@brief Strips leading and trailing whitespace from the Tensor.
  193. *@par Inputs:
  194. include: \n
  195. *x:A string Tensor of any shape.
  196. *@par Outputs:
  197. *y:A string Tensor of the same shape as the input.
  198. *@see StringStrip()
  199. *@par Third-party framework compatibility
  200. *Compatible with the StringStrip op of TensorFlow.
  201. */
  202. REG_OP(StringStrip)
  203. .INPUT(x, TensorType({DT_STRING}))
  204. .OUTPUT(y, TensorType({DT_STRING}))
  205. .OP_END_FACTORY_REG(StringStrip)
  206. /**
  207. *@brief Computes the length of each string given in the input tensor.
  208. *@par Inputs:
  209. include: \n
  210. *x:The strings for which to compute the length.
  211. *@par Attributes:
  212. *unit:The unit that is counted to compute string length. Defaults to "BYTE".\n
  213. *One of: "BYTE" (for the number of bytes in each string) or\n
  214. *"UTF8_CHAR" (for the number of UTF-8 encoded Unicode code points in each string).\n
  215. *Results are undefined if unit=UTF8_CHAR and the input strings do not contain\n
  216. *structurally valid UTF-8.
  217. *@par Outputs:
  218. *y:An int32 tensor that has the same shape as the input.\n
  219. *The output contains the element-wise string lengths of the input.
  220. *@see StringLength()
  221. *@par Third-party framework compatibility
  222. *Compatible with the StringLength op of TensorFlow.
  223. */
  224. REG_OP(StringLength)
  225. .INPUT(x, TensorType({DT_STRING}))
  226. .OUTPUT(y, TensorType({DT_INT32}))
  227. .ATTR(unit, String, "BYTE")
  228. .OP_END_FACTORY_REG(StringLength)
  229. /**
  230. *@brief Joins the strings in the given list of string tensors into one tensor.
  231. *@par Inputs:
  232. *NOTE(review): the regex-matching description that previously appeared\n
  233. *here belongs to RegexFullMatch and does not describe StringJoin; it\n
  234. *looks like a copy-paste error.\n
  235. *StringJoin is registered with a dynamic input of string tensors and an\n
  236. *optional separator attribute.\n
  237. include: \n
  238. *x:A list of string tensors. The tensors must all have the same shape,\n
  239. *or be scalars. Scalars may be mixed in; these will be broadcast to the shape\n
  240. *of non-scalar inputs.
  241. *@par Attributes:
  242. *@li N:A required int, the length of input x.
  243. *@li separator:A string, an optional join separator. Defaults to "".
  244. *@par Outputs:
  245. *y:The output string tensor.
  246. *@see StringJoin()
  247. *@par Third-party framework compatibility
  248. *Compatible with the StringJoin op of TensorFlow.
  249. */
  250. REG_OP(StringJoin)
  251. .DYNAMIC_INPUT(x, TensorType({DT_STRING}))
  252. .OUTPUT(y, TensorType({DT_STRING}))
  253. .REQUIRED_ATTR(N, Int)
  254. .ATTR(separator, String, "")
  255. .OP_END_FACTORY_REG(StringJoin)
  256. /**
  257. *@brief Formats a string template using a list of tensors.
  258. *@par Inputs:
  259. *NOTE(review): the regex-matching description that previously appeared\n
  260. *here belongs to RegexFullMatch and does not describe StringFormat; it\n
  261. *looks like a copy-paste error.\n
  262. *StringFormat is registered with a dynamic input whose accepted dtypes\n
  263. *are listed in the registration below.\n
  264. include: \n
  265. *x:The tensors to format into the placeholder string.
  266. *@par Attributes:
  267. *@li template:A string, the template to format tensor summaries into. Defaults to "%s".
  268. *@li placeholder:A string; at each placeholder in the template a subsequent tensor summary will be inserted. Defaults to "%s".
  269. *@li summarize:An int. When formatting the tensor summaries, print the first and last summarize entries of each tensor dimension. Defaults to 3.
  270. *@par Outputs:
  271. *y:The resulting string scalar.
  272. *@see StringFormat()
  273. *@par Third-party framework compatibility
  274. *Compatible with the StringFormat op of TensorFlow.
  275. */
  276. REG_OP(StringFormat)
  277. .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
  278. DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_STRING, DT_FLOAT16, \
  279. DT_FLOAT, DT_DOUBLE, DT_BOOL}))
  280. .OUTPUT(y, TensorType({DT_STRING}))
  281. .ATTR(template, String, "%s")
  282. .ATTR(placeholder, String, "%s")
  283. .ATTR(summarize, Int, 3)
  284. .OP_END_FACTORY_REG(StringFormat)
  285. /**
  286. *@brief Checks if the input matches the regex pattern.
  287. *@par Inputs:
  288. *The input is a string tensor of any shape. The pattern is a scalar string tensor\n
  289. *which is applied to every element of the input tensor. The boolean values\n
  290. *(True or False) of the output tensor indicate if the input matches the regex\n
  291. *pattern provided. The pattern follows the re2 syntax\n
  292. *(https://github.com/google/re2/wiki/Syntax).\n
  293. include: \n
  294. *@li x:A string tensor of the text to be processed.
  295. *@li pattern:A scalar string tensor containing the regular expression to match the input.
  296. *@par Outputs:
  297. *y:A bool tensor with the same shape as the input.
  298. *@see RegexFullMatch()
  299. *@par Third-party framework compatibility
  300. *Compatible with the RegexFullMatch op of TensorFlow.
  301. */
  302. REG_OP(RegexFullMatch)
  303. .INPUT(x, TensorType({DT_STRING}))
  304. .INPUT(pattern, TensorType({DT_STRING}))
  305. .OUTPUT(y, TensorType({DT_BOOL}))
  306. .OP_END_FACTORY_REG(RegexFullMatch)
  307. /**
  308. *@brief Replaces matches of the pattern regular expression in input with the\n
  309. *replacement string provided in rewrite.
  310. *@par Inputs:
  311. *The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax).\n
  312. include: \n
  313. *@li x:The text to be processed.
  314. *@li pattern:The regular expression to be matched in the input strings.
  315. *@li rewrite:The rewrite string to be substituted for the pattern expression\n
  316. *where it is matched in the input strings.
  317. *@par Attributes:
  318. *replace_global:A bool, defaults to true. If True, the replacement is global\n
  319. *(that is, all matches of the pattern regular expression in each input string\n
  320. *are rewritten), otherwise the rewrite substitution is only made for the first\n
  321. *pattern match.
  322. *@par Outputs:
  323. *y:The text after applying pattern match and rewrite substitution.
  324. *@see RegexReplace()
  325. *@par Third-party framework compatibility
  326. *Compatible with the RegexReplace op of TensorFlow.
  327. */
  328. REG_OP(RegexReplace)
  329. .INPUT(x, TensorType({DT_STRING}))
  330. .INPUT(pattern, TensorType({DT_STRING}))
  331. .INPUT(rewrite, TensorType({DT_STRING}))
  332. .OUTPUT(y, TensorType({DT_STRING}))
  333. .ATTR(replace_global, Bool, true)
  334. .OP_END_FACTORY_REG(RegexReplace)
  335. /**
  336. *@brief Converts each entry in the given tensor to strings.
  337. *@par Inputs:
  338. *Supports many numeric types and boolean (see the dtype list in the registration).\n
  339. include: \n
  340. *x:A tensor that can be converted to string.
  341. *@par Attributes:
  342. *@li precision:The post-decimal precision to use for floating point numbers.\n
  343. *Only used if precision > -1. Defaults to -1.
  344. *@li scientific:Use scientific notation for floating point numbers. Defaults to false.
  345. *@li shortest:Use shortest representation (either scientific or standard)\n
  346. *for floating point numbers. Defaults to false.
  347. *@li width:Pad pre-decimal numbers to this width. Applies to both floating\n
  348. *point and integer numbers. Only used if width > -1. Defaults to -1.
  349. *@li fill:The value to pad if width > -1. If empty (the default), pads with spaces.\n
  350. *Another typical value is '0'. String cannot be longer than 1 character.
  351. *@par Outputs:
  352. *y:The output string tensor.
  353. *@see AsString()
  354. *@par Third-party framework compatibility
  355. *Compatible with the AsString op of TensorFlow.
  356. */
  357. REG_OP(AsString)
  358. .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
  359. DT_DOUBLE, DT_BOOL}))
  360. .OUTPUT(y, TensorType({DT_STRING}))
  361. .ATTR(precision, Int, -1)
  362. .ATTR(scientific, Bool, false)
  363. .ATTR(shortest, Bool, false)
  364. .ATTR(width, Int, -1)
  365. .ATTR(fill, String, "")
  366. .OP_END_FACTORY_REG(AsString)
  367. /**
  368. *@brief Encodes strings into web-safe base64 format.
  369. *@par Inputs:
  370. *Whether the encoded output is padded at the end is controlled by the pad attribute.\n
  371. *Web-safe means that the encoder uses - and _ instead of + and /.\n
  372. include: \n
  373. *x:Strings to be encoded.
  374. *@par Attributes:
  375. *pad:A bool, whether padding is applied at the ends. Defaults to false.
  376. *@par Outputs:
  377. *y:Input strings encoded in base64.
  378. *@attention Constraints:\n
  379. *Refer to the following article for more information on base64 format:\n
  380. *en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '='\n
  381. *at the end so that the encoded string has a length that is a multiple of 4.\n
  382. *See Padding section of the link above. Web-safe means that the encoder\n
  383. *uses - and _ instead of + and /.
  384. *@see EncodeBase64()
  385. *@par Third-party framework compatibility
  386. *Compatible with the EncodeBase64 op of TensorFlow.
  387. */
  388. REG_OP(EncodeBase64)
  389. .INPUT(x, TensorType({DT_STRING}))
  390. .OUTPUT(y, TensorType({DT_STRING}))
  391. .ATTR(pad, Bool, false)
  392. .OP_END_FACTORY_REG(EncodeBase64)
  393. /**
  394. *@brief Decodes web-safe base64-encoded strings.
  395. *@par Inputs:
  396. *Input may or may not have padding at the end. See EncodeBase64 for padding.\n
  397. *Web-safe means that input must use - and _ instead of + and /.\n
  398. include: \n
  399. *x:Web-safe base64 strings to decode.
  400. *@par Outputs:
  401. *y:Decoded strings.
  402. *@see DecodeBase64()
  403. *@par Third-party framework compatibility
  404. *Compatible with the DecodeBase64 op of TensorFlow.
  405. */
  406. REG_OP(DecodeBase64)
  407. .INPUT(x, TensorType({DT_STRING}))
  408. .OUTPUT(y, TensorType({DT_STRING}))
  409. .OP_END_FACTORY_REG(DecodeBase64)
  410. } // namespace ge
  411. #endif // GE_OP_STRING_OPS_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示。