!2092 upgrade Ascend package 18 Nov 21

Merge pull request !2092 from yanghaoran/release
4 years ago · b55217c85b
--- a/inc/external/acl/error_codes/rt_error_codes.h
+++ b/inc/external/acl/error_codes/rt_error_codes.h
@@ -60,6 +60,7 @@ static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011;      // no cdq resou
 static const int32_t ACL_ERROR_RT_OVER_LIMIT = 207012;           // over limit
 static const int32_t ACL_ERROR_RT_QUEUE_EMPTY = 207013;          // queue is empty
 static const int32_t ACL_ERROR_RT_QUEUE_FULL = 207014;           // queue is full
 static const int32_t ACL_ERROR_RT_REPEATED_INIT = 207015;        // repeated init

 static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000;              // runtime internal error
 static const int32_t ACL_ERROR_RT_TS_ERROR = 507001;                    // ts internal error
--- a/inc/framework/common/ge_types.h
+++ b/inc/framework/common/ge_types.h
@@ -293,6 +293,7 @@ struct OpDescInfo {
  std::string dev_func;
  std::string tvm_magic;
  uint32_t tiling_key = 0U;
  uintptr_t args = 0U;
  std::string tiling_data;
  std::string node_info;
  std::vector<int64_t> workspace_bytes;
--- a/+ 1
+++ b/+ 1
@@ -1 +1 @@
 Subproject commit 7d777404b3b7fe7daeaf00e566e431c6a05b040a
 Subproject commit fe47d04d75170006fc0d28538dec49a2da426ceb
--- a/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h
+++ b/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h
@@ -58,6 +58,10 @@ static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE         = 207008; // no str
 static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE         = 207009; // no notify resource
 static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE          = 207010; // no model resource
 static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE            = 207011; // no cdq resource
 static const int32_t ACL_ERROR_RT_OVER_LIMIT                 = 207012; // over limit
 static const int32_t ACL_ERROR_RT_QUEUE_EMPTY                = 207013; // queue is empty
 static const int32_t ACL_ERROR_RT_QUEUE_FULL                 = 207014; // queue is full
 static const int32_t ACL_ERROR_RT_REPEATED_INIT              = 207015; // repeated init

 static const int32_t ACL_ERROR_RT_INTERNAL_ERROR             = 507000; // runtime internal error
 static const int32_t ACL_ERROR_RT_TS_ERROR                   = 507001; // ts internal error
@@ -97,6 +101,10 @@ static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT        = 507034; // vector
 static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION      = 507035; // vector core exception
 static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception
 static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL         = 507037; // cdq alloc batch abnormal
 static const int32_t ACL_ERROR_RT_DIE_MODE_CHANGE_ERROR      = 507038; // can not change die mode
 static const int32_t ACL_ERROR_RT_DIE_SET_ERROR              = 507039; // single die mode can not set die
 static const int32_t ACL_ERROR_RT_INVALID_DIEID              = 507040; // invalid die id
 static const int32_t ACL_ERROR_RT_DIE_MODE_NOT_SET           = 507041; // die mode not set

 static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR         = 507899; // drv internal error
 static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR       = 507900; // aicpu internal error
@@ -105,5 +113,4 @@ static const int32_t ACL_ERROR_RT_SOCKET_CLOSE               = 507901; // hdc di
 #ifdef __cplusplus
 }
 #endif

 #endif // __INC_EXTERNEL_RT_ERROR_CODES_H__
--- a/third_party/fwkacllib/inc/ops/array_ops.h
+++ b/third_party/fwkacllib/inc/ops/array_ops.h
@@ -497,6 +497,25 @@ REG_OP(Constant)
    .ATTR(value, Tensor, Tensor())
    .OP_END_FACTORY_REG(Constant)

 /**
 *@brief Creates a file constant tensor. The operator is used to process very large weights that are stored in a file. \n

 *@par Attributes:
 *@li file_id: A required string, used to record the file id.
 *@li shape: A required list of ints. The data shape.
 *@li dtype: A required type. The data type. \n

 *@par Outputs:
 *y: The FileConstant tensor. \n
 */
 REG_OP(FileConstant)
    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
        DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
    .REQUIRED_ATTR(file_id, String)
    .REQUIRED_ATTR(shape, ListInt)
    .REQUIRED_ATTR(dtype, Type)
    .OP_END_FACTORY_REG(FileConstant)

 /**
 *@brief Returns a copy of the input tensor. \n

@@ -1329,31 +1348,6 @@ REG_OP(ExpandD)
    .REQUIRED_ATTR(shape, ListInt)
    .OP_END_FACTORY_REG(ExpandD)

 /**
 * @brief Calculate buckets limit and offset. \n

 * @par Inputs:
 * Three inputs, including:
 * @li bucket_list: A 1-D tensor of type int32 with the value of ivf_counts and ivf_offset index. \n
 * @li ivf_counts: A 1-D tensor of type int32 with the value of ivf counts. \n
 * @li ivf_offset: A 1-D tensor of type int32 or int64 with the value of ivf offset. \n

 * @par Attributes:
 * total_limit: A required attribute of type int64. The maximum value of the sum of ivf_counts
 * corresponding to bucket_list. \n

 * @par Outputs:
 * @li buckets_limit: A 1-D tensor of type int32 with the sum <= total_limit. \n
 * @li buckets_offset: A 1-D tensor of type int32 or int64 with the value of ivf_offset corresponding to bucket_list. \n
 */
 REG_OP(CalcBucketsLimitAndOffset)
    .INPUT(bucket_list, TensorType({DT_INT32}))
    .INPUT(ivf_counts, TensorType({DT_INT32}))
    .INPUT(ivf_offset, TensorType({DT_INT32, DT_INT64}))
    .OUTPUT(buckets_limit, TensorType({DT_INT32}))
    .OUTPUT(buckets_offset, TensorType({DT_INT32, DT_INT64}))
    .REQUIRED_ATTR(total_limit, Int)
    .OP_END_FACTORY_REG(CalcBucketsLimitAndOffset)

 /**
 *@brief Get dim number in tensordesc. \n

@@ -1362,6 +1356,9 @@ REG_OP(CalcBucketsLimitAndOffset)

 *@par Outputs:
 *y: A 1D tensor. The data type must be int32. \n

 *@par Restrictions:
 *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
 */
 REG_OP(GetShape)
    .DYNAMIC_INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
@@ -1377,8 +1374,13 @@ REG_OP(GetShape)

 *@par outputs:
 * y: a tensor_desc, type is int.\n

 *@par Restrictions:
 *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
 */
 REG_OP(UpdateTensorDesc)
    .INPUT(x, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT32, DT_UINT8,
                          DT_INT64, DT_UINT64, DT_INT16, DT_UINT16, DT_DOUBLE}))
    .OUTPUT(y, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT32, DT_UINT8,
                           DT_INT64, DT_UINT64, DT_INT16, DT_UINT16, DT_DOUBLE}))
    .REQUIRED_ATTR(shape, ListInt)
--- a/third_party/fwkacllib/inc/ops/image_ops.h
+++ b/third_party/fwkacllib/inc/ops/image_ops.h
@@ -585,6 +585,14 @@ REG_OP(ResizeNearestNeighborV2GradD)
 *@li original_image: A Tensor. 4-D shape. Must set the format, supported format list ["NCHW, NHWC"]
 channels], The image tensor that was resized . \n

 *@par Attributes:
 *@li size: An optional listint. Defaults to {}.
 *@par Attributes:
 *@li ori_image_size: An optional listint. Defaults to {}.
 *@par Attributes:
 *@li src_start_w: An optional int. Defaults to 0.
 *@par Attributes:
 *@li dst_start_w: An optional int. Defaults to 0.
 *@par Attributes:
 *@li align_corners: An optional bool. Defaults to False. If true, the centers of
 the 4 corner pixels of the input and grad tensors are aligned. Defaults to
@@ -606,6 +614,10 @@ REG_OP(ResizeBilinearV2Grad)
    .INPUT(grads, TensorType({DT_FLOAT}))
    .INPUT(original_image, TensorType::FloatingDataType())
    .OUTPUT(y, TensorType({DT_FLOAT}))
    .ATTR(size, ListInt, {})
    .ATTR(ori_image_size, ListInt, {})
    .ATTR(src_start_w, Int, 0)
    .ATTR(dst_start_w, Int, 0)
    .ATTR(align_corners, Bool, false)
    .ATTR(half_pixel_centers, Bool, false)
    .OP_END_FACTORY_REG(ResizeBilinearV2Grad)
@@ -624,7 +636,10 @@ size for the images . \n
 output tensors are aligned, preserving the values at the corner pixels.
 Defaults to false .
 * @li half_pixel_centers: An optional bool. Defaults to False . \n

 *@li ori_image_size: An optional listint. Defaults to {}.
 *@li split_size: An optional listint. Defaults to {}.
 *@li src_start_w: An optional int. Defaults to 0.
 *@li dst_start_w: An optional int. Defaults to 0.
 *@par Outputs:
 *y: 4-D with shape [batch, new_height, new_width, channels] . \n

@@ -640,6 +655,10 @@ REG_OP(ResizeBilinearV2)
                               DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .INPUT(size, TensorType({DT_INT32}))
    .OUTPUT(y, TensorType({DT_FLOAT}))
    .ATTR(ori_image_size, ListInt, {})
    .ATTR(split_size, ListInt, {})
    .ATTR(src_start_w, Int, 0)
    .ATTR(dst_start_w, Int, 0)
    .ATTR(align_corners, Bool, false)
    .ATTR(half_pixel_centers, Bool, false)
    .OP_END_FACTORY_REG(ResizeBilinearV2)
--- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
@@ -113,9 +113,7 @@ if input "x" is with format NC1HWC0. Specifies the mean of "x".
 Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x".
 *@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
 Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
 *@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
 Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output .
 *@li reserve_space_3: An optional Tensor of type float32. For compatibility with tensorflow, only has one useless element. \n
 *@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. \n

 *@attention Constraints:
 *@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
@@ -137,7 +135,6 @@ REG_OP(BatchNorm)
    .OUTPUT(batch_variance, TensorType({DT_FLOAT}))
    .OUTPUT(reserve_space_1, TensorType({DT_FLOAT}))
    .OUTPUT(reserve_space_2, TensorType({DT_FLOAT}))
    .OUTPUT(reserve_space_3, TensorType({DT_FLOAT}))
    .ATTR(epsilon, Float, 0.0001)
    .ATTR(data_format, String, "NHWC")
    .ATTR(is_training, Bool, true)
@@ -166,6 +163,33 @@ REG_OP(SyncBatchNormBackwardReduce)
    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
    .OP_END_FACTORY_REG(SyncBatchNormBackwardReduce)

 /**
 *@brief Part of SyncBatchNormBackward . \n

 *@par Inputs:
 * Seven inputs, including:
 *@li grad_output: A Tensor. Must be one of the following types: float16, float32 .
 *@li save_input: A Tensor. Must be one of the following types: float16, float32 .
 *@li mean: A Tensor. Must be one of the following types: float16, float32 .
 *@li invstd: A Tensor. Must be one of the following types: float16, float32 .
 *@li weight: A Tensor. Must be one of the following types: float16, float32 .
 *@li mean_dy: A Tensor. Must be one of the following types: float16, float32 .
 *@li mean_dy_xmu: A Tensor. Must be one of the following types: float16, float32 . \n

 *@par Outputs:
 *@li grad_input: A Tensor. Has the same type and format as input "grad_output" . \n
 */
 REG_OP(SyncBatchNormBackwardElemt)
    .INPUT(grad_output, TensorType({DT_FLOAT16, DT_FLOAT}))
    .INPUT(save_input, TensorType({DT_FLOAT16, DT_FLOAT}))
    .INPUT(mean, TensorType({DT_FLOAT16, DT_FLOAT}))
    .INPUT(invstd, TensorType({DT_FLOAT16, DT_FLOAT}))
    .INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT}))
    .INPUT(mean_dy, TensorType({DT_FLOAT16, DT_FLOAT}))
    .INPUT(mean_dy_xmu, TensorType({DT_FLOAT16, DT_FLOAT}))
    .OUTPUT(grad_input, TensorType({DT_FLOAT16, DT_FLOAT}))
    .OP_END_FACTORY_REG(SyncBatchNormBackwardElemt)
    
 /**
 *@brief Performs batch normalization . \n

@@ -285,8 +309,7 @@ REG_OP(BatchNormExt2)
 *@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0.
 *@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0.
 *@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm.
 *@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm .
 *@li reserve_space_3: A 1D optional Tensor of type float32. It is an output of BatchNorm . \n
 *@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm . \n

 *@par Attributes:
 *@li epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x".
@@ -313,7 +336,6 @@ REG_OP(BatchNormGrad)
    .INPUT(scale, TensorType({DT_FLOAT}))
    .INPUT(reserve_space_1, TensorType({DT_FLOAT}))
    .INPUT(reserve_space_2, TensorType({DT_FLOAT}))
    .OPTIONAL_INPUT(reserve_space_3, TensorType({DT_FLOAT}))
    .OUTPUT(x_backprop, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(scale_backprop, TensorType({DT_FLOAT}))
    .OUTPUT(offset_backprop, TensorType({DT_FLOAT}))
--- a/third_party/fwkacllib/inc/ops/ocr_ops.h
+++ b/third_party/fwkacllib/inc/ops/ocr_ops.h
@@ -128,7 +128,7 @@ REG_OP(OCRIdentifyPreHandle)
    .INPUT(imgs_offset, TensorType({DT_INT32}))
    .INPUT(imgs_size, TensorType({DT_INT32}))
    .OUTPUT(resized_imgs, TensorType({DT_UINT8}))
    .ATTR(size, ListInt, {})
    .REQUIRED_ATTR(size, ListInt)
    .ATTR(data_format, String, "NHWC")
    .OP_END_FACTORY_REG(OCRIdentifyPreHandle)

@@ -247,6 +247,7 @@ REG_OP(OCRDetectionPostHandle)
 *@li clipped_polys_data: A Tensor of type int32. point data of every clipped poly. \n
 *@li clipped_polys_offset: A Tensor of type int32. Offset of every clipped poly . \n
 *@li clipped_polys_size: A Tensor of type int32. Size of every clipped poly. \n
 *@li clipped_polys_num: A Tensor of type int32. Number of clipped polys. \n
 */
 REG_OP(ResizeAndClipPolys)
    .INPUT(polys_data, TensorType({DT_INT32}))
@@ -259,6 +260,7 @@ REG_OP(ResizeAndClipPolys)
    .OUTPUT(clipped_polys_data, TensorType({DT_INT32}))
    .OUTPUT(clipped_polys_offset, TensorType({DT_INT32}))
    .OUTPUT(clipped_polys_size, TensorType({DT_INT32}))
    .OUTPUT(clipped_polys_num, TensorType({DT_INT32}))
    .OP_END_FACTORY_REG(ResizeAndClipPolys);


--- a/third_party/fwkacllib/inc/ops/reduce_ops.h
+++ b/third_party/fwkacllib/inc/ops/reduce_ops.h
@@ -1305,6 +1305,27 @@ REG_OP(ReduceStdWithMean)
    .ATTR(invert, Bool, false)
    .ATTR(epsilon, Float, 0.001)
    .OP_END_FACTORY_REG(ReduceStdWithMean)

 /**
 *@brief Performs reduced batch normalization . \n

 *@par Inputs:
 *x: A 5D Tensor of type float16 or float32, with format NC1HWC0 . \n

 *@par Attributes:
 *@li axes: An optional listint. Defaults to {}. Presumably the dimensions to reduce - TODO confirm.
 *@li keep_dims: An optional bool. Defaults to true. Presumably whether reduced dimensions are
 * retained - TODO confirm. \n

 *@par Outputs:
 *@li mean: A Tensor of type float16 or float32 for SUM reduced "x".
 *@li variance: A Tensor of type float16 or float32 for square sum reduced "x" . \n

 *@par Restrictions:
 * Warning: THIS FUNCTION IS EXPERIMENTAL.  Please do not use.
 */
 REG_OP(ReduceMeanVariance)
    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(mean, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(variance, TensorType({DT_FLOAT16,DT_FLOAT}))
    .ATTR(axes, ListInt, {})
    .ATTR(keep_dims, Bool, true)
    .OP_END_FACTORY_REG(ReduceMeanVariance)
 } //namespace ge

 #endif  // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_
--- a/third_party/fwkacllib/inc/ops/vector_search.h
+++ b/third_party/fwkacllib/inc/ops/vector_search.h
@@ -78,8 +78,8 @@ REG_OP(TopKPQDistance)
    .OUTPUT(topk_ivf, TensorType({DT_INT32}))
    .OUTPUT(topk_index, TensorType({DT_INT32}))
    .ATTR(order, String, "ASC")
    .ATTR(k, Int, 0)
    .ATTR(group_size, Int, 0)
    .REQUIRED_ATTR(k, Int)
    .REQUIRED_ATTR(group_size, Int)
    .OP_END_FACTORY_REG(TopKPQDistance)

 /**
@@ -129,6 +129,68 @@ REG_OP(ScanPQCodes)
    .ATTR(split_count, Int, 1)
    .ATTR(split_index, Int, 0)
    .OP_END_FACTORY_REG(ScanPQCodes)

 /**
 * @brief Calculate buckets limit and offset. \n

 * @par Inputs:
 * Three inputs, including:
 * @li bucket_list: A 1-D tensor of type int32 with the value of ivf_counts and ivf_offset index. \n
 * @li ivf_counts: A 1-D tensor of type int32 with the value of ivf counts. \n
 * @li ivf_offset: A 1-D tensor of type int32 or int64 with the value of ivf offset. \n

 * @par Attributes:
 * total_limit: A required attribute of type int64. The maximum value of the sum of ivf_counts
 * corresponding to bucket_list. \n

 * @par Outputs:
 * @li buckets_limit: A 1-D tensor of type int32 with the sum <= total_limit. \n
 * @li buckets_offset: A 1-D tensor of type int32 or int64 with the value of ivf_offset corresponding to bucket_list. \n
 */
 REG_OP(CalcBucketsLimitAndOffset)
    .INPUT(bucket_list, TensorType({DT_INT32}))
    .INPUT(ivf_counts, TensorType({DT_INT32}))
    .INPUT(ivf_offset, TensorType({DT_INT32, DT_INT64}))
    .OUTPUT(buckets_limit, TensorType({DT_INT32}))
    .OUTPUT(buckets_offset, TensorType({DT_INT32, DT_INT64}))
    .REQUIRED_ATTR(total_limit, Int)
    .OP_END_FACTORY_REG(CalcBucketsLimitAndOffset)

 /**
 * @brief Calculate ProdVirialSeA. \n
 *
 * @par Inputs:
 * Five inputs, including:
 * @li net_deriv: A Tensor. Must be one of the following types: float16, float32, float64.
 * @li in_deriv: A Tensor. Must be one of the following types: float16, float32, float64.
 * @li rij: A Tensor. Must be one of the following types: float16, float32, float64.
 * @li nlist: A Tensor. dtype is int32.
 * @li natoms: A Tensor. dtype is int32. \n
 *
 * @par Outputs:
 * Two outputs, including:
 * @li virial: A Tensor. Must be one of the following types: float16, float32, float64.
 * @li atom_virial: A Tensor. Must be one of the following types: float16, float32, float64. \n
 *
 * @par Attributes:
 * Three attributes, including:
 * @li n_a_sel: A required int Scalar.
 * @li n_r_sel: A required int Scalar.
 * @li nall: An optional int. Defaults to 28328. \n
 *
 * @par Restrictions:
 * Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
 */
 REG_OP(ProdVirialSeA)
    .INPUT(net_deriv, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .INPUT(in_deriv, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .INPUT(rij, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .INPUT(nlist, TensorType({DT_INT32}))
    .INPUT(natoms, TensorType({DT_INT32}))
    .OUTPUT(virial, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .OUTPUT(atom_virial, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .REQUIRED_ATTR(n_a_sel, Int)
    .REQUIRED_ATTR(n_r_sel, Int)
    .ATTR(nall, Int, 28328)
    .OP_END_FACTORY_REG(ProdVirialSeA)
 } // namespace ge

 #endif  // OPS_BUILT_IN_OP_PROTO_INC_VECTOR_SEARCH_H_
--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@@ -14,8 +14,8 @@
 * limitations under the License.
 */

 #ifndef __CCE_RUNTIME_BASE_H__
 #define __CCE_RUNTIME_BASE_H__
 #ifndef CCE_RUNTIME_BASE_H
 #define CCE_RUNTIME_BASE_H

 #include <stdint.h>
 #include "toolchain/prof_callback.h"
@@ -443,4 +443,4 @@ RTS_API rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId);
 }
 #endif

 #endif  // __CCE_RUNTIME_BASE_H__
 #endif  // CCE_RUNTIME_BASE_H
--- a/third_party/fwkacllib/inc/runtime/config.h
+++ b/third_party/fwkacllib/inc/runtime/config.h
@@ -239,8 +239,18 @@ RTS_API rtError_t rtSetOpWaitTimeOut(uint32_t timeout);
 */
 RTS_API rtError_t rtSetOpExecuteTimeOut(uint32_t timeout);

 /**
 * @ingroup
 * @brief query whether heterogeneous mode is enabled.
 * @param [out] heterogenous   1: heterogeneous mode (isHeterogenous=1 was read from the ini file);
 *                             0: not heterogeneous mode (ini file not found, error when reading ini,
 *                                or the Heterogenous value is not 1)
 * @return RT_ERROR_NONE for ok
 */
 RTS_API rtError_t rtGetIsHeterogenous(int32_t *heterogenous);

 #if defined(__cplusplus)
 }
 #endif

 #endif // CCE_RUNTIME_CONFIG_H
 #endif // CCE_RUNTIME_CONFIG_H
--- a/third_party/fwkacllib/inc/runtime/dev.h
+++ b/third_party/fwkacllib/inc/runtime/dev.h
@@ -25,7 +25,7 @@ extern "C" {

 #define RT_CAPABILITY_SUPPORT     (0x1U)
 #define RT_CAPABILITY_NOT_SUPPORT (0x0U)
 #define MEMORY_INFO_TS_4G_LIMITED (0x0) // for compatibility
 #define MEMORY_INFO_TS_4G_LIMITED (0x0U) // for compatibility

 typedef struct tagRTDeviceInfo {
    uint8_t env_type;  // 0: FPGA  1: EMU 2: ESL
@@ -171,6 +171,15 @@ RTS_API rtError_t rtSetDevice(int32_t device);
 */
 RTS_API rtError_t rtSetDeviceV2(int32_t device, rtDeviceMode deviceMode);

 /**
 * @ingroup dvrt_dev
 * @brief get deviceMode
 * @param [out] deviceMode   the device mode
 * @return RT_ERROR_NONE for ok
 * @return RT_ERROR_INVALID_VALUE for error input
 */
 RTS_API rtError_t rtGetDeviceMode(rtDeviceMode *deviceMode);

 /**
 * @ingroup dvrt_dev
 * @brief set target die for current thread
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -133,8 +133,11 @@ typedef struct tagRtArgsWithTiling {
    uint16_t tilingDataOffset;      // tiling data offset
    uint16_t hostInputAddrOffset;   // index of host_memory input in inputs_addrs list
    uint16_t hostInputDataOffset;   // host_mem input data offset
    bool hasHostMemInput;           // has host_memory input data in args or not: ture or false
    uint8_t reserved[7];
    uint8_t hasHostMemInput;        // has host_memory input data in args or not: 0 means no host_memory input data,
                                    // others means has host_memory input data.
    uint8_t isNoNeedH2DCopy;        // is no need host to device copy: 0 means need H2D copy,
                                    // others means doesn't need H2D copy.
    uint8_t reserved[6];
 } rtArgsWithTiling_t;

 /**
@@ -299,8 +302,8 @@ RTS_API rtError_t rtDependencyRegister(void *mHandle, void *sHandle);
 * @return RT_ERROR_NONE for ok
 * @return RT_ERROR_INVALID_VALUE for error input
 */
 RTS_API rtError_t rtFunctionRegister(void *binHandle, const void *stubFunc, const char_t *stubName, const void *devFunc,
                                     uint32_t funcMode);
 RTS_API rtError_t rtFunctionRegister(void *binHandle, const void *stubFunc, const char_t *stubName,
                                     const void *devFunc, uint32_t funcMode);

 /**
 * @ingroup rt_kernel
@@ -371,8 +374,9 @@ RTS_API rtError_t rtKernelLaunch(const void *stubFunc, uint32_t blockDim, void *
 * @return RT_ERROR_NONE for ok
 * @return RT_ERROR_INVALID_VALUE for error input
 */
 RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args, uint32_t argsSize,
                                            rtSmDesc_t *smDesc, rtStream_t stream_, const void *kernelInfo);
 RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim,
                                           void *args, uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stream_,
                                           const void *kernelInfo);

 /**
 * @ingroup rt_kernel
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -576,7 +576,7 @@ RTS_API rtError_t rtRDMASend(uint32_t index, uint32_t wqeIndex, rtStream_t strea
 * @return RT_ERROR_INVALID_VALUE for error input
 * @return RT_ERROR_DRV_ERR for driver error
 */
 RTS_API rtError_t rtSetIpcMemPid(const char_t *name, int32_t pid[], int num);
 RTS_API rtError_t rtSetIpcMemPid(const char_t *name, int32_t pid[], int32_t num);

 /**
 * @ingroup dvrt_mem
--- a/third_party/fwkacllib/inc/runtime/rt.h
+++ b/third_party/fwkacllib/inc/runtime/rt.h
@@ -31,5 +31,6 @@
 #include "rt_ffts.h"
 #include "rt_ffts_plus.h"
 #include "rt_dfx.h"
 #include "rt_mem_queue.h"

 #endif  // CCE_RUNTIME_RT_H
 #endif  // CCE_RUNTIME_RT_H
--- a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h
+++ b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h
@@ -23,6 +23,8 @@
 #define PROF_AICORE_METRICS         0x00000004
 #define PROF_AICPU_TRACE            0x00000008
 #define PROF_L2CACHE                0x00000010
 #define PROF_HCCL_TRACE             0x00000020
 #define PROF_TRAINING_TRACE         0x00000040

 // system profiling switch
 #define PROF_CPU                    0x00010000
@@ -41,10 +43,7 @@
 #define PROF_AIVECTORCORE_METRICS   0x0000020000000
 #define PROF_SUBTASK_TIME           0x0000040000000

 #define PROF_TRAINING_TRACE         0x0000080000000
 #define PROF_HCCL_TRACE             0x0000100000000

 #define PROF_TASK_TRACE             0x0000185000002
 #define PROF_TASK_TRACE             0x0000005000062

 #define PROF_MODEL_LOAD             0x8000000000000000

@@ -54,6 +53,8 @@
 #define PROF_AICORE_METRICS_MASK         0x00000004
 #define PROF_AICPU_TRACE_MASK            0x00000008
 #define PROF_L2CACHE_MASK                0x00000010
 #define PROF_HCCL_TRACE_MASK             0x00000020
 #define PROF_TRAINING_TRACE_MASK         0x00000040

 // system profiling mask
 #define PROF_CPU_MASK                    0x00010000
@@ -72,9 +73,6 @@
 #define PROF_AIVECTORCORE_METRICS_MASK   0x0000020000000
 #define PROF_SUBTASK_TIME_MASK           0x0000040000000

 #define PROF_TRAINING_TRACE_MASK         0x0000080000000
 #define PROF_HCCL_TRACE_MASK             0x0000100000000

 #define PROF_MODEL_LOAD_MASK             0x8000000000000000

 #if (defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER))