
Merge remote-tracking branch 'upstream/master'

tags/v1.1.0
taoxiangdong 4 years ago
commit dc72f5dcae
100 changed files with 4267 additions and 2629 deletions
  1. +19 -0 RELEASE.md
  2. +3 -0 ge/CMakeLists.txt
  3. +19 -8 ge/analyzer/analyzer.cc
  4. +2 -0 ge/analyzer/analyzer.h
  5. +2 -0 ge/client/CMakeLists.txt
  6. +1 -1 ge/client/ge_api.cc
  7. +4 -0 ge/client/module.mk
  8. +1 -0 ge/common/CMakeLists.txt
  9. +3 -8 ge/common/dump/dump_manager.cc
  10. +0 -1 ge/common/dump/dump_manager.h
  11. +1 -1 ge/common/dump/dump_op.cc
  12. +34 -3 ge/common/dump/dump_properties.cc
  13. +18 -0 ge/common/dump/dump_properties.h
  14. +6 -3 ge/common/ge/op_tiling_manager.cc
  15. +1 -0 ge/common/ge_common.mk
  16. +19 -1 ge/common/helper/model_cache_helper.cc
  17. +17 -2 ge/common/helper/model_helper.cc
  18. +15 -2 ge/common/profiling/profiling_manager.cc
  19. +1 -1 ge/common/profiling/profiling_manager.h
  20. +6 -0 ge/common/properties_manager.cc
  21. +5 -1 ge/common/properties_manager.h
  22. +68 -2 ge/common/util.cc
  23. +3 -1 ge/executor/CMakeLists.txt
  24. +48 -1 ge/executor/ge_executor.cc
  25. +3 -0 ge/executor/module.mk
  26. +5 -0 ge/ge_inference.mk
  27. +57 -32 ge/ge_local_engine/engine/host_cpu_engine.cc
  28. +9 -0 ge/ge_runner.mk
  29. +5 -0 ge/graph/build/memory/block_mem_assigner.cc
  30. +50 -3 ge/graph/build/memory/graph_mem_assigner.cc
  31. +2 -0 ge/graph/build/memory/graph_mem_assigner.h
  32. +56 -20 ge/graph/build/task_generator.cc
  33. +2 -1 ge/graph/build/task_generator.h
  34. +15 -2 ge/graph/load/new_model_manager/davinci_model.cc
  35. +1 -0 ge/graph/load/new_model_manager/davinci_model.h
  36. +82 -3 ge/graph/load/new_model_manager/model_manager.cc
  37. +6 -0 ge/graph/load/new_model_manager/model_manager.h
  38. +29 -12 ge/graph/load/new_model_manager/model_utils.cc
  39. +2 -94 ge/graph/load/new_model_manager/task_info/kernel_task_info.cc
  40. +0 -2 ge/graph/load/new_model_manager/task_info/kernel_task_info.h
  41. +4 -6 ge/graph/load/new_model_manager/zero_copy_task.cc
  42. +5 -1 ge/graph/load/new_model_manager/zero_copy_task.h
  43. +9 -1 ge/graph/manager/graph_manager.cc
  44. +3 -0 ge/graph/partition/engine_place.cc
  45. +4 -0 ge/graph/passes/for_pass.cc
  46. +1 -1 ge/graph/passes/multi_batch_clone_pass.cc
  47. +3 -0 ge/graph/passes/reshape_recovery_pass.cc
  48. +11 -7 ge/graph/preprocess/multi_batch_copy_graph.cc
  49. +238 -194 ge/host_kernels/strided_slice_kernel.cc
  50. +18 -19 ge/host_kernels/strided_slice_kernel.h
  51. +6 -0 ge/hybrid/executor/hybrid_model_executor.cc
  52. +1 -1 ge/hybrid/executor/hybrid_model_executor.h
  53. +2 -1 ge/hybrid/model/hybrid_model_builder.cc
  54. +2 -2 ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc
  55. +1 -0 ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc
  56. +13 -7 ge/init/gelib.cc
  57. +1 -1 ge/init/gelib.h
  58. +7 -7 ge/session/inner_session.cc
  59. +1 -0 ge/session/inner_session.h
  60. +38 -4 ge/single_op/single_op.cc
  61. +10 -2 ge/single_op/single_op.h
  62. +60 -10 ge/single_op/single_op_model.cc
  63. +3 -1 ge/single_op/single_op_model.h
  64. +23 -0 ge/single_op/task/aicpu_kernel_task_builder.cc
  65. +50 -20 ge/single_op/task/aicpu_task_builder.cc
  66. +3 -1 ge/single_op/task/aicpu_task_builder.h
  67. +408 -21 ge/single_op/task/op_task.cc
  68. +68 -6 ge/single_op/task/op_task.h
  69. +69 -0 inc/external/ge/ge_prof.h
  70. +8 -3 inc/framework/common/ge_inner_error_codes.h
  71. +18 -0 inc/framework/common/util.h
  72. +7 -7 metadef/graph/CMakeLists.txt
  73. +1 -1 metadef/graph/compute_graph.cc
  74. +6 -3 metadef/graph/format_refiner.cc
  75. +1 -0 metadef/graph/ge_attr_define.cc
  76. +34 -18 metadef/graph/node.cc
  77. +1 -1 metadef/graph/op_desc.cc
  78. +14 -8 metadef/graph/operator.cc
  79. +7 -1 metadef/graph/shape_refiner.cc
  80. +14 -3 metadef/graph/utils/ge_ir_utils.cc
  81. +1 -2 metadef/graph/utils/graph_utils.cc
  82. +0 -1 metadef/graph/utils/op_desc_utils.cc
  83. +1 -0 metadef/inc/graph/debug/ge_attr_define.h
  84. +375 -0 src/ge/client/ge_prof.cc
  85. +10 -8 third_party/fwkacllib/inc/ops/aipp.h
  86. +227 -210 third_party/fwkacllib/inc/ops/array_ops.h
  87. +44 -44 third_party/fwkacllib/inc/ops/audio_ops.h
  88. +41 -40 third_party/fwkacllib/inc/ops/batch_ops.h
  89. +10 -10 third_party/fwkacllib/inc/ops/bitwise_ops.h
  90. +14 -14 third_party/fwkacllib/inc/ops/boosted_trees_ops.h
  91. +154 -154 third_party/fwkacllib/inc/ops/candidate_sampling_ops.h
  92. +4 -4 third_party/fwkacllib/inc/ops/condtake_ops.h
  93. +91 -91 third_party/fwkacllib/inc/ops/control_flow_ops.h
  94. +31 -31 third_party/fwkacllib/inc/ops/ctc_ops.h
  95. +553 -551 third_party/fwkacllib/inc/ops/data_flow_ops.h
  96. +360 -350 third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
  97. +85 -103 third_party/fwkacllib/inc/ops/functional_ops.h
  98. +64 -63 third_party/fwkacllib/inc/ops/hcom_ops.h
  99. +13 -13 third_party/fwkacllib/inc/ops/hvd_ops.h
  100. +396 -377 third_party/fwkacllib/inc/ops/image_ops.h

RELEASE.md (+19, -0)

@@ -1,3 +1,22 @@
# Release 1.0.0

## Major Features and Improvements
* Automatically dump the input and output of the abnormal operator when the network execution is abnormal;
* Realize dynamic multi-batch based on GotoLabel;
* Optimize the performance of dynamic shape;
* The dynamic resolution feature supports a new scenario in which the network has multiple inputs and the shape of each input is different.

## Bugfixes
* Fixed the issue that the input and output data of the AICPU operator cannot be dumped in the single-operator execution scenario.
* Fixed the execution failure in the custom AICPU operator cascading scenario.
* Fixed the issue that in the dynamic batch+dynamic AIPP scenario, the getinputformat and getinputdims parameters are inconsistent.


## Thanks to our Contributors
Thanks go to these wonderful people: wuweikang, wangcong, weiyang, yanghaorang, xutianchun, shibeiji, zhouchao, tanghuikang, zhoulili, liujunzhu, zhengyuanhua, taoxiangdong.

Contributions of any kind are welcome!

# Release 0.7.0-beta

## Major Features and Improvements


ge/CMakeLists.txt (+3, -0)

@@ -63,6 +63,7 @@ include_directories(${CMAKE_BINARY_DIR}/proto/ge)
# need to remove dependencies on pb files later
file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"analyzer/analyzer.cc"
"client/ge_prof.cc"
"client/ge_api.cc"
"common/dump/dump_manager.cc"
"common/dump/dump_properties.cc"
@@ -230,6 +231,7 @@ target_link_libraries(ge_runner
${msprof}
${runtime}
${resouce}
${ascend_hal}
rt
dl)

@@ -340,6 +342,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"host_kernels/unpack_kernel.cc"
"host_kernels/unsqueeze_kernel.cc"
"hybrid/hybrid_davinci_model_stub.cc"
"hybrid/node_executor/aicpu/aicpu_ext_info.cc"
"init/gelib.cc"
"ir_build/atc_ir_common.cc"
"ir_build/ge_ir_build.cc"


ge/analyzer/analyzer.cc (+19, -8)

@@ -101,7 +101,7 @@ Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) {

ge::Status Analyzer::Initialize() {
ClearHistoryFile();
return CreateAnalyzerFile();
return SUCCESS;
}

void Analyzer::Finalize() {
@@ -136,7 +136,7 @@ void Analyzer::DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id) {
} else {
auto iter1 = (iter->second).find(graph_id);
if (iter1 == (iter->second).end()) {
GELOGW("can not find the graph json object by session_id[%lu] and graph_id[%lu].Do nothing", session_id,
GELOGW("Can not find the graph json object by session_id[%lu] and graph_id[%lu]. Do nothing.", session_id,
graph_id);
}
(iter->second).erase(iter1);
@@ -169,6 +169,10 @@ void Analyzer::ClearHistoryFile() {
}

ge::Status Analyzer::CreateAnalyzerFile() {
if (is_json_file_create_) {
GELOGD("analyzer file has been created!No necessary to create again!");
return SUCCESS;
}
GELOGD("start to create analyzer file!");
// Check whether the manifest exists, if not, create it.
string real_path = RealPath(kFilePath.c_str());
@@ -176,18 +180,19 @@ ge::Status Analyzer::CreateAnalyzerFile() {
GELOGE(FAILED, "File path is invalid.");
return FAILED;
}
string file = real_path + "/" + kAnalyzeFile;
GELOGD("Created analyzer file:[%s]", file.c_str());
int fd = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority);
std::lock_guard<std::mutex> lg(file_mutex_);
json_file_name_ = real_path + "/" + kAnalyzeFile;
GELOGD("Created analyzer file:[%s]", json_file_name_.c_str());
int fd = open(json_file_name_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority);
if (fd < 0) {
GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", file.c_str());
GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", json_file_name_.c_str());
return INTERNAL_ERROR;
}
if (close(fd) != 0) {
GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", file.c_str());
GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", json_file_name_.c_str());
return INTERNAL_ERROR;
}
json_file_name_ = file;
is_json_file_create_ = true;

GELOGD("success to create analyzer file[%s]!", json_file_name_.c_str());
return SUCCESS;
@@ -231,6 +236,12 @@ ge::Status Analyzer::DoAnalyze(DataInfo &data_info) {
GELOGE(status, "save op info failed!");
return FAILED;
}
// create json file
status = CreateAnalyzerFile();
if (status != SUCCESS) {
GELOGE(status, "create analyzer file failed!");
return status;
}
// save data to file
return SaveAnalyzerDataToFile();
}


ge/analyzer/analyzer.h (+2, -0)

@@ -24,6 +24,7 @@
#include <mutex>
#include <memory>
#include <fstream>
#include <atomic>

#include "external/ge/ge_api_types.h"
#include "graph/compute_graph.h"
@@ -181,6 +182,7 @@ class Analyzer {
std::mutex file_mutex_; // protect json_file_
std::ofstream json_file_;
std::string json_file_name_;
std::atomic_bool is_json_file_create_{false};
};
} // namespace ge
#endif // DOMI_ANALYZER_ANANLYZER_H_

ge/client/CMakeLists.txt (+2, -0)

@@ -29,6 +29,7 @@ file(GLOB PROTO_HEADER_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}

file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"ge_api.cc"
"ge_prof.cc"
)

ge_protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST})
@@ -66,5 +67,6 @@ target_link_libraries(ge_client
${slog}
${mmpa}
${runtime}
${msprof}
rt
dl)

ge/client/ge_api.cc (+1, -1)

@@ -39,7 +39,7 @@ using std::vector;

namespace {
const int32_t kMaxStrLen = 128;
}
} // namespace

static bool g_ge_initialized = false;
static std::mutex g_ge_release_mutex; // GEFinalize and ~Session use


ge/client/module.mk (+4, -0)

@@ -4,6 +4,7 @@ LOCAL_PATH := $(call my-dir)
COMMON_LOCAL_SRC_FILES := \
proto/ge_api.proto \
ge_api.cc \
ge_prof.cc \


COMMON_LOCAL_C_INCLUDES := \
@@ -69,6 +70,8 @@ LOCAL_SHARED_LIBRARIES := \
libregister \
libge_compiler \
libge_common \
libmsprof



LOCAL_LDFLAGS := -lrt -ldl
@@ -102,6 +105,7 @@ LOCAL_SHARED_LIBRARIES := \
libruntime \
libge_compiler \
libge_common \
libmsprof


LOCAL_LDFLAGS := -lrt -ldl


ge/common/CMakeLists.txt (+1, -0)

@@ -27,6 +27,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"context/ctx.cc"
"cust_aicpu_kernel_store.cc"
"debug/memory_dumper.cc"
"dump/dump_properties.cc"
"fmk_error_codes.cc"
"formats/format_transfers/datatype_transfer.cc"
"formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc"


ge/common/dump/dump_manager.cc (+3, -8)

@@ -49,7 +49,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf
dump_properties_.ClearDumpPropertyValue();
return SUCCESS;
}
dump_properties_.SetDumpStatus(dump_status);

dump_op_switch = dump_config.dump_op_switch;
dump_properties_.SetDumpOpSwitch(dump_op_switch);
if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) {
GELOGE(PARAM_INVALID, "Dump list is invalid,dump_op_switch is %s", dump_op_switch.c_str());
return PARAM_INVALID;
@@ -95,14 +98,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf
return SUCCESS;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpManager::IsDumpOpen() {
std::lock_guard<std::mutex> lock(mutex_);
if (!dump_properties_.GetDumpPath().empty()) {
return true;
}
return false;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const DumpProperties &DumpManager::GetDumpProperties() {
std::lock_guard<std::mutex> lock(mutex_);
return dump_properties_;


ge/common/dump/dump_manager.h (+0, -1)

@@ -28,7 +28,6 @@ class DumpManager {
static DumpManager &GetInstance();

Status SetDumpConf(const DumpConfig &dump_config);
bool IsDumpOpen();
const DumpProperties &GetDumpProperties();
void SetModelName(const std::string &model_name);
const std::string &GetModelName();


ge/common/dump/dump_op.cc (+1, -1)

@@ -16,7 +16,6 @@

#include "common/dump/dump_op.h"

#include "aicpu/common/aicpu_task_struct.h"
#include "common/dump/dump_manager.h"
#include "common/ge/datatype_util.h"
#include "framework/common/debug/ge_log.h"
@@ -28,6 +27,7 @@
#include "proto/ge_ir.pb.h"
#include "proto/op_mapping_info.pb.h"
#include "runtime/mem.h"
#include "aicpu/common/aicpu_task_struct.h"

namespace {
const uint32_t kAicpuLoadFlag = 1;


ge/common/dump/dump_properties.cc (+34, -3)

@@ -31,7 +31,7 @@

namespace {
const std::string kEnableFlag = "1";
const std::string kDumpStatusOpen = "on";
const uint32_t kAicoreOverflow = (0x1 << 0);
const uint32_t kAtomicOverflow = (0x1 << 1);
const uint32_t kAllOverflow = (kAicoreOverflow | kAtomicOverflow);
@@ -81,12 +81,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitByOpti
if (enable_dump_ == kEnableFlag) {
std::string dump_step;
if (GetContext().GetOption(OPTION_EXEC_DUMP_STEP, dump_step) == GRAPH_SUCCESS) {
GELOGD("Get dump step %s successfully", dump_step.c_str());
GELOGI("Get dump step %s successfully", dump_step.c_str());
SetDumpStep(dump_step);
}
string dump_mode;
if (GetContext().GetOption(OPTION_EXEC_DUMP_MODE, dump_mode) == GRAPH_SUCCESS) {
GELOGD("Get dump mode %s successfully", dump_mode.c_str());
GELOGI("Get dump mode %s successfully", dump_mode.c_str());
SetDumpMode(dump_mode);
}
AddPropertyValue(DUMP_ALL_MODEL, {});
@@ -192,6 +192,37 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperti
return dump_mode_;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpStatus(const std::string &status) {
dump_status_ = status;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperties::GetDumpStatus() const {
return dump_status_;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpOpSwitch(
const std::string &dump_op_switch) {
dump_op_switch_ = dump_op_switch;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperties::GetDumpOpSwitch() const {
return dump_op_switch_;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpProperties::IsSingleOpNeedDump() const {
if (dump_op_switch_ == kDumpStatusOpen) {
return true;
}
return false;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpProperties::IsDumpOpen() const {
if (enable_dump_ == kEnableFlag || dump_status_ == kDumpStatusOpen) {
return true;
}
return false;
}

void DumpProperties::CopyFrom(const DumpProperties &other) {
if (&other != this) {
enable_dump_ = other.enable_dump_;


ge/common/dump/dump_properties.h (+18, -0)

@@ -61,10 +61,26 @@ class DumpProperties {

const std::string &GetDumpMode() const;

void SetDumpStatus(const std::string &status);

const std::string &GetDumpStatus() const;

void SetDumpOpSwitch(const std::string &dump_op_switch);

const std::string &GetDumpOpSwitch() const;

bool IsOpDebugOpen() const { return is_op_debug_; }

bool IsDumpOpen() const;

bool IsSingleOpNeedDump() const;

uint32_t GetOpDebugMode() const { return op_debug_mode_; }

const std::string &GetEnableDump() const { return enable_dump_; }

const std::string &GetEnableDumpDebug() const { return enable_dump_debug_; }

private:
void CopyFrom(const DumpProperties &other);

@@ -76,6 +92,8 @@ class DumpProperties {
std::string dump_path_;
std::string dump_step_;
std::string dump_mode_;
std::string dump_status_;
std::string dump_op_switch_;
std::map<std::string, std::set<std::string>> model_dump_properties_map_;

bool is_op_debug_ = false;


ge/common/ge/op_tiling_manager.cc (+6, -3)

@@ -15,14 +15,15 @@
*/

#include "common/ge/op_tiling_manager.h"
#include "common/util/error_manager/error_manager.h"
#include "framework/common/debug/log.h"
#include <string>

namespace {
const char *const kEnvName = "ASCEND_OPP_PATH";
const std::string kDefaultPath = "/usr/local/Ascend/opp";
const std::string kDefaultBuiltInTilingPath = "/op_impl/built-in/liboptiling.so";
const std::string kDefaultCustomTilingPath = "/op_impl/custom/liboptiling.so";
const std::string kDefaultBuiltInTilingPath = "/op_impl/built-in/ai_core/tbe/op_tiling/liboptiling.so";
const std::string kDefaultCustomTilingPath = "/op_impl/custom/ai_core/tbe/op_tiling/liboptiling.so";
const uint8_t kPrefixIndex = 9;
} // namespace

@@ -44,7 +45,9 @@ std::string OpTilingManager::GetPath() {
if (opp_path_env != nullptr) {
char resolved_path[PATH_MAX];
if (realpath(opp_path_env, resolved_path) == NULL) {
GELOGE(PARAM_INVALID, "Failed load tiling lib as env 'ASCEND_OPP_PATH'(%s) is invalid path.", opp_path_env);
ErrorManager::GetInstance().ATCReportErrMessage("E19024", {"env", "value", "situation"},
{"ASCEND_OPP_PATH", opp_path_env, "loading the tiling lib"});
GELOGE(PARAM_INVALID, "Failed load tiling lib as env 'ASCEND_OPP_PATH'[%s] is invalid path.", opp_path_env);
return std::string();
}
opp_path = resolved_path;


ge/common/ge_common.mk (+1, -0)

@@ -12,6 +12,7 @@ GE_COMMON_LOCAL_SRC_FILES := \
math/fp16_math.cc \
debug/memory_dumper.cc \
formats/utils/formats_trans_utils.cc \
dump/dump_properties.cc \
formats/format_transfers/datatype_transfer.cc \
formats/format_transfers/format_transfer_transpose.cc \
formats/format_transfers/format_transfer_nchw_nc1hwc0.cc \


ge/common/helper/model_cache_helper.cc (+19, -1)

@@ -497,7 +497,25 @@ Status ModelCacheHelper::LoadJsonFromFile(const string &file_name, Json &json) c
GELOGW("Fail to open the file: %s.", path.c_str());
return INTERNAL_ERROR;
}
ifs >> json;
try {
ifs >> json;
} catch (nlohmann::detail::parse_error e) {
GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
return INTERNAL_ERROR;
} catch (nlohmann::detail::invalid_iterator e) {
GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
return INTERNAL_ERROR;
} catch (nlohmann::detail::type_error e) {
GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
return INTERNAL_ERROR;
} catch (nlohmann::detail::out_of_range e) {
GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
return INTERNAL_ERROR;
} catch (nlohmann::detail::other_error e) {
GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
return INTERNAL_ERROR;
}

if (!json.is_object()) {
GELOGW("Fail to load the json file: %s.", path.c_str());
return INTERNAL_ERROR;


ge/common/helper/model_helper.cc (+17, -2)

@@ -41,7 +41,22 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_fil
const uint8_t *data, size_t size) {
if (size < 1 || size > UINT32_MAX) {
GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size);
ErrorManager::GetInstance().ATCReportErrMessage("E19022");
if (size > UINT32_MAX) {
string item = "item";
if (type == MODEL_DEF) {
item = "model info";
} else if (type == WEIGHTS_DATA) {
item = "weight data";
} else if (type == TASK_INFO) {
item = "task info";
} else if (type == TBE_KERNELS) {
item = "tbe kernels";
} else if (type == CUST_AICPU_KERNELS) {
item = "aicpu kernels";
}
ErrorManager::GetInstance().ATCReportErrMessage("E19023", {"size", "item", "maxsize"},
{std::to_string(size), item, std::to_string(UINT32_MAX)});
}
return PARAM_INVALID;
}
if (data == nullptr) {
@@ -263,7 +278,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c
}

Status status = ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_);
if (ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_) != SUCCESS) {
if (status != SUCCESS) {
GELOGE(status, "Parse model content failed!");
return status;
}


ge/common/profiling/profiling_manager.cc (+15, -2)

@@ -51,10 +51,23 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager &ProfilingMana
return profiling_manager;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options) {
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options,
bool convert_2_phy_device_id) {
#ifdef DAVINCI_SUPPORT_PROFILING
vector<int32_t>().swap(device_id_);
device_id_.push_back(options.device_id);
// profiling need phy device id
if (!convert_2_phy_device_id) {
device_id_.push_back(options.device_id);
} else {
uint32_t phy_device_id = 0;
rtError_t rt_ret = rtGetDevicePhyIdByIndex(static_cast<uint32_t>(options.device_id), &phy_device_id);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "runtime get phy_device_id failed, current phy_device_id:%u", phy_device_id);
return FAILED;
}
device_id_.push_back(phy_device_id);
}

job_id_ = options.job_id;

Status ret;


ge/common/profiling/profiling_manager.h (+1, -1)

@@ -69,7 +69,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager {
ProfilingManager();
virtual ~ProfilingManager();
static ProfilingManager &Instance();
ge::Status Init(const Options &options);
ge::Status Init(const Options &options, bool convert_2_phy_device_id = false);
ge::Status InitFromOptions(const Options &options);
ge::Status InitFromAclCfg(const std::string &config);
ge::Status StartProfiling(int32_t iter, int32_t device_id);


ge/common/properties_manager.cc (+6, -0)

@@ -172,6 +172,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpProperties &PropertiesManag
return dump_properties_map_[session_id];
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::AddDumpProperties(
uint64_t session_id, const DumpProperties &dump_properties) {
std::lock_guard<std::mutex> lock(mutex_);
dump_properties_map_.emplace(session_id, dump_properties);
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::RemoveDumpProperties(uint64_t session_id) {
std::lock_guard<std::mutex> lock(mutex_);
auto iter = dump_properties_map_.find(session_id);


ge/common/properties_manager.h (+5, -1)

@@ -23,8 +23,8 @@
#include <string>
#include <vector>

#include "graph/op_desc.h"
#include "common/dump/dump_properties.h"
#include "graph/op_desc.h"

namespace ge {
// Configuration property management
@@ -83,6 +83,10 @@ class PropertiesManager {
void SetPropertyDelimiter(const std::string &de);

DumpProperties &GetDumpProperties(uint64_t session_id);

const map<uint64_t, DumpProperties> &GetDumpPropertiesMap() { return dump_properties_map_; }

void AddDumpProperties(uint64_t session_id, const DumpProperties &dump_properties);
void RemoveDumpProperties(uint64_t session_id);

private:


ge/common/util.cc (+68, -2)

@@ -19,16 +19,16 @@
#include <fcntl.h>
#include <sys/stat.h>

#include <unistd.h>
#include <regex.h>
#include <unistd.h>
#include <algorithm>
#include <climits>
#include <cstdlib>
#include <ctime>
#include <fstream>

#include "external/ge/ge_api_error_codes.h"
#include "common/util/error_manager/error_manager.h"
#include "external/ge/ge_api_error_codes.h"
#include "framework/common/debug/ge_log.h"
#include "framework/common/fmk_types.h"
#include "framework/common/ge_inner_error_codes.h"
@@ -58,6 +58,7 @@ const int kWarningThreshold = 536870912 * 2; // 536870912 represent 512M
const int kMaxFileSizeLimit = INT_MAX;
const int kMaxBuffSize = 256;
const char *const kPathValidReason = "The path can only contain 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese character";
constexpr uint32_t MAX_CONFIG_FILE_BYTE = 10 * 1024 * 1024;
} // namespace

namespace ge {
@@ -482,4 +483,69 @@ FMK_FUNC_HOST_VISIBILITY bool ValidateStr(const std::string &str, const std::str
regfree(&reg);
return true;
}

FMK_FUNC_HOST_VISIBILITY bool IsValidFile(const char *file_path) {
if (file_path == nullptr) {
GELOGE(PARAM_INVALID, "Config path is null.");
return false;
}
if (!CheckInputPathValid(file_path)) {
GELOGE(PARAM_INVALID, "Config path is invalid: %s", file_path);
return false;
}
// Normalize the path
std::string resolved_file_path = RealPath(file_path);
if (resolved_file_path.empty()) {
GELOGE(PARAM_INVALID, "Invalid input file path [%s], make sure that the file path is correct.", file_path);
return false;
}

mmStat_t stat = {0};
int32_t ret = mmStatGet(resolved_file_path.c_str(), &stat);
if (ret != EN_OK) {
GELOGE(PARAM_INVALID, "cannot get config file status, which path is %s, maybe not exist, return %d, errcode %d",
resolved_file_path.c_str(), ret, mmGetErrorCode());
return false;
}
if ((stat.st_mode & S_IFMT) != S_IFREG) {
GELOGE(PARAM_INVALID, "config file is not a common file, which path is %s, mode is %u", resolved_file_path.c_str(),
stat.st_mode);
return false;
}
if (stat.st_size > MAX_CONFIG_FILE_BYTE) {
GELOGE(PARAM_INVALID, "config file %s size[%ld] is larger than max config file Bytes[%u]",
resolved_file_path.c_str(), stat.st_size, MAX_CONFIG_FILE_BYTE);
return false;
}
return true;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status CheckPath(const char *path, size_t length) {
if (path == nullptr) {
GELOGE(PARAM_INVALID, "Config path is invalid.");
return PARAM_INVALID;
}

if (strlen(path) != length) {
GELOGE(PARAM_INVALID, "Path is invalid or length of config path is not equal to given length.");
return PARAM_INVALID;
}

if (length == 0 || length > MMPA_MAX_PATH) {
GELOGE(PARAM_INVALID, "Length of config path is invalid.");
return PARAM_INVALID;
}

INT32 is_dir = mmIsDir(path);
if (is_dir != EN_OK) {
GELOGE(PATH_INVALID, "Open directory %s failed, maybe it is not exit or not a dir", path);
return PATH_INVALID;
}

if (mmAccess2(path, M_R_OK) != EN_OK) {
GELOGE(PATH_INVALID, "Read path[%s] failed, errmsg[%s]", path, strerror(errno));
return PATH_INVALID;
}
return SUCCESS;
}
} // namespace ge

ge/executor/CMakeLists.txt (+3, -1)

@@ -22,7 +22,7 @@ file(GLOB PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"../../proto/insert_op.proto"
"../../proto/op_mapping_info.proto"
"../../proto/ge_ir.proto"
"../proto/dump_task.proto"
"../../proto/dump_task.proto"
)

file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
@@ -73,6 +73,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"../graph/manager/trans_var_data_utils.cc"
"../graph/manager/util/debug.cc"
"../hybrid/hybrid_davinci_model_stub.cc"
"../hybrid/node_executor/aicpu/aicpu_ext_info.cc"
"../model/ge_model.cc"
"../model/ge_root_model.cc"
"../omm/csa_interact.cc"
@@ -118,6 +119,7 @@ target_link_libraries(ge_executor
${slog}
${mmpa}
${msprof}
${error_manager}
rt
dl)


ge/executor/ge_executor.cc (+48, -1)

@@ -182,6 +182,37 @@ bool IsDynamicImageSizeMatchModel(uint64_t image_height, uint64_t image_width,
GELOGE(ge::FAILED, "Dynamic resolution (%lu,%lu) can not match the gear of model.", image_height, image_width);
return false;
}

bool IsDynmaicDimsSizeMatchModel(const vector<uint64_t> cur_dynamic_dims, const vector<vector<int64_t>> &batch_info) {
if (batch_info.empty()) {
GELOGE(ge::FAILED, "Dynamic batch info is empty.");
return false;
}

bool find_match = false;
for (auto resolution : batch_info) {
if (cur_dynamic_dims.size() != resolution.size()) {
GELOGE(ge::FAILED, "Cur dynamic dims param num is %zu, current resolution size is %zu.", cur_dynamic_dims.size(),
resolution.size());
return false;
}
bool flag = true;
for (std::size_t i = 0; i < resolution.size(); ++i) {
if (cur_dynamic_dims[i] != static_cast<uint64_t>(resolution[i])) {
flag = false;
break;
}
}
if (flag) {
find_match = true;
break;
}
}
if (!find_match) {
GELOGE(ge::FAILED, "choose dynamic dims can not match the gear of model.");
}
return find_match;
}
} // namespace

namespace ge {
@@ -347,9 +378,21 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u
vector<uint64_t> cur_dynamic_dims;
Status ret = GetCurDynamicDims(model_id, dynamic_dims, cur_dynamic_dims);
if (ret != SUCCESS) {
GELOGE(FAILED, "Set cur gear dynmaic dims failed");
GELOGE(FAILED, "Set cur gear dynamic dims failed");
return FAILED;
}
std::vector<std::vector<int64_t>> batch_info;
int32_t dynamic_type = static_cast<int32_t>(FIXED);
ret = GraphExecutor::GetDynamicBatchInfo(model_id, batch_info, dynamic_type);
if (ret != SUCCESS) {
GELOGE(ret, "Get dynamic input info failed.");
return ret;
}

if (!IsDynmaicDimsSizeMatchModel(cur_dynamic_dims, batch_info)) {
GELOGE(PARAM_INVALID, "The current dynamic input does not match the gear of the model.");
return PARAM_INVALID;
}

ret = GraphExecutor::SetDynamicSize(model_id, cur_dynamic_dims, static_cast<int32_t>(DYNAMIC_DIMS));
if (ret != SUCCESS) {
@@ -410,6 +453,10 @@ Status GeExecutor::GetCurDynamicDims(uint32_t model_id, const vector<uint64_t> &
for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
if (all_data_dims[i] < 0) {
cur_dynamic_dims.push_back(dynamic_dims[i]);
} else if (static_cast<uint64_t>(all_data_dims[i]) != dynamic_dims[i]) {
GELOGE(PARAM_INVALID, "Static dims should be same, index: %zu value: %d should be %d", i, dynamic_dims[i],
all_data_dims[i]);
return PARAM_INVALID;
}
}
return SUCCESS;


ge/executor/module.mk (+3, -0)

@@ -60,6 +60,7 @@ local_ge_executor_src_files := \
../single_op/task/aicpu_task_builder.cc \
../single_op/task/aicpu_kernel_task_builder.cc \
../hybrid/hybrid_davinci_model_stub.cc\
../hybrid/node_executor/aicpu/aicpu_ext_info.cc \

local_ge_executor_c_include := \
proto/insert_op.proto \
@@ -87,6 +88,7 @@ local_ge_executor_shared_library := \
libgraph \
libregister \
libmsprof \
liberror_manager \

local_ge_executor_ldflags := -lrt -ldl \

@@ -137,6 +139,7 @@ LOCAL_SHARED_LIBRARIES := \
libgraph \
libregister \
libmsprof \
liberror_manager \

LOCAL_LDFLAGS += $(local_ge_executor_ldflags)



ge/ge_inference.mk (+5, -0)

@@ -254,6 +254,7 @@ OME_HOST_SRC_FILES := \
single_op/stream_resource.cc \
single_op/single_op_manager.cc \
hybrid/hybrid_davinci_model_stub.cc \
hybrid/node_executor/aicpu/aicpu_ext_info.cc \
# graph/load/new_model_manager/task_info/hccl_task_info.cc

OME_DEVICE_SRC_FILES := $(OME_HOST_SRC_FILES)
@@ -286,6 +287,7 @@ COMMON_LOCAL_C_INCLUDES := \
$(TOPDIR)inc/runtime \
$(TOPDIR)libc_sec/include \
$(TOPDIR)ops/built-in/op_proto/inc \
$(TOPDIR)toolchain/ide/ide-daemon/external \
third_party/json/include \
third_party/protobuf/include \
third_party/opencv/include \
@@ -340,6 +342,7 @@ DEVICE_LOCAL_C_INCLUDES := \
$(TOPDIR)inc/runtime \
$(TOPDIR)ops/built-in/op_proto/inc \
$(TOPDIR)framework/domi \
$(TOPDIR)toolchain/ide/ide-daemon/external \
third_party/json/include \
third_party/protobuf/include \
third_party/opencv/include \
@@ -368,6 +371,7 @@ LOCAL_SRC_FILES += $(BUILER_SRC_FILES)
LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES)

LOCAL_STATIC_LIBRARIES := libge_memory \
libadump_server_stub \

LOCAL_SHARED_LIBRARIES := \
libc_sec \
@@ -432,6 +436,7 @@ LOCAL_C_INCLUDES := $(DEVICE_LOCAL_C_INCLUDES)
LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES)

LOCAL_STATIC_LIBRARIES := libge_memory \
libadump_server_stub \

LOCAL_SHARED_LIBRARIES := \
libc_sec \


ge/ge_local_engine/engine/host_cpu_engine.cc (+57, -32)

@@ -25,40 +25,65 @@
#include "common/ge/plugin_manager.h"
#include "graph/utils/type_utils.h"
#include "common/fp16_t.h"
#include "common/math/math_util.h"

namespace {
#define CREATE_OUTPUT_CASE(DTYPE, TYPE) \
case (DTYPE): { \
GeTensorPtr ge_tensor = nullptr; \
if (need_create_flag) { \
int64_t data_num = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \
std::unique_ptr<TYPE[]> buf(new (std::nothrow) TYPE[data_num]()); \
if (buf == nullptr) { \
GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \
static_cast<size_t>(sizeof(TYPE) * data_num)); \
return MEMALLOC_FAILED; \
} \
ge_tensor = MakeShared<GeTensor>(out_desc); \
GE_CHECK_NOTNULL(ge_tensor); \
GELOGI("node:%s allocate output %zu, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \
ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)); \
ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \
ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \
outputs.emplace_back(ge_tensor); \
} else { \
ge_tensor = outputs[i]; \
GE_CHECK_NOTNULL(ge_tensor); \
GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \
reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \
} \
auto tensor = TensorAdapter::AsTensor(*ge_tensor); \
auto tensor_name = op_desc->GetOutputNameByIndex(i); \
GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \
op_desc->GetName().c_str(), i); \
GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \
op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \
named_outputs.emplace(tensor_name, tensor); \
break; \
#define CREATE_OUTPUT_CASE(DTYPE, TYPE) \
case (DTYPE): { \
GeTensorPtr ge_tensor = nullptr; \
if (need_create_flag) { \
int64_t num_size = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \
if (out_desc.GetShape().IsUnknownShape()) { \
std::vector<std::pair<int64_t, int64_t>> range; \
if (out_desc.GetShapeRange(range) != GRAPH_SUCCESS) { \
GELOGE(INTERNAL_ERROR, "Get shape range failed, node:%s", op_desc->GetName().c_str()); \
return INTERNAL_ERROR; \
} \
int64_t max_range_size = 1; \
for (const auto &item : range) { \
FMK_INT64_MULCHECK(max_range_size, item.second); \
max_range_size *= item.second; \
} \
num_size = max_range_size; \
} \
if (num_size < 0) { \
GELOGE(INTERNAL_ERROR, "node:%s, get size for output %zu failed, num=%lld", op_desc->GetName().c_str(), i, \
num_size); \
return INTERNAL_ERROR; \
} \
auto data_num = static_cast<uint64_t>(num_size); \
GELOGI("node:%s allocate output %zu start, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \
std::unique_ptr<TYPE[]> buf(new (std::nothrow) TYPE[data_num]()); \
if (buf == nullptr) { \
GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \
static_cast<size_t>(sizeof(TYPE) * data_num)); \
return MEMALLOC_FAILED; \
} \
ge_tensor = MakeShared<GeTensor>(out_desc); \
GE_CHECK_NOTNULL(ge_tensor); \
GELOGI("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, \
data_num * sizeof(TYPE)); \
if (ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)) != GRAPH_SUCCESS) { \
GELOGE(MEMALLOC_FAILED, "Set data for output %zu of node %s failed.", i, op_desc->GetName().c_str()); \
return MEMALLOC_FAILED; \
} \
ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \
ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \
outputs.emplace_back(ge_tensor); \
} else { \
ge_tensor = outputs[i]; \
GE_CHECK_NOTNULL(ge_tensor); \
GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \
reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \
} \
auto tensor = TensorAdapter::AsTensor(*ge_tensor); \
auto tensor_name = op_desc->GetOutputNameByIndex(i); \
GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \
op_desc->GetName().c_str(), i); \
GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \
op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \
named_outputs.emplace(tensor_name, tensor); \
break; \
}
} // namespace



ge/ge_runner.mk (+9, -0)

@@ -296,6 +296,7 @@ LIBGE_LOCAL_SRC_FILES := \
LIBCLIENT_LOCAL_SRC_FILES := \
proto/ge_api.proto \
client/ge_api.cc \
client/ge_prof.cc \

RUNNER_LOCAL_C_INCLUDES := \
$(LOCAL_PATH) ./ \
@@ -312,6 +313,7 @@ RUNNER_LOCAL_C_INCLUDES := \
$(TOPDIR)libc_sec/include \
$(TOPDIR)ops/built-in/op_proto/inc \
$(TOPDIR)framework/domi/analyzer \
$(TOPDIR)toolchain/ide/ide-daemon/external \
proto/fwk_adapter.proto \
proto/ge_ir.proto \
proto/insert_op.proto \
@@ -353,6 +355,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES)
LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)

LOCAL_STATIC_LIBRARIES := libge_memory \
libadump_server \

LOCAL_SHARED_LIBRARIES := \
libc_sec \
@@ -371,6 +374,7 @@ LOCAL_LDFLAGS := -lrt -ldl
LOCAL_SHARED_LIBRARIES += \
libruntime \
libresource \
stub/libascend_hal \

include $(BUILD_HOST_SHARED_LIBRARY)

@@ -389,6 +393,7 @@ endif
LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES)

LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc
LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_prof.cc


LOCAL_SHARED_LIBRARIES :=
@@ -438,6 +443,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES)
LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)

LOCAL_STATIC_LIBRARIES := libge_memory \
libadump_server \

LOCAL_SHARED_LIBRARIES := \
libc_sec \
@@ -450,6 +456,7 @@ LOCAL_LDFLAGS := -lrt -ldl
LOCAL_SHARED_LIBRARIES += \
libruntime \
libresource \
stub/libascend_hal \

include $(BUILD_HOST_STATIC_LIBRARY)

@@ -469,6 +476,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES)
LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)

LOCAL_STATIC_LIBRARIES := libge_memory \
libadump_server \

LOCAL_SHARED_LIBRARIES := \
libc_sec \
@@ -481,5 +489,6 @@ LOCAL_LDFLAGS := -lrt -ldl
LOCAL_SHARED_LIBRARIES += \
libruntime \
libresource \
libascend_hal \

include $(BUILD_STATIC_LIBRARY)

ge/graph/build/memory/block_mem_assigner.cc (+5, -0)

@@ -1296,6 +1296,11 @@ void MergeBlocks(std::vector<MemoryBlock *> &dest, std::vector<MemoryBlock *> &s
return;
}
if (dest[i] != nullptr && src[i] != nullptr) {
if (!dest[i]->reuse_mem_ || !src[i]->reuse_mem_) {
GELOGD("Diff batch's workspace can't be reused, i: %zu, dest[i]: %s, stream: %ld, src[i]: %s, stream: %ld.", i,
dest[i]->String().c_str(), dest[i]->stream_id_, src[i]->String().c_str(), src[i]->stream_id_);
continue;
}
for (auto &symbol : src[i]->SymbolList()) {
dest[i]->AddSymbol(symbol);
}


ge/graph/build/memory/graph_mem_assigner.cc (+50, -3)

@@ -227,7 +227,10 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offse
if (mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) {
GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", mem_offset,
VarManager::Instance(session_id)->GetGraphMemoryMaxSize());
ErrorManager::GetInstance().ATCReportErrMessage("E19022");
ErrorManager::GetInstance().ATCReportErrMessage(
"E19022", {"size", "item", "maxsize"},
{std::to_string(mem_offset), "featuremap",
std::to_string(VarManager::Instance(session_id)->GetGraphMemoryMaxSize())});
return ge::FAILED;
}
return SUCCESS;
@@ -908,6 +911,8 @@ Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodeP
GELOGE(ret, "Assign atomic workspace memory failed, node is %s.", node_op_desc->GetName().c_str());
return ret;
}
} else {
GELOGW("Current atomic node %s does not have attr ATOMIC_WORKSPACE_INFO.", node->GetName().c_str());
}

return SUCCESS;
@@ -1452,14 +1457,56 @@ Status GraphMemoryAssigner::SetLoopGraphAtomicAttr(const ge::NodePtr &node, int6
return SUCCESS;
}

ge::Status GraphMemoryAssigner::IsIndependentAtomicClean(const ge::NodePtr &node,
bool &is_independent_atomic_clean_node) {
GE_CHECK_NOTNULL(node);
const auto &out_control_anchor = node->GetOutControlAnchor();
GE_CHECK_NOTNULL(out_control_anchor);
for (const auto &peer_in_control_anchor : out_control_anchor->GetPeerInControlAnchors()) {
if (peer_in_control_anchor != nullptr) {
auto peer_in_node = peer_in_control_anchor->GetOwnerNode();
auto peer_in_node_desc = peer_in_node->GetOpDesc();
if (peer_in_node_desc != nullptr) {
bool is_atomic_node = false;
// If GetBool fail, is_atomic_node is false.
(void)ge::AttrUtils::GetBool(peer_in_node_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic_node);
if (is_atomic_node) {
vector<int> is_connect_netoutput;
// If GetBool fail, attr is_connect_netoutput is an empty vector.
(void)ge::AttrUtils::GetListInt(peer_in_node_desc, ATTR_NAME_NODE_CONNECT_OUTPUT, is_connect_netoutput);
if (!is_connect_netoutput.empty()) {
GELOGD("Peer in node %s is independent atomic clean node", peer_in_node->GetName().c_str());
is_independent_atomic_clean_node = true;
break;
}
}
}
}
}

return SUCCESS;
}

ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &n, const vector<int64_t> &atomic_mem_start,
const vector<int64_t> &atomic_mem_size) {
for (ge::NodePtr &node : compute_graph_->GetAllNodes()) {
auto node_op_desc = node->GetOpDesc();
GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);

if (((n != nullptr) && (node->GetName() == n->GetName())) ||
((n == nullptr) && (node_op_desc->GetType() == ATOMICADDRCLEAN))) {
bool is_valid_atomic_clean_node = (n != nullptr) && (node->GetName() == n->GetName());

if (((n == nullptr) && (node_op_desc->GetType() == ATOMICADDRCLEAN))) {
bool is_independent_atomic_clean = false;
if (IsIndependentAtomicClean(node, is_independent_atomic_clean) != SUCCESS) {
GELOGE(FAILED, "Failed to determine the connection relationship of atomic addr clean node.");
return PARAM_INVALID;
}

is_valid_atomic_clean_node = is_valid_atomic_clean_node || (!is_independent_atomic_clean);
}

if (is_valid_atomic_clean_node) {
GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str());
vector<int64_t> workspace_vector = node_op_desc->GetWorkspace();
vector<int64_t> workspace_byte_vector = node_op_desc->GetWorkspaceBytes();
workspace_vector.insert(workspace_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end());


ge/graph/build/memory/graph_mem_assigner.h (+2, -0)

@@ -175,6 +175,8 @@ class GraphMemoryAssigner {
ge::Status SetAtomicCleanAttr(const ge::NodePtr &n, const std::vector<int64_t> &atomic_mem_start,
const std::vector<int64_t> &atomic_mem_size);

ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node);

void AlignMemOffset(const int64_t &mem_align_size);

ge::Status UpdateOpInputOffset(const NodePtr &node, vector<int64_t> &input_list) const;


ge/graph/build/task_generator.cc (+56, -20)

@@ -266,6 +266,14 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
if (is_unknown_shape) {
GE_CHK_STATUS_RET(SetUnknownShapeStream(run_context, stream), "Set unknown shape stream failed.");
}
std::function<void()> callback = [&]() {
if (is_unknown_shape) {
if (DestroyUnknownShapeStream(run_context, stream) != SUCCESS) {
GELOGE(FAILED, "Destory unknown shape stream failed.");
}
}
};
GE_MAKE_GUARD(release, callback);

for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
@@ -352,9 +360,6 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id,
task_list_size_after - task_list_size_before);
}
if (is_unknown_shape) {
GE_CHK_STATUS_RET(DestroyUnknownShapeStream(run_context, stream), "Destory unknown shape stream failed.");
}
GE_TIMESTAMP_CALLNUM_EVENT_END(GenerateTask, "GraphBuild::GenerateTask");
return SUCCESS;
}
@@ -532,6 +537,9 @@ Status TaskGenerator::MarkNodeAndSetIndex(ComputeGraphPtr &graph) {
(void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(node);
}

(void)op_desc->DelAttr(kIsFirstNode);
(void)op_desc->DelAttr(kIsLastNode);

all_stream_ops[op_desc->GetStreamId()].emplace_back(op_desc);
}

@@ -645,8 +653,6 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
vector<uint32_t> &all_reduce_nodes) const {
GELOGI("Start AutoFindBpOpIndex");
NodePtr bp_node = nullptr;
uint32_t last_bp = 0;
uint32_t iter_end = 0;
uint32_t current_idx = 0;
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
@@ -662,20 +668,40 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
all_reduce_nodes.emplace_back(current_idx);
GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
}
if (op_desc->GetType() == NETOUTPUT) {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) {
if (bp_node == nullptr) {
bp_node = node;
}
iter_end = current_idx;
GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end);
}
if (graph->GetNeedIteration()) {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT + '_' + NODE_NAME_STREAM_SWITCH + "_StreamActive") {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from Node_Output_IteratorCtrl_StreamSwitch_StreamActive",
op_desc->GetName().c_str(), current_idx);
}
if (op_desc->GetName() == NODE_NAME_FLOWCTRL_LOOP_ASSIGN) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from FlowCtrl_LoopCond_ASSIGN", op_desc->GetName().c_str(), current_idx);
}
} else {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from NETOUTPUT", op_desc->GetName().c_str(), current_idx);
}
}
}
profiling_point.end_index = iter_end;

if (bp_node == nullptr) {
GELOGW("not find bp_node.");
return SUCCESS;
}

profiling_point.bp_index = FindLastBpFromBpNode(graph, bp_node);
return SUCCESS;
}

uint32_t TaskGenerator::FindLastBpFromBpNode(const ComputeGraphPtr &graph, const NodePtr &bp_node) const {
uint32_t last_bp = 0;
OpDescPtr bp_op_desc = nullptr;
for (auto &in_anchor : bp_node->GetAllInDataAnchors()) {
auto out_anchor = in_anchor->GetPeerOutAnchor();
@@ -691,7 +717,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
}

GE_CHECK_NOTNULL(bp_op_desc);
current_idx = 0;
uint32_t current_idx = 0;
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
@@ -702,8 +728,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
break;
}
}
profiling_point.bp_index = last_bp;
return SUCCESS;
return last_bp;
}

Status TaskGenerator::FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str,
@@ -734,7 +759,6 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin
ProfilingPoint &profiling_point, vector<uint32_t> &all_reduce_nodes) const {
GELOGI("Start FindBpOfEnv");
uint32_t current_idx = 0;
uint32_t iter_end = 0;
uint32_t last_bp = 0;
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
@@ -745,10 +769,23 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin
continue;
}

if (op_desc->GetType() == NETOUTPUT) {
iter_end = current_idx;
GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end);
if (graph->GetNeedIteration()) {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT + '_' + NODE_NAME_STREAM_SWITCH + "_StreamActive") {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from Node_Output_IteratorCtrl_StreamSwitch_StreamActive",
op_desc->GetName().c_str(), current_idx);
}
if (op_desc->GetName() == NODE_NAME_FLOWCTRL_LOOP_ASSIGN) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from FlowCtrl_LoopCond_ASSIGN", op_desc->GetName().c_str(), current_idx);
}
} else {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from NETOUTPUT", op_desc->GetName().c_str(), current_idx);
}
}

if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE) {
all_reduce_nodes.emplace_back(current_idx);
GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
@@ -760,7 +797,6 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin
}

profiling_point.bp_index = last_bp;
profiling_point.end_index = iter_end;
return SUCCESS;
}

@@ -857,7 +893,7 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
ProfilingManager::Instance().ProfilingTrainingTraceOn();
if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index == 0)) {
(profiling_point.end_index.empty())) {
return SUCCESS;
}
if (profiling_point.fp_index == node_index) {
@@ -914,7 +950,7 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
ProfilingManager::Instance().ProfilingTrainingTraceOn();
if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index == 0)) {
(profiling_point.end_index.empty())) {
return SUCCESS;
}
if (profiling_point.bp_index == node_index) {
@@ -928,7 +964,7 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
bp_log_def->set_notify(false);
task_def_list.emplace_back(bp_task_def);
}
if (profiling_point.end_index == node_index) {
if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) {
GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
TaskDef end_task_def;
end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);


ge/graph/build/task_generator.h (+2, -1)

@@ -36,7 +36,7 @@ class OpsKernelManager;
struct ProfilingPoint {
uint32_t fp_index = 0;
uint32_t bp_index = 0;
uint32_t end_index = 0;
std::set<uint32_t> end_index;
};
// Describes infos needed by generate task for fusion node
struct FusionTaskInfo {
@@ -112,6 +112,7 @@ class TaskGenerator {
Status AutoFindFpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point) const;
Status AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
vector<uint32_t> &all_reduce_nodes) const;
uint32_t FindLastBpFromBpNode(const ComputeGraphPtr &graph, const NodePtr &bp_node) const;

Status FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str,
ProfilingPoint &profiling_point) const;


ge/graph/load/new_model_manager/davinci_model.cc (+15, -2)

@@ -125,6 +125,7 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptr<ModelListener
rt_model_stream_(nullptr),
is_inner_model_stream_(false),
is_async_mode_(false),
last_execute_mode_(false),
session_id_(0),
device_id_(0),
maxDumpOpNum_(0),
@@ -2879,6 +2880,12 @@ void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<v
}
}
}
auto it = zero_copy_op_id_batch_label_.find(op_desc->GetId());
if (it == zero_copy_op_id_batch_label_.end()) {
zero_copy_task.SetBatchLabel(kDefaultBatchLable);
} else {
zero_copy_task.SetBatchLabel(it->second);
}

std::lock_guard<std::mutex> lock(outside_addrs_mutex_);
if (zero_copy_task.IsTaskArgsSet()) {
@@ -3045,6 +3052,9 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> &
data.first, addr, size, buffer_addr);
// For input data, just copy for rts task.
for (ZeroCopyTask &task : zero_copy_tasks_) {
if (task.GetBatchLabel() != kDefaultBatchLable && task.GetBatchLabel() != batch_label) {
continue;
}
uintptr_t addr_val = reinterpret_cast<uintptr_t>(addr);
if (task.UpdateTaskParam(addr_val, buffer_addr, zero_copy_batch_label_addrs_, batch_label) != SUCCESS) {
return FAILED;
@@ -3365,6 +3375,7 @@ Status DavinciModel::InitModelStream(rtStream_t stream) {
if (is_async_mode_) {
rt_model_stream_ = stream;
is_inner_model_stream_ = false;
last_execute_mode_ = true;
return SUCCESS;
}

@@ -3376,12 +3387,14 @@ Status DavinciModel::InitModelStream(rtStream_t stream) {

rt_model_stream_ = stream;
is_inner_model_stream_ = false;
last_execute_mode_ = false;
return SUCCESS;
}

if (rt_model_stream_ == nullptr) {
if (last_execute_mode_ || (rt_model_stream_ == nullptr)) {
GE_CHK_RT_RET(rtStreamCreateWithFlags(&rt_model_stream_, priority_, RT_STREAM_FORBIDDEN_DEFAULT));
is_inner_model_stream_ = true;
last_execute_mode_ = false;
}

return SUCCESS;
@@ -3516,7 +3529,7 @@ uint8_t *DavinciModel::MallocWeightsMem(size_t weights_size) {
}

void DavinciModel::FreeFeatureMapMem() {
if (std::getenv(kEnvGeuseStaticMemory) != nullptr) {
if (std::getenv(kEnvGeuseStaticMemory) != nullptr && is_inner_mem_base_) {
string weight_memory_key = std::to_string(0) + "_f";
if (MemManager::Instance(RT_MEMORY_HBM)->GetMemoryAddr(weight_memory_key) != nullptr) {
GE_CHK_STATUS(MemManager::Instance(RT_MEMORY_HBM)->FreeMemory(weight_memory_key, GetDeviceId()),


ge/graph/load/new_model_manager/davinci_model.h (+1, -0)

@@ -884,6 +884,7 @@ class DavinciModel {
bool is_inner_model_stream_;

bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_.
bool last_execute_mode_;

bool is_stream_list_bind_{false};
bool is_pure_head_stream_{false};


ge/graph/load/new_model_manager/model_manager.cc (+82, -3)

@@ -43,6 +43,13 @@ const std::string kCmdTypeProfInit = "prof_init";
const std::string kCmdTypeProfFinalize = "prof_finalize";
const std::string kCmdTypeProfStart = "prof_start";
const std::string kCmdTypeProfStop = "prof_stop";
const char *const kLoadOpFromBuf = "loadOpFromBuf";
struct CustAicpuSoBuf {
uint64_t kernelSoBuf;
uint32_t kernelSoBufLen;
uint64_t kernelSoName;
uint32_t kernelSoNameLen;
} __attribute__((packed));
} // namespace

DumpProperties ModelManager::dump_properties_;
@@ -163,7 +170,13 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
GELOGI("The session: %lu not created.", session_id);
return;
} else {
GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId())));
rtContext_t ctx = nullptr;
bool has_ctx = (rtCtxGetCurrent(&ctx) == RT_ERROR_NONE);
if (!has_ctx) {
GELOGI("Set device %u.", GetContext().DeviceId());
GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId())));
}

Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0);
if (ret != SUCCESS) {
GELOGW("The session: %lu destroy failed.", session_id);
@@ -171,7 +184,11 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
(void)sess_ids_.erase(session_id);
GELOGI("The session: %lu destroyed.", session_id);
}
GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));

if (!has_ctx) {
GELOGI("Reset device %u.", GetContext().DeviceId());
GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));
}
}
}

@@ -219,6 +236,7 @@ ModelManager::~ModelManager() {
std::lock_guard<std::mutex> lock(map_mutex_);
model_map_.clear();
model_aicpu_kernel_.clear();
cust_aicpu_so_.clear();

GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0)));
}
@@ -919,7 +937,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
}
davinci_model->SetDeviceId(device_id);
davinci_model->SetOmName(model.om_name);
if (DumpManager::GetInstance().IsDumpOpen()) {
if (DumpManager::GetInstance().GetDumpProperties().IsDumpOpen()) {
davinci_model->SetDumpProperties(DumpManager::GetInstance().GetDumpProperties());
} else {
davinci_model->SetDumpProperties(dump_properties_);
@@ -1070,6 +1088,67 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) {
return SUCCESS;
}

Status ModelManager::LoadCustAicpuSo(const OpDescPtr op_desc, string so_name) {
std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
auto it = cust_aicpu_so_.find(so_name);
if (it == cust_aicpu_so_.end()) {
GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, so_name), "LaunchCustAicpuSo failed. op name %s, so_name %s",
op_desc->GetName().c_str(), so_name.c_str());
(void)cust_aicpu_so_.insert(so_name);
GELOGI("LaunchCustAicpuSo op name %s, so_name %s.", op_desc->GetName().c_str(), so_name.c_str());
}
return SUCCESS;
}

Status ModelManager::LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name) {
CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr());
if (aicpu_kernel == nullptr) {
GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str());
return INTERNAL_ERROR;
}
const void *aicpu_data = aicpu_kernel->GetBinData();
uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize();

void *d_aicpu_data = nullptr;
void *d_so_name = nullptr;
void *args = nullptr;
rtError_t status;
rtStream_t stream = nullptr;
GE_CHK_RT(rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM));
GE_CHK_RT(rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT(rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM));
GE_CHK_RT(rtMemcpy(d_so_name, so_name.size(), reinterpret_cast<const void *>(so_name.c_str()), so_name.size(),
RT_MEMCPY_HOST_TO_DEVICE));

CustAicpuSoBuf cust_aicpu_so_buf;
cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data));
cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length;
cust_aicpu_so_buf.kernelSoName = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name));
cust_aicpu_so_buf.kernelSoNameLen = so_name.size();

uint32_t args_size = sizeof(CustAicpuSoBuf);
GE_CHK_RT(rtMalloc(&args, args_size, RT_MEMORY_HBM));
GE_CHK_RT(rtMemcpy(args, args_size, static_cast<void *>(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT(rtStreamCreate(&stream, 0));
GE_CHK_RT(rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream));

status = rtStreamSynchronize(stream);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status);
GE_CHK_RT(rtStreamDestroy(stream));
GE_CHK_RT(rtFree(args));
GE_CHK_RT(rtFree(d_aicpu_data));
GE_CHK_RT(rtFree(d_so_name));
return RT_ERROR_TO_GE_STATUS(status);
}
GE_CHK_RT(rtStreamDestroy(stream));
GE_CHK_RT(rtFree(args));
GE_CHK_RT(rtFree(d_aicpu_data));
GE_CHK_RT(rtFree(d_so_name));
GELOGI("Cpu kernel launch loadOpFromBuf task success.");
return SUCCESS;
}

///
/// @ingroup ge
/// @brief get model memory size and weight
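
The LoadCustAicpuSo/LaunchCustAicpuSo pair added above uploads a custom AICPU .so at most once per so_name: the launch is guarded by cust_aicpu_mutex_ and recorded in the cust_aicpu_so_ set. The sketch below shows the same launch-once pattern in isolation; SoCache and LaunchSo are illustrative stand-ins for the real rtMalloc/rtMemcpy/rtCpuKernelLaunch sequence, not GE APIs.

    #include <iostream>
    #include <mutex>
    #include <set>
    #include <string>

    // Stand-in for the real device upload (rtMalloc + rtMemcpy + rtCpuKernelLaunch + stream sync).
    static bool LaunchSo(const std::string &so_name) {
      std::cout << "launching " << so_name << std::endl;
      return true;
    }

    class SoCache {
     public:
      // Upload each custom .so at most once per process; later calls are cheap no-ops.
      bool LoadOnce(const std::string &so_name) {
        std::lock_guard<std::mutex> lock(mutex_);
        if (loaded_.count(so_name) != 0) {
          return true;  // already on the device, skip a second launch
        }
        if (!LaunchSo(so_name)) {
          return false;  // failures are not cached, so a retry stays possible
        }
        loaded_.insert(so_name);
        return true;
      }

     private:
      std::mutex mutex_;
      std::set<std::string> loaded_;
    };

    int main() {
      SoCache cache;
      cache.LoadOnce("libcust_aicpu_kernels.so");  // first call launches
      cache.LoadOnce("libcust_aicpu_kernels.so");  // second call hits the cache
      return 0;
    }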


+ 6
- 0
ge/graph/load/new_model_manager/model_manager.h

@@ -268,6 +268,10 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {

ge::Status DestroyAicpuSessionForInfer(uint32_t model_id);

ge::Status LoadCustAicpuSo(const OpDescPtr op_desc, string so_name);

ge::Status LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name);

ge::Status GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info);

ge::Status GenSessionId(uint64_t &session_id);
@@ -333,6 +337,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
uint64_t session_id_bias_;
std::set<uint64_t> sess_ids_;
std::vector<rtExceptionInfo> exception_infos_;
std::mutex cust_aicpu_mutex_;
std::set<std::string> cust_aicpu_so_;

static DumpProperties dump_properties_;
};


+ 29
- 12
ge/graph/load/new_model_manager/model_utils.cc

@@ -29,6 +29,14 @@
#include "framework/common/debug/ge_log.h"
#include "graph/manager/graph_var_manager.h"

#define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \
do { \
if (SIZE <= static_cast<uint64_t>(OFFSET)) { \
GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", OP->GetName().c_str(), SIZE, OFFSET); \
return {}; \
} \
} while (0)

namespace ge {
///
/// @ingroup ge
@@ -38,7 +46,7 @@ namespace ge {
vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) {
vector<int64_t> v_input_size;
GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_size);
const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const string op_type = op_desc->GetType();

const vector<bool> v_is_input_const = op_desc->GetIsInputConst();
@@ -151,7 +159,7 @@ vector<int64_t> ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) {
}

// other ops get weight from connected constop
const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<bool> v_is_input_const = op_desc->GetIsInputConst();
for (size_t i = 0; i < inputs_size; ++i) {
if ((i < v_is_input_const.size()) && v_is_input_const[i]) {
@@ -191,7 +199,7 @@ vector<ConstGeTensorPtr> ModelUtils::GetWeights(ConstOpDescPtr op_desc) {
}

// other ops get weight from connected constop
const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<bool> v_is_input_const = op_desc->GetIsInputConst();
for (size_t i = 0; i < inputs_size; ++i) {
if ((i < v_is_input_const.size()) && v_is_input_const[i]) {
@@ -221,7 +229,7 @@ vector<::tagCcAICPUTensor> ModelUtils::GetInputDescs(ConstOpDescPtr op_desc) {
vector<::opTensor_t> v_input_descs;
GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_descs);

const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<bool> v_is_input_const = op_desc->GetIsInputConst();

for (size_t i = 0; i < inputs_size; ++i) {
@@ -306,7 +314,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_data_addr);
uint64_t session_id = model_param.session_id;

const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<int64_t> v_input_offset = op_desc->GetInputOffset();

const string op_type = op_desc->GetType();
@@ -334,6 +342,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
if (tensor_size) {
int64_t data_offset = 0;
GE_CHK_STATUS(TensorUtils::GetDataOffset(*tensor_desc, data_offset));
VALIDATE_MEM_RANGE(op_desc, model_param.weight_size, data_offset);
uint8_t *weight_addr = model_param.weight_base + data_offset;
v_input_data_addr.push_back(weight_addr);
GELOGI("[IMAS]GetInputDataAddrs graph_%u type[C] name[%s] input[%zu] memaddr[%p]", model_param.graph_id,
@@ -345,11 +354,12 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co

GE_IF_BOOL_EXEC(non_const_index >= v_input_offset.size(),
GELOGW("offsets=%zu, inputs=%zu, index=%zu.", v_input_offset.size(), inputs_size, non_const_index);
break;);
break);

int64_t input_offset = v_input_offset[non_const_index];
non_const_index++;
GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset),
VALIDATE_MEM_RANGE(op_desc, model_param.var_size, input_offset - model_param.logic_var_base);
uint8_t *variable_addr = model_param.var_base + input_offset - model_param.logic_var_base;
v_input_data_addr.push_back(variable_addr);
GELOGI("[IMAS]GetInputDataAddrs graph_%u type[V] name[%s] input[%lu] memaddr[%p]",
@@ -363,6 +373,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(input_offset));
v_input_data_addr.push_back(mem_addr);
} else {
VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, input_offset);
mem_addr = model_param.mem_base + input_offset;
v_input_data_addr.push_back(mem_addr);
}
@@ -398,6 +409,7 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C
}
for (size_t i = 0; i < outputs_size; ++i) {
GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(v_output_offset[i]),
VALIDATE_MEM_RANGE(op_desc, model_param.var_size, v_output_offset[i] - model_param.logic_var_base);
uint8_t *variable_addr = model_param.var_base + v_output_offset[i] - model_param.logic_var_base;
v_output_data_addr.push_back(variable_addr);
GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[V] name[%s] output[%zu] memaddr[%p]",
@@ -410,6 +422,7 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C
mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_output_offset[i]));
v_output_data_addr.push_back(mem_addr);
} else {
VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_output_offset[i]);
mem_addr = static_cast<uint8_t *>(model_param.mem_base + v_output_offset[i]);
v_output_data_addr.push_back(mem_addr);
}
@@ -440,15 +453,19 @@ vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param
for (size_t i = 0; i < v_workspace_bytes.size(); ++i) {
if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) {
v_workspace_data_addr.push_back(reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i])));
GELOGI("Fusion: op: %s, GetWorkspaceDataAddrs mem_addr[workspace index %zu]:%p", op_desc->GetName().c_str(), i,
reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i])));
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[L1] name[%s], mem_addr[workspace index %zu]:0x%lx",
model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i]);
} else if (v_workspace_bytes[i] == 0) {
v_workspace_data_addr.push_back(nullptr);
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] Null addr",
model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i]);
} else {
int64_t workspace_offset = v_workspace_offset[i];
int64_t workspace_bytes = v_workspace_bytes[i];
uint8_t *mem_addr = workspace_bytes == 0 ? nullptr : model_param.mem_base + workspace_offset;
VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_workspace_offset[i]);
uint8_t *mem_addr = model_param.mem_base + v_workspace_offset[i];
v_workspace_data_addr.push_back(mem_addr);
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] memaddr[%p]",
model_param.graph_id, op_desc->GetName().c_str(), i, workspace_offset, workspace_bytes, mem_addr);
model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i],
mem_addr);
}
}
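
The VALIDATE_MEM_RANGE macro introduced at the top of this file rejects any offset that is not strictly smaller than the size of the region it indexes, before base + offset is ever formed. A small standalone sketch of the same bounds check follows; SafeAddr is an illustrative helper under that assumption, not part of GE.

    #include <cstdint>
    #include <cstdio>

    // Return base + offset only when offset lies inside [0, size); otherwise log and return nullptr.
    // Mirrors the intent of VALIDATE_MEM_RANGE: never build an address past the allocated block.
    static uint8_t *SafeAddr(uint8_t *base, uint64_t size, int64_t offset) {
      if (offset < 0 || static_cast<uint64_t>(offset) >= size) {
        std::fprintf(stderr, "memory out of range [size=%llu, offset=%lld]\n",
                     static_cast<unsigned long long>(size), static_cast<long long>(offset));
        return nullptr;
      }
      return base + offset;
    }

    int main() {
      uint8_t block[64] = {};
      uint8_t *ok = SafeAddr(block, sizeof(block), 16);   // valid: inside the block
      uint8_t *bad = SafeAddr(block, sizeof(block), 64);  // rejected: one past the end
      return (ok != nullptr && bad == nullptr) ? 0 : 1;
    }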



+ 2
- 94
ge/graph/load/new_model_manager/task_info/kernel_task_info.cc

@@ -26,6 +26,7 @@
#include "framework/common/l2_cache_optimize.h"
#include "graph/debug/ge_attr_define.h"
#include "graph/load/new_model_manager/davinci_model.h"
#include "graph/load/new_model_manager/model_manager.h"
#include "graph/load/new_model_manager/model_utils.h"
#include "runtime/kernel.h"
#include "super_kernel/super_kernel.h"
@@ -41,13 +42,6 @@ const char *kIsLastNode = "is_last_node";
const char *kIsFirstNode = "is_first_node";
const int64_t kCloseSkt = 100;
const uint32_t kAddrLen = sizeof(void *);
const char *const kLoadOpFromBuf = "loadOpFromBuf";
struct CustAicpuSoBuf {
uint64_t kernelSoBuf;
uint32_t kernelSoBufLen;
uint64_t kernelSoName;
uint32_t kernelSoNameLen;
} __attribute__((packed));
} // namespace

namespace ge {
@@ -861,92 +855,6 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) {
return SUCCESS;
}

Status KernelTaskInfo::LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def) {
CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr());
if (aicpu_kernel == nullptr) {
GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str());
return INTERNAL_ERROR;
}
const void *aicpu_data = aicpu_kernel->GetBinData();
uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize();

void *d_aicpu_data = nullptr;
rtError_t status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

void *d_so_name = nullptr;
status = rtMalloc(&d_so_name, so_name_.size(), RT_MEMORY_HBM);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

status = rtMemcpy(d_so_name, so_name_.size(), reinterpret_cast<const void *>(so_name_.c_str()), so_name_.size(),
RT_MEMCPY_HOST_TO_DEVICE);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

CustAicpuSoBuf cust_aicpu_so_buf;
cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data));
cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length;
cust_aicpu_so_buf.kernelSoName = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name));
cust_aicpu_so_buf.kernelSoNameLen = so_name_.size();

void *args = nullptr;
uint32_t args_size = sizeof(CustAicpuSoBuf);
status = rtMalloc(&args, args_size, RT_MEMORY_HBM);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}
GELOGI("loadOpFromBuf kernelSoBuf %p, kernelSoBufLen %u, kernelSoName %p, kernelSoNameLen %u.", d_aicpu_data,
aicpu_data_length, d_so_name, so_name_.size());

status = rtMemcpy(args, args_size, static_cast<void *>(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

rtStream_t stream = nullptr;
status = rtStreamCreate(&stream, 0);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt create stream failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

status = rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt CpuKernelLaunch loadOpFromBuf failed, status: 0x%X", status);
return RT_ERROR_TO_GE_STATUS(status);
}
GELOGI("Cpu kernel launch loadOpFromBuf.");

status = rtStreamSynchronize(stream);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

GE_CHK_RT(rtFree(args));
GE_CHK_RT(rtFree(d_aicpu_data));
GE_CHK_RT(rtFree(d_so_name));

GELOGI("Cpu kernel launch loadOpFromBuf task success.");
return SUCCESS;
}

Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &kernel_def) {
GELOGI("Do InitAicpuTask");
so_name_ = kernel_def.so_name();
@@ -961,7 +869,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
}

if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) {
GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, kernel_def), "launch cust aicpu so failed");
GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name_), "launch cust aicpu so failed");
}

// copy args to new host memory


+ 0
- 2
ge/graph/load/new_model_manager/task_info/kernel_task_info.h

@@ -106,8 +106,6 @@ class KernelTaskInfo : public TaskInfo {

Status InitAicpuTaskExtInfo(const std::string &ext_info);

Status LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def);

Status StoreInputOutputTensor(const std::vector<void *> &input_data_addrs,
const std::vector<void *> &output_data_addrs,
const std::vector<::tagCcAICPUTensor> &input_descs,


+ 4
- 6
ge/graph/load/new_model_manager/zero_copy_task.cc

@@ -118,13 +118,11 @@ bool ZeroCopyTask::CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_ad
*/
Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map<string, set<uintptr_t>> &batch_addrs,
const string &batch_label) {
for (auto pair : task_addr_offset_) {
if (pair.first != addr) {
continue;
}

auto iter = task_addr_offset_.find(addr);
if (iter != task_addr_offset_.end()) {
auto &cur_pair = *iter;
uint8_t *args_info = args_info_.data();
for (auto offset : pair.second) {
for (auto offset : cur_pair.second) {
if (!CheckDynamicBatch(batch_addrs, batch_label, reinterpret_cast<uintptr_t>(args_addr_ + offset))) {
continue;
}
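
In UpdateTaskParam the linear scan over task_addr_offset_ is replaced by a single map::find on the address key, so only the offsets recorded for that address are revisited. A trimmed sketch of that lookup, with the types reduced to the essentials and illustrative values:

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
      // <address from Op, {offsets in args}>, the same shape as task_addr_offset_.
      std::map<uintptr_t, std::vector<size_t>> task_addr_offset = {{0x1000, {0, 8}}, {0x2000, {16}}};

      uintptr_t addr = 0x2000;
      auto iter = task_addr_offset.find(addr);  // O(log n) lookup instead of scanning every entry
      if (iter != task_addr_offset.end()) {
        for (size_t offset : iter->second) {
          std::printf("patch args buffer at offset %zu\n", offset);  // stand-in for the real update
        }
      }
      return 0;
    }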


+ 5
- 1
ge/graph/load/new_model_manager/zero_copy_task.h

@@ -83,6 +83,10 @@ class ZeroCopyTask {
*/
ge::Status DistributeParam(bool async_mode, rtStream_t stream);

void SetBatchLabel(const string &batch_label) { batch_label_ = batch_label; }

const string &GetBatchLabel() const { return batch_label_; }

protected:
bool CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_addrs, const string &batch_label, uintptr_t addr);

@@ -93,7 +97,7 @@ class ZeroCopyTask {
const size_t args_size_;
vector<uint8_t> args_info_;
bool is_updated_;
string batch_label_;
// <address from Op, {offset in args}>
map<uintptr_t, vector<size_t>> task_addr_offset_;
};


+ 9
- 1
ge/graph/manager/graph_manager.cc

@@ -267,6 +267,14 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph,
auto compute_graph = GraphUtils::GetComputeGraph(graph);
if (compute_graph != nullptr) {
compute_graph->SetGraphID(graph_id);
bool graph_has_been_added = false;
if (AttrUtils::GetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, graph_has_been_added) &&
graph_has_been_added) {
GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] same graph object can not be added again, graph_id = %u.",
graph_id);
return GE_GRAPH_GRAPH_ALREADY_EXIST;
}
(void)AttrUtils::SetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, true);
} else {
GELOGE(FAILED, "compute graph is null");
return FAILED;
@@ -1953,9 +1961,9 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
names_to_passes.emplace_back("MergePass", &merge_pass);
names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass);
names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass);
names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass);
names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass);
names_to_passes.emplace_back("TransOpNearbyAllreduceFusionPass", &trans_op_nearby_allreduce_fusion_pass);
names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass);
names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass);
names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass);
names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass);


+ 3
- 0
ge/graph/partition/engine_place.cc

@@ -23,6 +23,7 @@
#include <mutex>

#include "common/op/ge_op_utils.h"
#include "common/util/error_manager/error_manager.h"
#include "graph/utils/graph_utils.h"
#include "graph/utils/op_desc_utils.h"
#include "init/gelib.h"
@@ -82,6 +83,8 @@ Status EnginePlacer::Run() {
// If can't get op's engine name, keep check support finish and return failed
if (engine_name.empty()) {
is_check_support_success = false;
ErrorManager::GetInstance().ATCReportErrMessage("E13003", {"opname", "optype"},
{op_desc->GetName(), op_desc->GetType()});
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Can not find engine of op type %s",
node_ptr->GetOpDesc()->GetType().c_str());
continue;


+ 4
- 0
ge/graph/passes/for_pass.cc

@@ -190,6 +190,10 @@ Status ForPass::FindInputsAndOutputs(const NodePtr &node, std::vector<OutDataAnc
GELOGE(FAILED, "FindInputWithIndex %s:%u failed: in_data_anchor is NULL.", node->GetName().c_str(), index);
return FAILED;
}
GE_IF_BOOL_EXEC(
in_data_anchor->GetPeerOutAnchor() == nullptr,
GELOGW("Get null input by index %d from node %s ", in_data_anchor->GetIdx(), node->GetName().c_str());
continue);
data_inputs.emplace_back(in_data_anchor->GetPeerOutAnchor());
}



+ 1
- 1
ge/graph/passes/multi_batch_clone_pass.cc

@@ -239,7 +239,7 @@ Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, N

GeTensorDesc const_tensor(GeShape({count}), FORMAT_ND, DT_INT32);
GeTensor tensor(const_tensor);
tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t));
(void)tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t));
if (!AttrUtils::SetTensor(const_desc, ATTR_NAME_WEIGHTS, tensor)) {
GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", const_desc->GetName().c_str());
return FAILED;


+ 3
- 0
ge/graph/passes/reshape_recovery_pass.cc

@@ -50,9 +50,12 @@ Status InsertReshapeIfNeed(const NodePtr &node) {
GE_CHECK_NOTNULL(src_tensor);
for (auto dst_anchor : src_anchor->GetPeerInDataAnchors()) {
auto dst_node = dst_anchor->GetOwnerNode();
GELOGD("Try insert reshape between %s[%d] and %s[%d] to keep the shape continues", node->GetName().c_str(),
src_anchor->GetIdx(), dst_node->GetName().c_str(), dst_anchor->GetIdx());
GE_CHECK_NOTNULL(dst_node);
GE_CHECK_NOTNULL(dst_node->GetOpDesc());
auto dst_tensor = dst_node->GetOpDesc()->GetInputDescPtr(dst_anchor->GetIdx());
GE_CHECK_NOTNULL(dst_tensor);
bool is_need_insert_reshape = src_tensor->GetShape().GetDims() != UNKNOWN_RANK &&
dst_tensor->GetShape().GetDims() != UNKNOWN_RANK &&
src_tensor->GetShape().GetDims() != dst_tensor->GetShape().GetDims();


+ 11
- 7
ge/graph/preprocess/multi_batch_copy_graph.cc

@@ -113,10 +113,9 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) {
desc->CopyAttrsFrom(*src_op_desc);
for (uint32_t i = 0; i < node->GetAllInDataAnchorsSize(); ++i) {
auto input_desc = desc->MutableInputDesc(i);
GE_IF_BOOL_EXEC(input_desc == nullptr,
GELOGE(INTERNAL_ERROR, "Failed to get input desc by index %u from node %s when copy from %s", i,
desc->GetName().c_str(), node->GetName().c_str());
return nullptr);
GE_IF_BOOL_EXEC(input_desc == nullptr, GELOGW("Get null input desc by index %u from node %s when copy from %s", i,
desc->GetName().c_str(), node->GetName().c_str());
continue);

input_desc->CopyAttrsFrom(src_op_desc->GetInputDesc(i));
}
@@ -991,12 +990,17 @@ Status MultiBatchGraphCopyer::InsertIdentityAfterSwitchN() {
size_t i = 0;
for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
for (auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
auto identity_desc = MakeShared<OpDesc>(node->GetName() + "_identity_" + std::to_string(i), IDENTITY);
GE_CHECK_NOTNULL(identity_desc);

auto out_node = in_data_anchor->GetOwnerNode();
auto op_desc = out_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
if ((out_node->GetType() == MERGE) && (op_desc->HasAttr(ATTR_INSERT_BY_MBATCH))) {
GELOGD("No need to insert identity between %s and %s.", node->GetName().c_str(), out_node->GetName().c_str());
continue;
}

auto identity_desc = MakeShared<OpDesc>(node->GetName() + "_identity_" + std::to_string(i), IDENTITY);
GE_CHECK_NOTNULL(identity_desc);

string batch_label;
if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {
if (!AttrUtils::SetStr(identity_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {


+ 238
- 194
ge/host_kernels/strided_slice_kernel.cc

@@ -16,131 +16,262 @@

#include "host_kernels/strided_slice_kernel.h"

#include <memory>

#include "common/fp16_t.h"
#include "common/ge_inner_error_codes.h"
#include "common/math/math_util.h"
#include "common/op/ge_op_utils.h"
#include "external/graph/types.h"
#include "framework/common/debug/ge_log.h"
#include "host_kernels/kernel_utils.h"
#include "graph/utils/type_utils.h"
#include "host_kernels/kernel_utils.h"
#include "inc/kernel_factory.h"
#include <memory>

namespace ge {
namespace {
const int32_t kNumOne = 1;
const size_t kStridedSliceInputSize = 4;
const size_t kStridedSliceInputIndex0 = 0;
const size_t kStridedSliceInputIndex1 = 1;
const size_t kStridedSliceInputIndex2 = 2;
const size_t kStridedSliceInputIndex3 = 3;
const int32_t kDefaultSrideSize = 1;
} // namespace
Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr, const std::vector<ConstGeTensorPtr> &input,
Attr &args) {
int64_t begin_mask = 0;
int64_t end_mask = 0;
int64_t ellipsis_mask = 0;
int64_t new_axis_mask = 0;
int64_t shrink_axis_mask = 0;
const size_t kStridedSliceInputIndex = 0;
const size_t kStridedSliceBeginIndex = 1;
const size_t kStridedSliceEndIndex = 2;
const size_t kStridedSliceStrideIndex = 3;
const int32_t kDefaultStrideSize = 1;
const std::set<DataType> kIndexNumberType = {DT_INT32, DT_INT64};

if (attr == nullptr) {
GELOGW("input opdescptr is nullptr.");
return PARAM_INVALID;
bool IsEllipsisMaskValid(const GeTensorDescPtr &input_desc, const int ellipsis_mask) {
if (ellipsis_mask != 0) {
auto ellipsis_num = 0;
auto input_shape = input_desc->GetShape();
bool ellipsis_mask_flag = false;
for (size_t i = 0; i < input_shape.GetDimNum(); i++) {
uint32_t i_temp = static_cast<uint32_t>(i);
ellipsis_mask_flag = (static_cast<uint32_t>(ellipsis_mask) & (1 << i_temp));
if (ellipsis_mask_flag) {
++ellipsis_num;
}
if (ellipsis_num > 1) {
GELOGW("Only one non-zero bit is allowed in ellipsis_mask.");
return false;
}
}
}
if (input.size() != kStridedSliceInputSize) {
GELOGW("The number of input for strided slice must be %zu.", kStridedSliceInputSize);
return PARAM_INVALID;
return true;
}
} // namespace
Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector<ge::ConstGeTensorPtr> &input,
vector<ge::GeTensorPtr> &v_output) {
GELOGD("StridedSliceKernel in.");
// 1.Check input and attrs
if (CheckAndGetAttr(attr) != SUCCESS) {
GELOGW("Check and get attrs failed.Ignore kernel.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_BEGIN_MASK, begin_mask)) {
GELOGW("get begin_mask attr failed.");
return PARAM_INVALID;
if (CheckInputParam(input) != SUCCESS) {
GELOGW("Check input params failed.Ignore kernel.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_END_MASK, end_mask)) {
GELOGW("get end_mask attr failed.");
return PARAM_INVALID;
// 2.Init param with mask attrs.
std::vector<int64_t> input_dims;
std::vector<int64_t> begin_vec;
std::vector<int64_t> output_dims;
std::vector<int64_t> stride_vec;
if (InitParamWithAttrs(input, input_dims, begin_vec, output_dims, stride_vec) != SUCCESS) {
GELOGW("Init param with mask attrs failed.Ignore kernel.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_ELLIPSIS_MASK, ellipsis_mask)) {
GELOGW("get ellipsis_mask attr failed.");
return PARAM_INVALID;

// 3.Set sliced data to output_ptr
ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex];
auto data_type = weight0->GetTensorDesc().GetDataType();
size_t data_size = weight0->GetData().size() / GetSizeByDataType(data_type);
void *data = reinterpret_cast<void *>(const_cast<uint8_t *>(weight0->GetData().data()));
GE_CHECK_NOTNULL(data);
// Index 0 can always gets a GeTensorDesc object from any OpDescPtr.
auto output_tensor_desc = attr->GetOutputDesc(0);
GeTensorPtr output_ptr = MakeShared<GeTensor>(output_tensor_desc);
if (output_ptr == nullptr) {
GELOGE(MEMALLOC_FAILED, "MakeShared GeTensor failed, node name %s.", attr->GetName().c_str());
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_NEW_AXIS_MASK, new_axis_mask)) {
GELOGW("get new_axis_mask attr failed.");
return PARAM_INVALID;
auto ret = OpUtils::SetOutputSliceData(data, static_cast<int64_t>(data_size), data_type, input_dims, begin_vec,
output_dims, output_ptr.get(), stride_vec);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "SetOutputSliceData failed.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK, shrink_axis_mask)) {
GELOGW("get shrink_axis_mask attr failed.");

// 4.Set output data_type and shape
GeTensorDesc &t_d = output_ptr->MutableTensorDesc();
t_d.SetDataType(static_cast<DataType>(data_type));

auto final_dim_size = static_cast<uint32_t>(output_dims.size());
vector<int64_t> v_dims;
GetOutputDims(final_dim_size, output_dims, v_dims);
t_d.SetShape(GeShape(v_dims));
v_output.push_back(output_ptr);
GELOGI("StridedSliceKernel success.");
return SUCCESS;
}
Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr) {
if (attr == nullptr) {
GELOGE(PARAM_INVALID, "input opdescptr is nullptr.");
return PARAM_INVALID;
}
if ((ellipsis_mask != 0) || (new_axis_mask != 0)) {
GELOGW("ellipsis_mask or new_axis_mask must be 0 with optimizer.");
return NOT_CHANGED;
// Get all op attr value of strided_slice
for (auto &attr_2_value : attr_value_map_) {
if (!AttrUtils::GetInt(attr, attr_2_value.first, attr_2_value.second)) {
GELOGE(PARAM_INVALID, "Get %s attr failed.", attr_2_value.first.c_str());
return PARAM_INVALID;
}
}
const auto &input_desc = attr->MutableInputDesc(kStridedSliceInputIndex0);
// Check ellipsis_mask is valid
const auto &input_desc = attr->MutableInputDesc(kStridedSliceInputIndex);
GE_CHECK_NOTNULL(input_desc);
DataType data_type = input_desc->GetDataType();
if ((data_type != DT_FLOAT) && (data_type != DT_INT32)) {
GELOGW(
"Data type of StridedSlice OP must be float or int32."
"Constant folding will not be carried out in this condition"
"which might affect the time performance but not the accuracy");
}
args.begin_mask = begin_mask;
args.end_mask = end_mask;
args.ellipsis_mask = ellipsis_mask;
args.new_axis_mask = new_axis_mask;
args.data_type = static_cast<int64_t>(data_type);
args.shrink_axis_mask = shrink_axis_mask;

ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex0];
ConstGeTensorPtr weight1 = input[kStridedSliceInputIndex1];
ConstGeTensorPtr weight2 = input[kStridedSliceInputIndex2];
ConstGeTensorPtr weight3 = input[kStridedSliceInputIndex3];
if (CheckWeight(weight0, weight1, weight2, weight3) != SUCCESS) {
GELOGW("Check And Get Attr failed.");
auto ellipsis_mask = attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK);
if (!IsEllipsisMaskValid(input_desc, ellipsis_mask)) {
return PARAM_INVALID;
}

return SUCCESS;
}
Status StridedSliceKernel::CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1,
const ConstGeTensorPtr &weight2, const ConstGeTensorPtr &weight3) const {
if ((weight0 == nullptr) || (weight1 == nullptr) || (weight2 == nullptr) || (weight3 == nullptr)) {
GELOGW("weight is nullptr.");
Status StridedSliceKernel::CheckInputParam(const std::vector<ConstGeTensorPtr> &input) const {
if (input.size() != kStridedSliceInputSize) {
GELOGE(PARAM_INVALID, "The number of input for strided slice must be %zu.", kStridedSliceInputSize);
return PARAM_INVALID;
}
if (!(weight1->GetTensorDesc().GetDataType() == DT_INT32 && weight2->GetTensorDesc().GetDataType() == DT_INT32 &&
weight3->GetTensorDesc().GetDataType() == DT_INT32)) {
GELOGE(INTERNAL_ERROR, "Data type of StridedSlice OP(begin,end,strides) must be int32.");
return INTERNAL_ERROR;

ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex];
ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex];
ConstGeTensorPtr end_tensor = input[kStridedSliceEndIndex];
ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex];
GE_CHECK_NOTNULL(weight0);
GE_CHECK_NOTNULL(begin_tensor);
GE_CHECK_NOTNULL(end_tensor);
GE_CHECK_NOTNULL(stride_tensor);

// check if begin,end,strides data type is supported
auto begin_tensor_desc = begin_tensor->GetTensorDesc();
auto end_tensor_desc = end_tensor->GetTensorDesc();
auto stride_tensor_desc = stride_tensor->GetTensorDesc();
if (begin_tensor_desc.GetDataType() != end_tensor_desc.GetDataType() ||
end_tensor_desc.GetDataType() != stride_tensor_desc.GetDataType()) {
GELOGW("Data type of StridedSlice OP(begin,end,strides) must be same.");
return PARAM_INVALID;
}
if (kIndexNumberType.find(begin_tensor_desc.GetDataType()) == kIndexNumberType.end()) {
GELOGW("Data type of StridedSlice OP(begin,end,strides) must be int32 or int64.");
return PARAM_INVALID;
}

// check data
size_t weight0_size = weight0->GetData().size() / sizeof(int32_t);
size_t weight1_size = weight1->GetData().size() / sizeof(int32_t);
size_t weight2_size = weight2->GetData().size() / sizeof(int32_t);
size_t weight3_size = weight3->GetData().size() / sizeof(int32_t);
if ((weight0_size == 0) || (weight1_size == 0) || (weight2_size == 0) || (weight3_size == 0)) {
auto x_data_type = weight0->GetTensorDesc().GetDataType();
auto x_data_size = GetSizeByDataType(x_data_type);
if (x_data_size < 0) {
GELOGW("Data type of x input %s is not supported.", TypeUtils::DataTypeToSerialString(x_data_type).c_str());
return PARAM_INVALID;
}
size_t weight0_size = weight0->GetData().size() / x_data_size;
size_t begin_data_size = begin_tensor->GetData().size() / sizeof(int32_t);
size_t end_data_size = end_tensor->GetData().size() / sizeof(int32_t);
size_t stride_data_size = stride_tensor->GetData().size() / sizeof(int32_t);
if ((weight0_size == 0) || (begin_data_size == 0) || (end_data_size == 0) || (stride_data_size == 0)) {
GELOGW("Data size of inputs is 0.");
return PARAM_INVALID;
}

// check dim size
size_t weight0_dim_size = weight0->GetTensorDesc().GetShape().GetDimNum();
if (!((weight0_dim_size >= weight1_size) && (weight1_size == weight2_size) && (weight1_size == weight3_size))) {
if (!((begin_data_size == end_data_size) && (end_data_size == stride_data_size))) {
GELOGW("The sizes of begin, end and stride is not supported.");
return NOT_CHANGED;
return PARAM_INVALID;
}

return SUCCESS;
}

Status StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag,
int32_t &begin_i, int32_t &end_i, int32_t &dim_i) const {
Status StridedSliceKernel::InitParamWithAttrs(const std::vector<ConstGeTensorPtr> &input,
std::vector<int64_t> &input_dims, std::vector<int64_t> &begin_vec,
std::vector<int64_t> &output_dims, std::vector<int64_t> &stride_vec) {
ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex];
ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex];
ConstGeTensorPtr end_tensor = input[kStridedSliceEndIndex];
ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex];

const GeShape x_shape = weight0->GetTensorDesc().GetShape();
auto x_dims = x_shape.GetDims();
auto x_dims_num = x_shape.GetDimNum();
// handle new_axis_mask
ExpandDimsWithNewAxis(begin_tensor, x_dims_num, x_dims);

const int32_t *begin = reinterpret_cast<const int32_t *>(begin_tensor->GetData().data());
const int32_t *end = reinterpret_cast<const int32_t *>(end_tensor->GetData().data());
const int32_t *stride = reinterpret_cast<const int32_t *>(stride_tensor->GetData().data());
auto begin_dim_num = begin_tensor->GetData().size() / sizeof(int32_t);
auto min_dim = x_dims_num > begin_dim_num ? begin_dim_num : x_dims_num;
for (size_t i = 0; i < x_dims.size(); ++i) {
auto i_temp = static_cast<uint64_t>(i);
bool new_axis_mask_flag =
(static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp));
if (new_axis_mask_flag) {
output_dims.push_back(1);
input_dims.push_back(1);
begin_vec.push_back(0);
stride_vec.push_back(1);
continue;
}

int64_t begin_i = 0;
int64_t end_i = 0;
int64_t stride_i = 1;
if (i < min_dim) {
begin_i = begin[i];
end_i = end[i];
stride_i = stride[i];
} else {
begin_i = 0;
end_i = x_dims.at(i);
stride_i = 1;
}
GELOGD("Before mask calculate. Begin is : %d\t,end is : %d\t stride is : %d\t x_dim_i is : %d.", begin_i, end_i,
stride_i, x_dims.at(i));
auto ret = MaskCal(i, begin_i, end_i, x_dims.at(i));
if (ret != SUCCESS) {
GELOGW("MaskCal failed, because of data overflow.");
return NOT_CHANGED;
}
int64_t dim_final;
GELOGD("Before stride calculate. Begin is : %d\t,end is : %d\t stride is : %d\t x_dim_i is : %d.", begin_i, end_i,
stride_i, x_dims.at(i));
(void)StrideCal(x_dims.at(i), begin_i, end_i, stride_i, dim_final);
output_dims.push_back(dim_final);
input_dims.push_back(x_dims.at(i));
begin_vec.push_back(begin_i);
stride_vec.push_back(stride_i);
}
return SUCCESS;
}
void StridedSliceKernel::ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num,
vector<int64_t> &x_dims) {
auto begin_data_type_size = GetSizeByDataType(begin_tensor->GetTensorDesc().GetDataType());
size_t begin_vec_size = begin_tensor->GetData().size() / begin_data_type_size;
auto final_dim_num = x_dims_num < begin_vec_size ? begin_vec_size : x_dims_num;
for (size_t i = 0; i < final_dim_num; i++) {
auto i_temp = static_cast<uint64_t>(i);
bool new_axis_mask_flag =
(static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp));
if (new_axis_mask_flag) {
x_dims.insert(x_dims.begin() + i, 1);
}
}
}
Status StridedSliceKernel::MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const {
uint64_t i_temp = static_cast<uint64_t>(i);
bool begin_mask_flag = (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_BEGIN_MASK)) & (1 << i_temp));
bool end_mask_flag = (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_END_MASK)) & (1 << i_temp));
bool ellipsis_mask_flag =
(static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK)) & (1 << i_temp));
bool shrink_mask_flag =
(static_cast<uint32_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << i_temp));
if (shrink_mask_flag) {
begin_i = (begin_i < 0 ? (dim_i + begin_i) : begin_i);
FMK_INT32_ADDCHECK(begin_i, kNumOne);
FMK_INT32_ADDCHECK(begin_i, kNumOne)
end_i = begin_i + kNumOne;
} else {
if (begin_mask_flag) {
@@ -153,130 +284,43 @@ Status StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_
} else {
end_i = (end_i < 0 ? (dim_i + end_i) : end_i);
}
if (ellipsis_mask_flag) {
begin_i = 0;
end_i = dim_i;
}
}
return SUCCESS;
}
Status StridedSliceKernel::StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i,
int64_t &dim_final) const {
if (stride_i == 0) {
stride_i = kDefaultStrideSize;
} else if (stride_i < 0) {
stride_i = -stride_i;
begin_i = x_dims_i - begin_i - 1;
end_i = x_dims_i - end_i - 1;
}

void StridedSliceKernel::GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, const Attr &args,
if (end_i > x_dims_i) {
end_i = x_dims_i;
}

if ((begin_i == 0) && (end_i == 0)) {
dim_final = x_dims_i;
} else {
dim_final = abs(end_i - begin_i) / stride_i;
}
return SUCCESS;
}
void StridedSliceKernel::GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims,
vector<int64_t> &v_dims) {
for (uint32_t k = 0; k < dims_size; k++) {
bool shrink_mask_i = (static_cast<uint32_t>(args.shrink_axis_mask) & (1 << k));
bool shrink_mask_i = (static_cast<uint32_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << k));
if (shrink_mask_i) {
continue;
}
v_dims.push_back(output_dims[k]);
}
}

Status StridedSliceKernel::CheckOutputDims(const std::vector<int64_t> &output_dims, const OpDescPtr attr) {
// check dim not all less than 0
for (auto dim : output_dims) {
if (dim > 0) {
return SUCCESS;
}
}
GELOGW("all output dim <=0, can't be processed. op_name : %s", attr->GetName().c_str());
return NOT_CHANGED;
}

Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector<ge::ConstGeTensorPtr> &input,
vector<ge::GeTensorPtr> &v_output) {
GELOGI("StridedSliceKernel in.");
Attr args;
Status ret = CheckAndGetAttr(attr, input, args);
if (ret != SUCCESS) {
GELOGW("Check And Get Attr failed.");
return NOT_CHANGED;
}

ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex0];
ConstGeTensorPtr weight1 = input[kStridedSliceInputIndex1];
ConstGeTensorPtr weight2 = input[kStridedSliceInputIndex2];
ConstGeTensorPtr weight3 = input[kStridedSliceInputIndex3];

const GeShape x_shape = weight0->GetTensorDesc().GetShape();
size_t dim_size = x_shape.GetDimNum();
size_t data_size = weight0->GetData().size() / sizeof(int32_t);

const int32_t *begin = reinterpret_cast<const int32_t *>(weight1->GetData().data());
const int32_t *end = reinterpret_cast<const int32_t *>(weight2->GetData().data());
const int32_t *stride = reinterpret_cast<const int32_t *>(weight3->GetData().data());
if ((begin == nullptr) || (end == nullptr) || (stride == nullptr)) {
GELOGW("input weight tensor is nullptr.");
return NOT_CHANGED;
}

std::vector<int64_t> input_dims;
std::vector<int64_t> begin_vec;
std::vector<int64_t> output_dims;
std::vector<int64_t> stride_vec;
int64_t dim_final;
for (size_t i = 0; i < dim_size; i++) {
int32_t begin_i = begin[i];
int32_t end_i = end[i];
int32_t stride_i = stride[i];
int32_t dim_i = static_cast<int32_t>(x_shape.GetDim(i));
GELOGI("%d\t %d\t %d\t %d", begin_i, end_i, stride_i, dim_i);
uint32_t i_temp = static_cast<uint32_t>(i);
bool begin_mask_i = (static_cast<uint32_t>(args.begin_mask) & (1 << i_temp));
bool end_mask_i = (static_cast<uint32_t>(args.end_mask) & (1 << i_temp));
bool shrink_mask_i = (static_cast<uint32_t>(args.shrink_axis_mask) & (1 << i_temp));
ret = MaskCal(begin_mask_i, end_mask_i, shrink_mask_i, begin_i, end_i, dim_i);
if (ret != SUCCESS) {
GELOGW("MaskCal failed, because of data overflow.");
return NOT_CHANGED;
}
if (stride_i == 0) {
stride_i = kDefaultSrideSize;
} else if (stride_i < 0) {
stride_i = -stride_i;
begin_i = x_shape.GetDim(i) - begin_i - 1;
end_i = x_shape.GetDim(i) - end_i - 1;
}
if ((begin_i == 0) && (end_i == 0)) {
dim_final = x_shape.GetDim(i);
} else {
dim_final = abs(end_i - begin_i) / stride_i;
}
output_dims.push_back(dim_final);
input_dims.push_back(x_shape.GetDim(i));
begin_vec.push_back(begin_i);
stride_vec.push_back(stride_i);
}

// Index 0 can always gets a GeTensorDesc object from any OpDescPtr.
auto output_tensor_desc = attr->GetOutputDesc(0);
GeTensorPtr output_ptr = MakeShared<GeTensor>(output_tensor_desc);
if (output_ptr == nullptr) {
GELOGW("MakeShared GeTensor failed, node name %s.", attr->GetName().c_str());
return NOT_CHANGED;
}

void *data = reinterpret_cast<void *>(const_cast<uint8_t *>(weight0->GetData().data()));
GE_CHECK_NOTNULL(data);

ret = CheckOutputDims(output_dims, attr);
if (ret != SUCCESS) {
return ret;
}

ret = OpUtils::SetOutputSliceData(data, static_cast<int64_t>(data_size), args.data_type, input_dims, begin_vec,
output_dims, output_ptr.get(), stride_vec);
if (ret != SUCCESS) {
GELOGW("SetOutputSliceData failed.");
return NOT_CHANGED;
}

GeTensorDesc &t_d = output_ptr->MutableTensorDesc();
t_d.SetDataType(static_cast<DataType>(args.data_type));

uint32_t final_dim_size = static_cast<uint32_t>(output_dims.size());
vector<int64_t> v_dims;
GetOutputDims(final_dim_size, output_dims, args, v_dims);
t_d.SetShape(GeShape(v_dims));
v_output.push_back(output_ptr);
GELOGI("StridedSliceKernel success.");
return SUCCESS;
}
REGISTER_KERNEL(STRIDEDSLICE, StridedSliceKernel);
} // namespace ge
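
The reworked MaskCal/StrideCal keep the usual strided-slice rules per dimension: a set begin_mask bit makes the slice start at 0, a set end_mask bit makes it run to the dimension size, negative indices count from the end, and a set shrink_axis_mask bit keeps exactly one element. The standalone sketch below illustrates those rules for a single dimension under those assumptions; it is not the GE implementation.

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Resolve begin/end for dimension i of size dim, honouring the three masks the kernel reads.
    static void ResolveDim(size_t i, uint32_t begin_mask, uint32_t end_mask, uint32_t shrink_mask,
                           int64_t dim, int64_t &begin, int64_t &end) {
      const uint32_t bit = 1u << i;
      if (shrink_mask & bit) {  // keep exactly one element; the axis is dropped from the output shape
        begin = begin < 0 ? dim + begin : begin;
        end = begin + 1;
        return;
      }
      begin = (begin_mask & bit) ? 0 : (begin < 0 ? dim + begin : begin);
      end = (end_mask & bit) ? dim : (end < 0 ? dim + end : end);
    }

    int main() {
      int64_t begin = -2, end = 0, dim = 5, stride = 1;
      ResolveDim(0, /*begin_mask=*/0, /*end_mask=*/1, /*shrink_mask=*/0, dim, begin, end);
      int64_t out_dim = std::llabs(end - begin) / stride;  // the general-case formula StrideCal uses
      std::printf("begin=%lld end=%lld out_dim=%lld\n",
                  static_cast<long long>(begin), static_cast<long long>(end), static_cast<long long>(out_dim));
      return 0;
    }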

+ 18
- 19
ge/host_kernels/strided_slice_kernel.h

@@ -17,34 +17,33 @@
#ifndef GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_
#define GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_

#include <vector>

#include "inc/kernel.h"
#include <vector>

namespace ge {
struct Attr {
int64_t begin_mask;
int64_t end_mask;
int64_t ellipsis_mask;
int64_t new_axis_mask;
int64_t data_type;
int64_t shrink_axis_mask;
};

class StridedSliceKernel : public Kernel {
public:
Status Compute(const OpDescPtr attr, const std::vector<ConstGeTensorPtr> &input,
vector<GeTensorPtr> &v_output) override;

private:
Status CheckAndGetAttr(const OpDescPtr &attr, const std::vector<ConstGeTensorPtr> &input, Attr &args);
Status CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1, const ConstGeTensorPtr &weight2,
const ConstGeTensorPtr &weight3) const;
Status MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, int32_t &begin_i,
int32_t &end_i, int32_t &dim_i) const;
void GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, const Attr &args,
vector<int64_t> &v_dims);
Status CheckOutputDims(const std::vector<int64_t> &output_dims, const OpDescPtr attr);
Status CheckAndGetAttr(const OpDescPtr &attr);
Status CheckInputParam(const std::vector<ConstGeTensorPtr> &input) const;
Status InitParamWithAttrs(const std::vector<ConstGeTensorPtr> &input, std::vector<int64_t> &input_dims,
std::vector<int64_t> &begin_vec, std::vector<int64_t> &output_dims,
std::vector<int64_t> &stride_vec);
Status MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const;
Status StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i,
int64_t &dim_final) const;
void ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num, vector<int64_t> &x_dims);

void GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, vector<int64_t> &v_dims);

map<string, uint32_t> attr_value_map_ = {{STRIDE_SLICE_ATTR_BEGIN_MASK, 0},
{STRIDE_SLICE_ATTR_END_MASK, 0},
{STRIDE_SLICE_ATTR_ELLIPSIS_MASK, 0},
{STRIDE_SLICE_ATTR_NEW_AXIS_MASK, 0},
{STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK, 0}};
};
} // namespace ge
#endif // GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_

+ 6
- 0
ge/hybrid/executor/hybrid_model_executor.cc

@@ -27,6 +27,12 @@ const char *const kEnvProfilingLevel = "HYBRID_PROFILING_LEVEL";
HybridModelExecutor::HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream)
: model_(model), device_id_(device_id), stream_(stream) {}

HybridModelExecutor::~HybridModelExecutor() {
if (context_.rt_gen_context != nullptr) {
(void)rtCtxDestroy(context_.rt_gen_context);
}
}

Status HybridModelExecutor::Init() {
GELOGD("Start to init HybridGraphEngine.");
GE_CHK_STATUS_RET_NOLOG(InitExecutionContext());


+ 1
- 1
ge/hybrid/executor/hybrid_model_executor.h

@@ -35,7 +35,7 @@ class HybridModelExecutor {

HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream);

~HybridModelExecutor() = default;
~HybridModelExecutor();

Status Init();



+ 2
- 1
ge/hybrid/model/hybrid_model_builder.cc

@@ -618,7 +618,8 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_
}

int64_t var_size = CalcVarSizeInBytes(*tensor_desc);
tensor.reset(new (std::nothrow) TensorValue(dev_mem, var_size));
// var size is only for checking, will not allocate any memory by it
tensor.reset(new (std::nothrow) TensorValue(dev_mem, static_cast<size_t>(var_size)));
GE_CHECK_NOTNULL(tensor);
return SUCCESS;
}


+ 2
- 2
ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc

@@ -197,7 +197,7 @@ void AicpuExtInfoHandler::GetShapeAndType(const AicpuShapeAndType *shape_and_typ
dims.emplace_back(tmpDim);
}
data_type = static_cast<DataType>(shape_and_type->type);
shape = std::move(GeShape(dims));
shape = GeShape(dims);
}
} // namespace hybrid
} // namespace ge
} // namespace ge

+ 1
- 0
ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc

@@ -48,6 +48,7 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
std::vector<ConstGeTensorPtr> inputs;
for (int32_t i = 0; i < context.NumInputs(); ++i) {
const auto &input_desc = op_desc->GetInputDesc(i);
GE_CHECK_NOTNULL(context.GetInput(i));
auto in_tensor = MakeShared<GeTensor>(input_desc, reinterpret_cast<const uint8_t *>(context.GetInput(i)->GetData()),
context.GetInput(i)->GetSize());
GE_CHECK_NOTNULL(in_tensor);


+ 13
- 7
ge/init/gelib.cc

@@ -167,7 +167,6 @@ Status GELib::SystemInitialize(const map<string, string> &options) {

// In train and infer, profiling is always needed.
InitOptions(options);
InitProfiling(this->options_);
auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS,
@@ -175,23 +174,23 @@ Status GELib::SystemInitialize(const map<string, string> &options) {
return FAILED);
// 1.`is_train_mode_` means case: train
// 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer
// These two cases need to call `InitSystemWithOptions->rtGetDeviceIndexByPhyId`
// to convert the physical device id to a logical device id.
// Note: rtGetDeviceIndexByPhyId returns logical id `0` when the input physical device id is `0`.
if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) {
InitProfiling(this->options_, true);
status = InitSystemWithOptions(this->options_);
} else {
InitProfiling(this->options_);
status = InitSystemWithoutOptions();
}
return status;
}

void GELib::InitProfiling(Options &options) {
void GELib::InitProfiling(Options &options, bool convert_2_phy_device_id) {
GELOGI("Init Profiling. session Id: %ld, device id:%d ", options.session_id, options.device_id);
std::lock_guard<std::mutex> lock(status_mutex_);
GetContext().Init();
// Profiling init
if (ProfilingManager::Instance().Init(options) != SUCCESS) {
if (ProfilingManager::Instance().Init(options, convert_2_phy_device_id) != SUCCESS) {
GELOGW("Profiling init failed.");
}
}
@@ -362,6 +361,9 @@ Status GELib::Finalize() {
GELOGW("not initialize");
return SUCCESS;
}
if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) {
GE_CHK_RT_RET(rtSetDevice(options_.device_id));
}
Status final_state = SUCCESS;
Status mid_state;
GELOGI("engineManager finalization.");
@@ -412,10 +414,14 @@ Status GELib::Finalize() {

GetMutableGlobalOptions().erase(ENABLE_SINGLE_STREAM);

if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) {
GE_CHK_RT_RET(rtDeviceReset(options_.device_id));
}

instancePtr_ = nullptr;
init_flag_ = false;
if (final_state != SUCCESS) {
GELOGE(FAILED, "MemManager finalization.");
GELOGE(FAILED, "finalization failed.");
return final_state;
}
GELOGI("finalization success.");


+ 1
- 1
ge/init/gelib.h

@@ -68,7 +68,7 @@ class GELib {
// get incre build cache path
const std::string &GetIncreBuildCachePath() const { return incre_build_cache_path_; }

void InitProfiling(Options &options);
void InitProfiling(Options &options, bool convert_2_phy_device_id = false);
void ShutDownProfiling();

Status InitSystemWithoutOptions();


+ 7
- 7
ge/session/inner_session.cc

@@ -18,6 +18,7 @@
#include <map>
#include <memory>
#include <vector>
#include "common/dump/dump_properties.h"
#include "common/util.h"
#include "framework/common/debug/ge_log.h"
#include "graph/ge_context.h"
@@ -30,6 +31,8 @@

namespace ge {
namespace {
const int32_t kDumpStatus = 0;

Status CheckReuseMemoryOption(const std::map<string, string> &options) {
auto iter = options.find(OPTION_EXEC_DISABLE_REUSED_MEMORY);
if (iter != options.end()) {
@@ -47,7 +50,7 @@ Status CheckReuseMemoryOption(const std::map<string, string> &options) {
} // namespace

static std::mutex mutex_; // BuildGraph and RunGraph use
bool InnerSession::is_dump_server_inited_ = false;
InnerSession::InnerSession(uint64_t session_id, const std::map<string, string> &options)
: init_flag_(false), session_id_(session_id), options_(options), graph_manager_(domi::GetContext()) {}

@@ -71,12 +74,12 @@ Status InnerSession::Initialize() {

GE_CHK_RT_RET(rtSetDevice(GetContext().DeviceId()));

PropertiesManager::Instance().GetDumpProperties(session_id_).InitByOptions();
DumpProperties dump_properties;
dump_properties.InitByOptions();

ret = graph_manager_.Initialize(options_);
if (ret != SUCCESS) {
GELOGE(ret, "[InnerSession:%lu] initialize failed.", session_id_);
PropertiesManager::Instance().RemoveDumpProperties(session_id_);
return ret;
}

@@ -84,7 +87,6 @@ Status InnerSession::Initialize() {
if (ret != SUCCESS) {
GELOGE(ret, "failed to set malloc size");
(void)graph_manager_.Finalize();
PropertiesManager::Instance().RemoveDumpProperties(session_id_);
GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));
return ret;
}
@@ -95,7 +97,6 @@ Status InnerSession::Initialize() {
ret = VarManager::Instance(session_id_)->Init(version, session_id_, DEFAULT_DEVICE_ID, DEFAULT_JOB_ID);
if (ret != SUCCESS) {
GELOGE(ret, "failed to init session instance");
PropertiesManager::Instance().RemoveDumpProperties(session_id_);
}
init_flag_ = true;
return SUCCESS;
@@ -120,8 +121,6 @@ Status InnerSession::Finalize() {
GELOGI("VarManager free var memory.");
(void)VarManager::Instance(session_id_)->FreeVarMemory();

PropertiesManager::Instance().RemoveDumpProperties(session_id_);

GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));

return ret;
@@ -297,4 +296,5 @@ Status InnerSession::SaveVariables(const Graph &graph, const std::vector<std::st
const std::vector<Tensor> &outputs, std::vector<Tensor> &var_values) {
return graph_manager_.SaveVariables(graph, var_names, outputs, var_values);
}

} // namespace ge

+ 1
- 0
ge/session/inner_session.h

@@ -71,6 +71,7 @@ class InnerSession {
std::mutex resource_mutex_; // AddGraph, RemoveGraph and Finalize use
void UpdateThreadContext(const std::map<std::string, std::string> &options);
void UpdateThreadContext(uint32_t graph_id);
static bool is_dump_server_inited_;
};
} // namespace ge



+ 38
- 4
ge/single_op/single_op.cc

@@ -24,6 +24,7 @@
#include "graph/load/new_model_manager/model_utils.h"
#include "runtime/mem.h"
#include "single_op/single_op_manager.h"
#include "graph/load/new_model_manager/model_manager.h"

namespace ge {
namespace {
@@ -42,6 +43,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() {
delete task;
task = nullptr;
}
GELOGI("SingleOp destory sessionId = %lu", aicpu_session_id_);
ModelManager::GetInstance()->DestroyAicpuSession(aicpu_session_id_);
}

Status SingleOp::ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs) {
@@ -166,6 +169,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c
if (ret != SUCCESS) {
return ret;
}
ret = task->OpenDump(args_, stream_);
if (ret != SUCCESS) {
GELOGE(ret, "Open dump failed");
return ret;
}
}

return ret;
@@ -173,9 +181,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c

void SingleOp::SetStream(rtStream_t stream) { stream_ = stream; }

void SingleOp::SetSessionID(uint64_t session_id) { aicpu_session_id_ = session_id; }

DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex, rtStream_t stream)
: resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {}

DynamicSingleOp::~DynamicSingleOp() {
GELOGI("DynamicSingleOp destory sessionId = %lu", aicpu_session_id_);
ModelManager::GetInstance()->DestroyAicpuSession(aicpu_session_id_);
}

Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs) const {
if (inputs.size() != input_desc.size()) {
@@ -236,14 +251,22 @@ Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace
return SUCCESS;
}

Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, const vector<void *> &inputs,
vector<GeTensorDesc> &output_desc, vector<void *> &outputs) {
GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc));

std::vector<void *> workspace_buffers;
GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers));

return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_);
}

Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, const vector<DataBuffer> &input_buffers,
vector<GeTensorDesc> &output_desc, vector<DataBuffer> &output_buffers) {
GE_CHECK_NOTNULL(op_task_);
GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers));
std::lock_guard<std::mutex> lk(*stream_mutex_);
GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc));
std::vector<void *> workspace_buffers;
GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers));

std::vector<void *> inputs;
std::vector<void *> outputs;
for (auto &buffer : input_buffers) {
@@ -252,6 +275,17 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, con
for (auto &buffer : output_buffers) {
outputs.emplace_back(buffer.data);
}
return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_);

if (op_task_->GetOpTaskType() == OP_TASK_TBE) {
return ExecuteTbeTask(input_desc, inputs, output_desc, outputs);
} else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) {
return op_task_->LaunchKernel(input_desc, inputs, output_desc, outputs, stream_);
} else {
GELOGE(UNSUPPORTED, "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u",
op_task_->GetOpTaskType());
return UNSUPPORTED;
}
}

void DynamicSingleOp::SetSessionID(uint64_t session_id) { aicpu_session_id_ = session_id; }
} // namespace ge
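
Two related changes in single_op.cc: each SingleOp and DynamicSingleOp now remembers the AICPU session id it was built with and destroys that session in its destructor, and the ids themselves come from an atomic counter in single_op_model.cc. Below is a compact sketch of pairing a monotonically increasing id with scope-bound cleanup; the names are illustrative, and the real teardown goes through ModelManager::DestroyAicpuSession.

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    // One process-wide counter hands every op a fresh session id.
    static std::atomic<uint64_t> g_next_session_id{0};

    class ScopedAicpuSession {
     public:
      ScopedAicpuSession() : id_(g_next_session_id++) {
        std::printf("create session %llu\n", static_cast<unsigned long long>(id_));
      }
      // Destroying the op tears down exactly the session it created, and nothing else.
      ~ScopedAicpuSession() {
        std::printf("destroy session %llu\n", static_cast<unsigned long long>(id_));
      }
      uint64_t id() const { return id_; }

     private:
      uint64_t id_;
    };

    int main() {
      ScopedAicpuSession op_a;  // session 0
      ScopedAicpuSession op_b;  // session 1
      return 0;
    }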

+ 10
- 2
ge/single_op/single_op.h

@@ -27,6 +27,7 @@
#include "framework/executor/ge_executor.h"
#include "runtime/stream.h"
#include "task/op_task.h"
#include "cce/aicpu_engine_struct.h"

namespace ge {
class SingleOp {
@@ -36,6 +37,7 @@ class SingleOp {

Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
void SetStream(rtStream_t stream);
void SetSessionID(uint64_t session_id);

private:
Status ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
@@ -50,6 +52,7 @@ class SingleOp {
std::vector<void *> output_addr_list_;
std::vector<size_t> output_sizes_;
std::vector<uintptr_t> args_;
uint64_t aicpu_session_id_ = 0;

std::vector<OpTask *> tasks_;
std::vector<std::vector<uintptr_t *>> arg_table_;
@@ -58,9 +61,10 @@ class SingleOp {
class DynamicSingleOp {
public:
DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream);
~DynamicSingleOp() = default;
~DynamicSingleOp();
Status ExecuteAsync(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs);
void SetSessionID(uint64_t session_id);

private:
friend class SingleOpModel;
@@ -69,12 +73,16 @@ class DynamicSingleOp {

Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes, std::vector<void *> &workspaces);

std::unique_ptr<TbeOpTask> op_task_;
Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, const vector<void *> &inputs,
vector<GeTensorDesc> &output_desc, vector<void *> &outputs);

std::unique_ptr<OpTask> op_task_;
uintptr_t resource_id_ = 0;
std::mutex *stream_mutex_;
rtStream_t stream_ = nullptr;
size_t num_inputs_ = 0;
size_t num_outputs_ = 0;
uint64_t aicpu_session_id_ = 0;
};
} // namespace ge
#endif // GE_SINGLE_OP_SINGLE_OP_H_

+ 60
- 10
ge/single_op/single_op_model.cc View File

@@ -16,6 +16,7 @@

#include "single_op/single_op_model.h"

#include <atomic>
#include <memory>
#include <string>
#include <vector>
@@ -31,6 +32,8 @@
#include "task/aicpu_kernel_task_builder.h"
#include "task/tbe_task_builder.h"

static std::atomic<std::uint64_t> aicpu_sessionid(0);

using domi::TaskDef;
using std::unique_ptr;
using std::vector;
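
The static std::atomic counter introduced above hands out a fresh AICPU session id each time a single op is built. A minimal standalone sketch of the same allocation pattern (not GE code; names are illustrative):

#include <atomic>
#include <cstdint>
#include <cstdio>

static std::atomic<std::uint64_t> next_session_id(0);

// fetch-and-increment: every caller gets a distinct id, even across threads.
std::uint64_t AllocateSessionId() { return next_session_id++; }

int main() {
  std::printf("first id: %llu\n", static_cast<unsigned long long>(AllocateSessionId()));
  std::printf("second id: %llu\n", static_cast<unsigned long long>(AllocateSessionId()));
  return 0;
}
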
@@ -250,17 +253,21 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
}
single_op.tasks_.emplace_back(task);
} else {
GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernek are supported, but got %u", context.kernel_type());
GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type());
return UNSUPPORTED;
}
} else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
GELOGD("Building AICPU_TF task");
OpTask *task = nullptr;
auto ret = BuildKernelExTask(task_def.kernel_ex(), single_op, &task);
AiCpuTask *aicpu_task = nullptr;
bool depend_compute_flag = false;
uint64_t singleop_sessionid = aicpu_sessionid++;
GELOGI("Build singleOp, sessionId = %lu", singleop_sessionid);
auto ret = BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, false, depend_compute_flag, singleop_sessionid);
if (ret != SUCCESS) {
return ret;
}
single_op.tasks_.emplace_back(task);
single_op.tasks_.emplace_back(aicpu_task);
single_op.SetSessionID(singleop_sessionid);
} else {
// skip
GELOGD("Skip task type: %d", static_cast<int>(task_type));
@@ -316,7 +323,8 @@ Status SingleOpModel::BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTa
return SUCCESS;
}

Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, SingleOp &single_op, OpTask **task) {
Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, bool dynamic_flag,
bool &depend_compute_flag, uint64_t session_id) {
auto iter = op_list_.find(kernel_def.op_index());
if (iter == op_list_.end()) {
GELOGE(INTERNAL_ERROR, "op desc not found. op index = %u", kernel_def.op_index());
@@ -329,11 +337,12 @@ Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, Sin
return MEMALLOC_FAILED;
}
auto builder = AiCpuTaskBuilder(iter->second->GetOpDesc(), kernel_def);
auto ret = builder.BuildTask(*aicpu_task, model_params_);
auto ret = builder.BuildTask(*aicpu_task, model_params_, dynamic_flag, session_id);
if (ret != SUCCESS) {
GELOGE(ret, "build aicpu_TF op task failed");
return ret;
}
depend_compute_flag = (aicpu_task->GetUnknownType() == DEPEND_COMPUTE);

*task = aicpu_task.release();
return SUCCESS;
@@ -370,6 +379,27 @@ Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
return BuildTaskList(single_op);
}

Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) {
const domi::KernelDef &kernel_def = task_def.kernel();
const auto &context = kernel_def.context();
auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type());
if (kernel_type == cce::ccKernelType::TE) {
GELOGD("Building TBE task");
TbeOpTask *tbe_task = nullptr;
GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &tbe_task));
single_op.op_task_.reset(tbe_task);
} else if (kernel_type == cce::ccKernelType::AI_CPU) {
GELOGD("Building AICPU_CC task");
OpTask *task = nullptr;
GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task));
single_op.op_task_.reset(task);
} else {
GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type());
return UNSUPPORTED;
}
return SUCCESS;
}

Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
auto ge_model = model_helper_.GetGeModel();
GE_CHECK_NOTNULL(ge_model);
@@ -385,10 +415,30 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
GELOGE(UNSUPPORTED, "Do not support dynamic op with multiple tasks.");
return UNSUPPORTED;
}

TbeOpTask *task = nullptr;
GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &task));
single_op.op_task_.reset(task);
GE_CHK_STATUS_RET_NOLOG(BuildModelTaskKernel(task_def, single_op));
} else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
if (single_op.op_task_ != nullptr) {
GELOGE(UNSUPPORTED, "Do not support dynamic op with multiple tasks.");
return UNSUPPORTED;
}
GELOGD("Building AICPU_TF task");
AiCpuTask *aicpu_task = nullptr;
bool depend_compute_flag = false;
uint64_t dynamic_singleop_sessionid = aicpu_sessionid++;
GELOGI("Build dynamic singleOp, sessionId = %lu", dynamic_singleop_sessionid);
GE_CHK_STATUS_RET_NOLOG(
BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, true, depend_compute_flag, dynamic_singleop_sessionid));
if (depend_compute_flag) {
if (i >= tasks.size() - 1) {
GELOGE(FAILED, "The copy task of the fourth operator was not found.");
return FAILED;
}
++i;
const TaskDef &copy_task_def = tasks[i];
GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex()));
}
single_op.op_task_.reset(aicpu_task);
single_op.SetSessionID(dynamic_singleop_sessionid);
} else {
// skip
GELOGD("Skip task type: %d", static_cast<int>(task_type));


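In the dynamic-op path above, a task whose unknown type is DEPEND_COMPUTE is expected to be immediately followed by its memcpy task, which the loop consumes in the same iteration. A standalone sketch of that look-ahead pattern (TaskDef here is a plain placeholder struct, not the domi proto):

#include <cstdio>
#include <vector>

struct TaskDef { bool needs_copy_task; };

// When a task needs a paired copy task, the next list entry is taken as that
// copy task; a missing partner at the end of the list is an error.
int PairCopyTasks(const std::vector<TaskDef> &tasks) {
  for (size_t i = 0; i < tasks.size(); ++i) {
    if (!tasks[i].needs_copy_task) {
      continue;
    }
    if (i + 1 >= tasks.size()) {
      std::fprintf(stderr, "no copy task after task %zu\n", i);
      return -1;
    }
    ++i;  // consume the paired copy task
  }
  return 0;
}
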
+ 3
- 1
ge/single_op/single_op_model.h View File

@@ -66,8 +66,10 @@ class SingleOpModel {
Status BuildTaskList(SingleOp &single_op);
Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op);
Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, SingleOp &single_op, OpTask **task);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, bool dynamic_flag,
bool &depend_compute_flag, uint64_t session_id);
Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task);
Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op);

static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
void ParseArgTable(TbeOpTask *task, SingleOp &op);


+ 23
- 0
ge/single_op/task/aicpu_kernel_task_builder.cc View File

@@ -54,6 +54,29 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task) {
task.SetSoName(so_name);
task.SetkernelName(kernel_name);
task.op_desc_ = op_desc_;

task.num_inputs_ = op_desc_->GetInputsSize();
task.num_outputs_ = op_desc_->GetOutputsSize();

// get kernel_ext_info
auto &kernel_ext_info = kernel_def_.kernel_ext_info();
auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size();
GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED,
"task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(),
kernel_ext_info_size);

ret = task.SetExtInfoAndType(kernel_ext_info);
if (ret != SUCCESS) {
GELOGE(ret, "Init ext info failed.");
return ret;
}

auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(task.args_.get());
if (task.ext_info_addr_dev_ != nullptr) {
aicpu_param_head->extInfoLength = kernel_ext_info.size();
aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_);
}

return SUCCESS;
}
} // namespace ge
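
The builder above now validates that the serialized kernel_ext_info payload matches its declared size before parsing it and wiring the device copy into the AicpuParamHead. A generic sketch of that guard using only standard C++ (the GE macros and runtime calls are not reproduced here):

#include <cstdint>
#include <cstdio>
#include <string>

// Reject a serialized blob whose payload length disagrees with the size recorded
// next to it, as BuildTask does for kernel_ext_info / kernel_ext_info_size.
bool ValidateExtInfo(const std::string &blob, std::uint32_t declared_size) {
  if (blob.size() != static_cast<size_t>(declared_size)) {
    std::fprintf(stderr, "ext info size mismatch: payload=%zu declared=%u\n",
                 blob.size(), declared_size);
    return false;
  }
  return true;
}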

+ 50
- 20
ge/single_op/task/aicpu_task_builder.cc View File

@@ -30,13 +30,13 @@ Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<vo
size_t arg_size = kernel_def_.args_size();
auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtMallocHost failed, size = %zu, ret = %d", arg_size, rt_ret);
GELOGE(RT_FAILED, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret);
return RT_FAILED;
}

const void *src_addr = reinterpret_cast<const void *>(addresses.data());
uint64_t src_len = sizeof(void *) * addresses.size();
rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_HOST);
rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
(void)rtFree(*io_addr);
GELOGE(RT_FAILED, "rtMemcpy addresses failed, ret = %d", rt_ret);
@@ -69,8 +69,8 @@ Status AiCpuTaskBuilder::SetKernelArgs(void **args, STR_FWK_OP_KERNEL &fwk_op_ke
return RT_FAILED;
}

rt_ret =
rtMemcpy(fwk_op_args, sizeof(STR_FWK_OP_KERNEL), &fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_HOST);
rt_ret = rtMemcpy(fwk_op_args, sizeof(STR_FWK_OP_KERNEL), &fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL),
RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
(void)rtFree(fwk_op_args);
GELOGE(RT_FAILED, "copy args failed, ret = %d", rt_ret);
@@ -80,7 +80,8 @@ Status AiCpuTaskBuilder::SetKernelArgs(void **args, STR_FWK_OP_KERNEL &fwk_op_ke
return SUCCESS;
}

Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param) {
Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, const SingleOpModelParam &param,
bool dynamic_flag) {
if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL),
kernel_def_.args_size());
@@ -88,31 +89,60 @@ Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam
}
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace);
if (ws_addr_vec.empty()) {
GELOGE(PARAM_INVALID, "workspace Data Address is empty.");
return PARAM_INVALID;
}
auto rt_ret = rtMemcpy(ws_addr_vec[0], kernel_def_.task_info_size(), kernel_def_.task_info().data(),
kernel_def_.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(FAILED, "rtMemcpy error: 0x%X", rt_ret);
return FAILED;
if (dynamic_flag) {
GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM));
} else {
if (ws_addr_vec.empty()) {
GELOGE(PARAM_INVALID, "workspace Data Address is empty.");
return PARAM_INVALID;
}
*kernel_workspace = ws_addr_vec[0];
}
GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(), kernel_def_.task_info().data(),
kernel_def_.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));

void *io_addr = nullptr;
auto ret = SetInputOutputAddr(&io_addr, BuildTaskUtils::JoinAddresses(addresses));
auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses));
if (ret != SUCCESS) {
return ret;
}
return SUCCESS;
}

Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag,
uint64_t session_id) {
void *io_addr = nullptr;
void *kernel_workspace = nullptr;
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&io_addr, &kernel_workspace, param, dynamic_flag));

STR_FWK_OP_KERNEL fwk_op_kernel = {0};
ret = SetFmkOpKernel(io_addr, ws_addr_vec[0], fwk_op_kernel);
auto ret = SetFmkOpKernel(io_addr, kernel_workspace, fwk_op_kernel);
if (ret != SUCCESS) {
(void)rtFree(io_addr);
return ret;
}

task.op_desc_ = op_desc_;
task.num_inputs_ = op_desc_->GetInputsSize();
task.num_outputs_ = op_desc_->GetOutputsSize();

// get kernel_ext_info
auto &kernel_ext_info = kernel_def_.kernel_ext_info();
auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size();
GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED,
"task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(),
kernel_ext_info_size);
GE_CHK_STATUS_RET(task.SetExtInfoAndType(kernel_ext_info), "Init ext info failed.");

if (task.ext_info_addr_dev_ != nullptr) {
fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_);
fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoLen = kernel_ext_info_size;
}
GE_CHK_STATUS_RET(task.InitForSummaryAndCopy(), "AiCpuTask init for summary and copy task failed.");

// Create session
auto session_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID;
fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID = session_id;
GELOGI("Begin to CreateAicpuSession, session id: %lu", session_id);
GE_CHECK_NOTNULL(ModelManager::GetInstance());
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS,
GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id);
@@ -127,8 +157,8 @@ Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam
task.op_type_ = op_desc_->GetName();
task.io_addr_ = io_addr;
task.task_info_ = kernel_def_.task_info();
task.workspace_addr_ = ws_addr_vec[0];
task.op_desc_ = op_desc_;
task.workspace_addr_ = kernel_workspace;
task.dynamic_flag_ = dynamic_flag;

auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_);
GELOGI("[TASK_INFO] %s %s", task.task_info_.c_str(), debug_info.c_str());


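InitWorkspaceAndIO above either allocates a dedicated device workspace (dynamic case) or reuses the first workspace address supplied by the model (static case) before copying the task info into it. The sketch below mirrors that branch with std::malloc and std::memcpy standing in for rtMalloc and rtMemcpy; it illustrates the control flow only, not the device code.

#include <cstdlib>
#include <cstring>
#include <vector>

// Dynamic ops get a freshly allocated workspace (caller frees it later);
// static ops reuse the model's first workspace address. Either way the
// kernel task info is copied into the chosen workspace.
int SelectWorkspace(bool dynamic_flag, const std::vector<void *> &model_workspaces,
                    const void *task_info, size_t task_info_size, void **workspace) {
  if (dynamic_flag) {
    *workspace = std::malloc(task_info_size);
    if (*workspace == nullptr) {
      return -1;
    }
  } else {
    if (model_workspaces.empty()) {
      return -1;  // "workspace Data Address is empty"
    }
    *workspace = model_workspaces[0];
  }
  std::memcpy(*workspace, task_info, task_info_size);
  return 0;
}
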
+ 3
- 1
ge/single_op/task/aicpu_task_builder.h View File

@@ -29,12 +29,14 @@ class AiCpuTaskBuilder {
AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def);
~AiCpuTaskBuilder() = default;

Status BuildTask(AiCpuTask &task, const SingleOpModelParam &param);
Status BuildTask(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag, uint64_t session_id);

private:
static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel);
Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses);
Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel);
Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, const SingleOpModelParam &param,
bool dynamic_flag);

const OpDescPtr op_desc_;
const domi::KernelExDef &kernel_def_;


+ 408
- 21
ge/single_op/task/op_task.cc View File

@@ -20,8 +20,10 @@
#include <chrono>
#include <thread>

#include "aicpu/common/aicpu_task_struct.h"
#include "common/dump/dump_manager.h"
#include "common/dump/dump_op.h"
#include "common/formats/formats.h"
#include "framework/common/debug/log.h"
#include "register/op_tiling.h"
#include "runtime/rt.h"
@@ -30,24 +32,31 @@ namespace ge {
namespace {
constexpr int kLaunchRetryTimes = 1000;
constexpr int kSleepTime = 10;
constexpr uint64_t kReleaseFlag = 1;
constexpr int kCopyNum = 2;
} // namespace

Status OpTask::OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream) {
if (DumpManager::GetInstance().IsDumpOpen()) {
Status OpTask::OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream) {
if (DumpManager::GetInstance().GetDumpProperties().IsSingleOpNeedDump()) {
GELOGI("Dump is open in single op,start to set dump info");
std::vector<uint64_t> input_addrs;
std::vector<uint64_t> output_adds;
auto input_size = op_desc->GetAllInputsDesc().size();
auto output_size = op_desc->GetOutputsSize();
auto input_size = op_desc_->GetInputsSize();
auto output_size = op_desc_->GetOutputsSize();
auto all_size = io_addr.size();
if (input_size + output_size != all_size) {
GELOGE(FAILED, "io_addr size is not equal input and output size");
return FAILED;
}
for (size_t i = 0; i < input_size; i++) {
uint64_t input_addr = *(reinterpret_cast<const uint64_t *>(arg) + i);
uint64_t input_addr = static_cast<uint64_t>(io_addr[i]);
input_addrs.emplace_back(input_addr);
}
for (size_t j = 0; j < output_size; j++) {
uint64_t output_addr = *(reinterpret_cast<const uint64_t *>(arg) + input_size + j);
uint64_t output_addr = static_cast<uint64_t>(io_addr[input_size + j]);
output_adds.emplace_back(output_addr);
}
dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc, input_addrs, output_adds, stream);
dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream);
auto status = dump_op_.LaunchDumpOp();
if (status != SUCCESS) {
GELOGE(status, "Launch dump op failed in single op");
@@ -112,11 +121,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) {
}
GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());

auto status = OpenDump(args_.get(), op_desc_, stream);
if (status != SUCCESS) {
GELOGE(status, "Open dump failed in tbe single op %s", stub_name_.c_str());
return status;
}
return SUCCESS;
}

@@ -218,6 +222,119 @@ Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *
return SUCCESS;
}

AiCpuBaseTask::~AiCpuBaseTask() {
if (ext_info_addr_dev_ != nullptr) {
(void)rtFree(ext_info_addr_dev_);
}
}

Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info) {
if (kernel_ext_info.empty()) {
GELOGI("Kernel_ext_info is empty, no need copy to device.");
return SUCCESS;
}

int32_t unknown_shape_type_val = 0;
(void)AttrUtils::GetInt(op_desc_, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
GELOGD("Get unknown_type is %d.", unknown_shape_type_val);
unknown_type_ = static_cast<UnknowShapeOpType>(unknown_shape_type_val);

aicpu_ext_handle_.reset(
new (std::nothrow)::ge::hybrid::AicpuExtInfoHandler(op_desc_->GetName(), num_inputs_, num_outputs_, unknown_type_));
GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, FAILED, "Malloc aicpu_ext_handle mem failed!");

Status ret = aicpu_ext_handle_->Parse(kernel_ext_info);
if (ret != SUCCESS) {
GELOGE(ret, "Parse kernel ext info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
return ret;
}

GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, kernel_ext_info.size(), RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, kernel_ext_info.size(), kernel_ext_info.data(), kernel_ext_info.size(),
RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
std::vector<GeTensorDesc> &output_desc) {
GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_);
if (num_inputs_ == 0 && num_outputs_ == 0) {
GELOGI("No input and output, no need update ext info.");
return SUCCESS;
}

GE_CHECK_NOTNULL(aicpu_ext_handle_);
for (size_t i = 0; i < num_inputs_; ++i) {
GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(i, input_desc[i]),
"Input[%zu] update input shape failed.", i);
}

if (unknown_type_ != DEPEND_COMPUTE) {
for (size_t j = 0; j < num_outputs_; ++j) {
GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]),
"Output[%zu] UpdateOutputShapeAndType failed.", j);
GELOGD("Output[%zu] shape and type in ext info updated.", j);
}
}

GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_,
aicpu_ext_handle_->GetExtInfoLen(), // check size
aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(),
RT_MEMCPY_HOST_TO_DEVICE));

GELOGI("Update ext info end.");
return SUCCESS;
}

Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
if (num_outputs_ == 0) {
GELOGD("AiCpuBaseTask output_num is 0, no need update output shape.");
return SUCCESS;
}
GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape.");

GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_,
aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST));

for (size_t i = 0; i < num_outputs_; ++i) {
GeShape shape;
DataType data_type;
aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), "AiCpuCCTask Update [%zu]th output shape failed.",
i);
}
GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished.");
return SUCCESS;
}

Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc) {
auto shape_old = output_desc.GetShape();
output_desc.SetShape(shape_new);
GELOGD("Update AiCpuBaseTask shape from %s to %s", shape_old.ToString().c_str(), shape_new.ToString().c_str());

auto origin_shape_old = output_desc.GetOriginShape();
auto origin_format = output_desc.GetOriginFormat();
auto format = output_desc.GetFormat();
if (origin_format == format) {
output_desc.SetOriginShape(shape_new);
return SUCCESS;
}

std::vector<int64_t> origin_dims_new;

auto trans_ret =
formats::TransShape(format, shape_new.GetDims(), output_desc.GetDataType(), origin_format, origin_dims_new);
GE_CHK_STATUS_RET(trans_ret, "AiCpuTask originFormat[%d] is not same as format[%d], but TransShape failed, shape=%s.",
origin_format, format, shape_new.ToString().c_str());

auto origin_shape_new = GeShape(origin_dims_new);
output_desc.SetOriginShape(origin_shape_new);
GELOGD("AiCpuTask originFormat[%d] is not same as format[%d], need update from %s ro %s.", origin_format, format,
origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str());
return SUCCESS;
}

AiCpuTask::~AiCpuTask() {
if (args_ != nullptr) {
(void)rtFree(args_);
@@ -226,6 +343,43 @@ AiCpuTask::~AiCpuTask() {
if (io_addr_ != nullptr) {
(void)rtFree(io_addr_);
}

if (dynamic_flag_ && workspace_addr_ != nullptr) {
(void)rtFree(workspace_addr_);
}
if (copy_workspace_buf_ != nullptr) {
(void)rtFree(copy_workspace_buf_);
}

if (copy_ioaddr_dev_ != nullptr) {
(void)rtFree(copy_ioaddr_dev_);
}

if (copy_input_release_flag_dev_ != nullptr) {
(void)rtFree(copy_input_release_flag_dev_);
}

if (copy_input_data_size_dev_ != nullptr) {
(void)rtFree(copy_input_data_size_dev_);
}

if (copy_input_src_dev_ != nullptr) {
(void)rtFree(copy_input_src_dev_);
}

if (copy_input_dst_dev_ != nullptr) {
(void)rtFree(copy_input_dst_dev_);
}

if (copy_task_args_buf_ != nullptr) {
(void)rtFree(copy_task_args_buf_);
}

for (auto summary : output_summary_) {
if (summary != nullptr) {
(void)rtFree(summary);
}
}
}

const void *AiCpuTask::GetIOAddr() const { return io_addr_; }
@@ -247,15 +401,225 @@ Status AiCpuTask::LaunchKernel(rtStream_t stream) {
}
GELOGI("[TASK_INFO] is %s", this->task_info_.c_str());

auto status = OpenDump(args_, op_desc_, stream);
if (status != SUCCESS) {
GELOGE(status, "Open dump failed in aicpu single op %s", op_type_.c_str());
return status;
}
GELOGD("Done launch kernel successfully. task = %s", this->op_type_.c_str());
return SUCCESS;
}

Status AiCpuTask::PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm) {
std::vector<uint64_t> copy_input_release_flag;
std::vector<uint64_t> copy_input_data_size;
std::vector<uint64_t> copy_input_src;
std::vector<uint64_t> copy_input_dst;

for (size_t i = 0; i < num_outputs_; ++i) {
const auto &summary = output_summary_host_[i];
GELOGI("Node out[%zu] summary, shape data=0x%lx, shape data size=%lu, raw data=0x%lx, raw data size=%lu.", i,
summary.shape_data_ptr, summary.shape_data_size, summary.raw_data_ptr, summary.raw_data_size);
auto output = outputs[i];
copy_input_release_flag.emplace_back(kReleaseFlag);
copy_input_data_size.emplace_back(summary.raw_data_size);
copy_input_src.emplace_back(summary.raw_data_ptr);
copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output));

const auto &shape_buffer = out_shape_hbm[i];
copy_input_release_flag.emplace_back(kReleaseFlag);
copy_input_data_size.emplace_back(summary.shape_data_size);
copy_input_src.emplace_back(summary.shape_data_ptr);
copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(shape_buffer));
}

const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);

GE_CHK_RT_RET(rtMemcpy(copy_input_release_flag_dev_, copy_input_buf_len, copy_input_release_flag.data(),
copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT_RET(rtMemcpy(copy_input_data_size_dev_, copy_input_buf_len, copy_input_data_size.data(), copy_input_buf_len,
RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT_RET(rtMemcpy(copy_input_src_dev_, copy_input_buf_len, copy_input_src.data(), copy_input_buf_len,
RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT_RET(rtMemcpy(copy_input_dst_dev_, copy_input_buf_len, copy_input_dst.data(), copy_input_buf_len,
RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuTask::ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm) {
for (size_t i = 0; i < num_outputs_; ++i) {
auto &result_summary = output_summary_host_[i];

GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary), output_summary_[i],
sizeof(aicpu::FWKAdapter::ResultSummary), RT_MEMCPY_DEVICE_TO_HOST));
auto shape_data_size = result_summary.shape_data_size;
void *shape_buffer = nullptr;
GE_MAKE_GUARD_RTMEM(shape_buffer);
GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
out_shape_hbm.emplace_back(shape_buffer);
}
return SUCCESS;
}

Status AiCpuTask::CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream) {
GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs, out_shape_hbm));

GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_KERNEL_DEFAULT, stream));
GE_CHK_RT_RET(rtStreamSynchronize(stream));
return SUCCESS;
}

Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm) {
for (size_t i = 0; i < num_outputs_; ++i) {
const auto &result_summary = output_summary_host_[i];
std::vector<int64_t> shape_dims;
const auto &shape_hbm = out_shape_hbm[i];

uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
GE_CHECK_NOTNULL(shape_addr);
GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm, result_summary.shape_data_size,
RT_MEMCPY_DEVICE_TO_HOST));

for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
shape_dims.emplace_back(shape_addr[dim_idx]);
GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
}

GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
"AiCpuTask update [%zu]th output shape failed.", i);
}
return SUCCESS;
}

Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs,
rtStream_t stream) {
if (num_outputs_ == 0) {
GELOGI("Output num is 0, there is no need to update the output and size.");
return SUCCESS;
}

GELOGI("Update shape and data by result summary begin.");

std::vector<void *> out_shape_hbm;
GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(out_shape_hbm),
"Read ResultSummary and update output shape failed.");

GE_CHK_STATUS_RET(CopyDataToHbm(outputs, out_shape_hbm, stream), "Copy data to output failed.");

GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc, out_shape_hbm), "Update shape by hbm buffer failed.");

GELOGI("Update shape and data by result summary end.");
return SUCCESS;
}

Status AiCpuTask::SetIO(const vector<void *> &inputs, vector<void *> &outputs) {
vector<uint64_t> io_addrs;
io_addrs.reserve(num_inputs_ + num_outputs_);
for (size_t i = 0; i < num_inputs_; ++i) {
GE_CHECK_NOTNULL(inputs[i]);
GELOGD("AiCpuTask input[%zu] addr = %p", i, inputs[i]);
io_addrs.emplace_back(reinterpret_cast<uintptr_t>(inputs[i]));
}

if (unknown_type_ != DEPEND_COMPUTE) {
for (size_t i = 0; i < num_outputs_; ++i) {
GE_CHECK_NOTNULL(outputs[i]);
GELOGD("AiCpuTask output[%zu] addr = %p", i, outputs[i]);
io_addrs.emplace_back(reinterpret_cast<uintptr_t>(outputs[i]));
}
} else {
for (size_t i = 0; i < num_outputs_; ++i) {
void *summary_addr = output_summary_[i];
io_addrs.emplace_back(reinterpret_cast<uintptr_t>(summary_addr));
}
}

if (!io_addrs.empty()) {
auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(io_addr_));
GE_CHECK_NOTNULL(dst_io_addr);
GE_CHK_RT_RET(rtMemcpy(dst_io_addr, sizeof(uint64_t) * io_addrs.size(), &io_addrs[0],
sizeof(uint64_t) * io_addrs.size(), RT_MEMCPY_HOST_TO_DEVICE));
}
return SUCCESS;
}

Status AiCpuTask::InitForSummaryAndCopy() {
if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
GELOGI("Unknown_type is %d, output num is %d.", unknown_type_, num_outputs_);
return SUCCESS;
}

output_summary_.resize(num_outputs_);
constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
for (size_t i = 0; i < num_outputs_; ++i) {
GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
}
output_summary_host_.resize(num_outputs_);

const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);

GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));

GE_CHK_RT_RET(rtMalloc(&copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM));

std::vector<uint64_t> copy_io_addr;
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));

const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();

GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));

GE_CHK_RT_RET(
rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size, copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) {
if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL),
kernel_def.args_size());
return PARAM_INVALID;
}
GE_CHK_RT_RET(rtMalloc(&copy_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_, kernel_def.task_info_size(), kernel_def.task_info().data(),
kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));

STR_FWK_OP_KERNEL aicpu_task = {0};
auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL), kernel_def.args().data(), kernel_def.args().size());
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}

aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast<uintptr_t>(copy_ioaddr_dev_);
aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast<uintptr_t>(copy_workspace_buf_);
aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;

GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), &aicpu_task, sizeof(STR_FWK_OP_KERNEL),
RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs,
rtStream_t stream) {
GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc));
GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs));
GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
GE_CHK_RT_RET(rtStreamSynchronize(stream));

if (unknown_type_ == DEPEND_SHAPE_RANGE) {
GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
} else if (unknown_type_ == DEPEND_COMPUTE) {
GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, outputs, stream));
}

return SUCCESS;
}

void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
args_ = std::move(args);
arg_size_ = arg_size;
@@ -291,11 +655,34 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
}
GELOGD("Invoke rtCpuKernelLaunch succeeded");

auto status = OpenDump(args_.get(), op_desc_, stream);
if (status != SUCCESS) {
GELOGE(status, "Open dump failed in aicpucc single op");
return status;
return SUCCESS;
}

Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs,
rtStream_t stream) {
GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED,
"AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.", unknown_type_);

GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc));

size_t arg_index = 0;
auto *task_io_addr = reinterpret_cast<uintptr_t *>(io_addr_);
GE_CHECK_NOTNULL(task_io_addr);
for (auto &input : inputs) {
task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(input);
}
for (auto &output : outputs) {
task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(output);
}

GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
GE_CHK_RT_RET(rtStreamSynchronize(stream));

if (unknown_type_ == DEPEND_SHAPE_RANGE) {
GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
}

return SUCCESS;
}
} // namespace ge
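
For DEPEND_COMPUTE outputs, the AiCpuTask flow above reads one ResultSummary per output, allocates HBM shape buffers, launches the paired copy task to move raw data and shapes, then rebuilds each output shape from the copied dims. The helper below shows only the last, host-side step in isolation; ResultSummary is a simplified stand-in for aicpu::FWKAdapter::ResultSummary and the device copies are assumed to have already happened.

#include <cstdint>
#include <vector>

struct ResultSummary {            // simplified stand-in, see UpdateShapeByHbmBuffer above
  std::uint64_t shape_data_size;  // bytes of int64_t dims written by the kernel
  std::uint64_t raw_data_size;    // bytes of output data
};

// Rebuild an output's dims from the shape buffer that was copied back to host.
std::vector<std::int64_t> DimsFromShapeBuffer(const ResultSummary &summary,
                                              const std::int64_t *shape_host) {
  const std::uint64_t dim_num = summary.shape_data_size / sizeof(std::int64_t);
  return std::vector<std::int64_t>(shape_host, shape_host + dim_num);
}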

+ 68
- 6
ge/single_op/task/op_task.h View File

@@ -27,6 +27,9 @@
#include "graph/op_kernel_bin.h"
#include "runtime/stream.h"
#include "graph/node.h"
#include "cce/aicpu_engine_struct.h"
#include "hybrid/node_executor/aicpu/aicpu_ext_info.h"
#include "init/gelib.h"

namespace ge {
enum OpTaskType {
@@ -52,14 +55,20 @@ class OpTask {
virtual const void *GetIOAddr() const = 0;
const vector<int64_t> &GetWorkspaceSizes() const;
void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
const OpDescPtr &GetOpdesc() const { return op_desc_; }
Status OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream);
virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) {
return UNSUPPORTED;
}

private:
std::vector<int64_t> workspace_sizes_;

protected:
Status OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream);
DumpProperties dump_properties_;
DumpOp dump_op_;
OpDescPtr op_desc_;
};

class TbeOpTask : public OpTask {
@@ -97,10 +106,30 @@ class TbeOpTask : public OpTask {
uint32_t max_tiling_size_ = 0;
std::string tiling_data_;
NodePtr node_;
OpDescPtr op_desc_;
};

class AiCpuTask : public OpTask {
class AiCpuBaseTask : public OpTask {
public:
AiCpuBaseTask() = default;
~AiCpuBaseTask() override;
const UnknowShapeOpType GetUnknownType() const { return unknown_type_; }

protected:
Status SetExtInfoAndType(const std::string &kernel_ext_info);

Status UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc, std::vector<GeTensorDesc> &output_desc);
Status UpdateOutputShape(vector<GeTensorDesc> &output_desc);
Status UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc);

protected:
size_t num_inputs_ = 0;
size_t num_outputs_ = 0;
UnknowShapeOpType unknown_type_ = DEPEND_IN_SHAPE;
std::unique_ptr<ge::hybrid::AicpuExtInfoHandler> aicpu_ext_handle_;
void *ext_info_addr_dev_ = nullptr;
};

class AiCpuTask : public AiCpuBaseTask {
public:
AiCpuTask() = default;
~AiCpuTask() override;
@@ -109,7 +138,24 @@ class AiCpuTask : public OpTask {
OpTaskType GetOpTaskType() override { return OP_TASK_AICPU; }
const void *GetIOAddr() const override;

Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override;
Status SetMemCopyTask(const domi::KernelExDef &kernel_def);

private:
Status SetIO(const vector<void *> &inputs, vector<void *> &outputs);

// for copy task.
Status InitForSummaryAndCopy();
Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs,
rtStream_t stream);
Status ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm);

Status CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream);
Status PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm);

Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm);

friend class AiCpuTaskBuilder;
void *workspace_addr_ = nullptr;
std::string task_info_;
@@ -117,10 +163,24 @@ class AiCpuTask : public OpTask {
size_t arg_size_ = 0;
std::string op_type_;
void *io_addr_ = nullptr;
OpDescPtr op_desc_;

bool dynamic_flag_ = false;
// for copy task
void *copy_task_args_buf_;
void *copy_workspace_buf_;

std::vector<void *> output_summary_;
std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;

void *copy_ioaddr_dev_;

void *copy_input_release_flag_dev_;
void *copy_input_data_size_dev_;
void *copy_input_src_dev_;
void *copy_input_dst_dev_;
};

class AiCpuCCTask : public OpTask {
class AiCpuCCTask : public AiCpuBaseTask {
public:
AiCpuCCTask() = default;
~AiCpuCCTask() override;
@@ -137,6 +197,9 @@ class AiCpuCCTask : public OpTask {
void SetIoAddr(void *io_addr);
size_t GetArgSize() const;

Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override;

private:
friend class AiCpuCCTaskBuilder;
std::string so_name_;
@@ -146,7 +209,6 @@ class AiCpuCCTask : public OpTask {
uint32_t block_dim_ = 1;
void *sm_desc_ = nullptr;
void *io_addr_ = nullptr;
OpDescPtr op_desc_;
};
} // namespace ge



+ 69
- 0
inc/external/ge/ge_prof.h View File

@@ -0,0 +1,69 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_GE_GE_PROF_H_
#define INC_EXTERNAL_GE_GE_PROF_H_

#include <map>
#include <string>
#include <vector>

#include "ge/ge_api_error_codes.h"

namespace ge {
enum ProfDataTypeConfig {
kProfAcl = 0x0001,
kProfTaskTime = 0x0002,
kProfAiCoreMetrics = 0x0004,
kProfAicpuTrace = 0x0008,
kProfModelExecute = 0x0010,
kProfRuntimeApi = 0x0020,
kProfRuntimeTrace = 0x0040,
kProfScheduleTimeline = 0x0080,
kProfScheduleTrace = 0x0100,
kProfAiVectorCoreMetrics = 0x0200,
kProfSubtaskTime = 0x0400,
kProfTrainingTrace = 0x0800,
kProfHcclTrace = 0x1000,
kProfDataProcess = 0x2000,
kProfTaskTrace = 0x3842,
kProfModelLoad = 0x8000000000000000
};

enum ProfilingAicoreMetrics {
kAicoreArithmaticThroughput = 0,
kAicorePipeline = 1,
kAicoreSynchronization = 2,
kAicoreMemory = 3,
kAicoreInternalMemory = 4,
kAicoreStall = 5,
kAicoreMetricsAll = 255 // only for op_trace
};

typedef struct ProfAicoreEvents ProfAicoreEvents;
typedef struct aclgrphProfConfig aclgrphProfConfig;

Status aclgrphProfInit(const char *profiler_path, uint32_t length);
Status aclgrphProfFinalize();
aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums,
ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events,
uint64_t data_type_config);
Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config);
Status aclgrphProfStart(aclgrphProfConfig *profiler_config);
Status aclgrphProfStop(aclgrphProfConfig *profiler_config);
} // namespace ge

#endif // INC_EXTERNAL_GE_GE_PROF_H_
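
The ProfDataTypeConfig values above are bit flags, so the data_type_config passed to aclgrphProfCreateConfig is built by OR-ing the wanted items together. A hedged usage sketch based only on the declarations in this header (device id 0, the flag selection and kAicoreMetrics choice are illustrative; aclgrphProfInit is assumed to have been called first):

#include "ge/ge_prof.h"

ge::Status StartIllustrativeProfiling() {
  uint32_t device_list[1] = {0};  // profile device 0 only (example value)
  uint64_t data_type_config = ge::kProfTaskTime | ge::kProfAiCoreMetrics | ge::kProfModelExecute;
  ge::aclgrphProfConfig *config =
      ge::aclgrphProfCreateConfig(device_list, 1, ge::kAicoreMemory, nullptr, data_type_config);
  if (config == nullptr) {
    return ge::FAILED;
  }
  return ge::aclgrphProfStart(config);
}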

+ 8
- 3
inc/framework/common/ge_inner_error_codes.h View File

@@ -97,6 +97,7 @@ GE_ERRORNO_COMMON(INTERNAL_ERROR, 4, "Internal errors"); // 1343225
GE_ERRORNO_COMMON(CSEC_ERROR, 5, "Failed to call libc_sec API!"); // 1343225861
GE_ERRORNO_COMMON(TEE_ERROR, 6, "Failed to call tee API!"); // 1343225862
GE_ERRORNO_COMMON(END_OF_SEQUENCE, 7, "End of sequence!"); // 1343225863
GE_ERRORNO_COMMON(PATH_INVALID, 8, "Path is invalid!"); // 1343225864

// Error code for plugin manager
GE_ERRORNO_COMMON(GE_PLGMGR_PATH_INVALID, 30, "Path is invalid!"); // 1343225886
@@ -124,9 +125,13 @@ GE_ERRORNO_CLIENT(GE_CLI_GE_ALREADY_INITIALIZED, 10, "GE is already initialized.
GE_ERRORNO_CLIENT(GE_CLI_GE_NOT_INITIALIZED, 11, "GE is not yet initialized or is finalized."); // 1343229963

// Init module error code definition
GE_ERRORNO_INIT(GE_MULTI_INIT, 0, "Multiple initializations are not supported."); // 1343234048
GE_ERRORNO_INIT(GE_FINALIZE_NOT_INIT, 1, "Finalize is not allowed before initialization."); // 1343234049
GE_ERRORNO_INIT(GE_MULTI_FINALIZE, 2, "Multiple finalizations are not supported."); // 1343234050
GE_ERRORNO_INIT(GE_MULTI_INIT, 0, "Multiple initializations are not supported."); // 1343234048
GE_ERRORNO_INIT(GE_FINALIZE_NOT_INIT, 1, "Finalize is not allowed before initialization."); // 1343234049
GE_ERRORNO_INIT(GE_MULTI_FINALIZE, 2, "Multiple finalizations are not supported."); // 1343234050
GE_ERRORNO_INIT(GE_PROF_MULTI_INIT, 3, "Multiple profiling initializations are not supported."); // 1343234051
GE_ERRORNO_INIT(GE_PROF_NOT_INIT, 4, "Profing initializations have not been done."); // 1343234052
GE_ERRORNO_INIT(GE_PROF_MODE_CONFLICT, 5,
"Profiling command mode which is preferred is running, the api mode will not work."); // 1343234053

// Session module error code definition
GE_ERRORNO_SESSION(GE_SESS_INIT_FAILED, 0, "Failed to initialize session."); // 1343238144


+ 18
- 0
inc/framework/common/util.h View File

@@ -398,6 +398,24 @@ bool CheckOutputPathValid(const std::string &file_path, const std::string &atc_p
/// @param [out] result
///
bool ValidateStr(const std::string &filePath, const std::string &mode);

///
/// @ingroup domi_common
/// @brief Check whether the file is normal file.
/// @param [in] file_path file path
/// @param [out] result
///
bool IsValidFile(const char *file_path);

///
/// @ingroup domi_common
/// @brief Check path invalid
/// @param [in] path, path to be checked
/// @param [in] length, length of path
/// @return 0 success
/// @return -1 fail
///
Status CheckPath(const char *path, size_t length);
} // namespace ge

#endif // INC_FRAMEWORK_COMMON_UTIL_H_

+ 7
- 7
metadef/graph/CMakeLists.txt View File

@@ -18,13 +18,13 @@
set(CMAKE_CXX_FLAGS "-Wno-unused-variable ${CMAKE_CXX_FLAGS}")
# add all proto files, generate corresponding .h and .cc files
file(GLOB_RECURSE PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"../proto/om.proto"
"../proto/ge_ir.proto"
"../proto/insert_op.proto"
"../proto/task.proto"
"../proto/fwk_adaper.proto"
"../proto/op_mapping_info.proto"
"../proto/dump_task.proto"
"../../proto/om.proto"
"../../proto/ge_ir.proto"
"../../proto/insert_op.proto"
"../../proto/task.proto"
"../../proto/fwk_adaper.proto"
"../../proto/op_mapping_info.proto"
"../../proto/dump_task.proto"
)

file(GLOB_RECURSE ONNX_PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}


+ 1
- 1
metadef/graph/compute_graph.cc View File

@@ -658,7 +658,7 @@ ComputeGraph::UpdateOutputMapping(const std::map<uint32_t, uint32_t> &output_map
return GRAPH_FAILED;
}

size_t num = op_desc->GetInputsSize();
size_t num = op_desc->GetAllInputsSize();
for (size_t i = 0; i < num; i++) {
GeTensorDesc tensor = op_desc->GetInputDesc(i);
uint32_t cur_index = 0;


+ 6
- 3
metadef/graph/format_refiner.cc View File

@@ -149,9 +149,10 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std
// consider special node save process
// get all input desc format
bool node_is_all_nd = false;
auto input_size = static_cast<uint32_t>(op_desc->GetInputsSize());
auto input_size = static_cast<uint32_t>(op_desc->GetAllInputsSize());
for (uint32_t i = 0; i < input_size; i++) {
// Operator pre-set format but not origin format
GE_IF_BOOL_EXEC(op_desc->MutableInputDesc(i) == nullptr, continue);
auto input_format = op_desc->MutableInputDesc(i)->GetFormat();
// Pre-save data node (only main graph data) and default infer fail
if (node_ptr->GetType() == DATA) {
@@ -164,6 +165,7 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std
// Get all output desc format
auto output_size = static_cast<uint32_t>(op_desc->GetOutputsSize());
for (uint32_t i = 0; i < output_size; i++) {
GE_IF_BOOL_EXEC(op_desc->MutableOutputDesc(i) == nullptr, continue);
auto output_format = op_desc->MutableOutputDesc(i)->GetFormat();
if (output_format != FORMAT_ND && output_format != FORMAT_RESERVED) {
node_is_all_nd = true;
@@ -222,8 +224,9 @@ graphStatus FormatRefiner::BackInferProcess(std::deque<ge::NodePtr> &nodes, ge::
for (const auto &in_anchor : node->GetAllInDataAnchors()) {
GELOGD("Node is [%s] [B]", (node->GetName()).c_str());
auto in_data_anchor_idx = in_anchor->GetIdx();
auto to_be_set_format =
node->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_data_anchor_idx))->GetOriginFormat();
auto input_desc = node->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_data_anchor_idx));
GE_IF_BOOL_EXEC(input_desc == nullptr, continue);
auto to_be_set_format = input_desc->GetOriginFormat();
if (to_be_set_format == FORMAT_ND) {
GELOGD("Node [%s] [B], format is ND", (node->GetName()).c_str());
continue;


+ 1
- 0
metadef/graph/ge_attr_define.cc View File

@@ -123,6 +123,7 @@ const std::string ATTR_NAME_AIPP_OUTPUTS = "_aipp_outputs";

const std::string ATTR_NAME_INPUT_DIMS = "input_dims";

const std::string ATTR_NAME_GRAPH_HAS_BEEN_ADDED = "_graph_has_been_added";
const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";
const std::string ATTR_NAME_PARENT_GRAPH_NAME = "_parent_graph_name";



+ 34
- 18
metadef/graph/node.cc View File

@@ -68,7 +68,7 @@ graphStatus Node::Init() {
return GRAPH_SUCCESS;
}
GE_CHK_BOOL_EXEC(op_ != nullptr, return GRAPH_FAILED, "original OpDesc is nullptr");
size_t size = op_->GetInputsSize();
size_t size = op_->GetAllInputsSize();
for (size_t i = 0; i < size; i++) {
std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), i);
if (anchor == nullptr) {
@@ -305,13 +305,19 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::AddLinkFrom(con
GELOGE(GRAPH_FAILED, "add input desc failed.");
return GRAPH_FAILED;
}
std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size());
return GRAPH_FAILED;

if (index < GetAllInDataAnchors().size()) {
(void)out_anchors.at(0)->LinkTo(in_data_anchors_[index]);
} else {
std::shared_ptr<InDataAnchor> anchor =
ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size());
return GRAPH_FAILED;
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());

return GRAPH_SUCCESS;
}
@@ -347,20 +353,30 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::AddLinkFrom(con
}

GE_CHECK_NOTNULL(op_);
auto op_desc = input_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);

if (op_->AddInputDesc(name, op_desc->GetOutputDesc(0)) != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "add input desc failed.");
return GRAPH_FAILED;
auto input_op_desc = input_node->GetOpDesc();
GE_CHECK_NOTNULL(input_op_desc);
auto index = op_->GetInputIndexByName(name);
if (index != -1) {
if (index >= static_cast<int>(in_data_anchors_.size())) {
GELOGE(GRAPH_FAILED, "op %s get input name %s 's index %d is illegal.", op_->GetName().c_str(), name.c_str(),
index);
return GRAPH_FAILED;
}
(void)out_anchors.at(0)->LinkTo(in_data_anchors_[index]);
} else {
std::shared_ptr<InDataAnchor> anchor =
ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "in_data_anchors_size is:%zu, malloc shared_ptr failed.", in_data_anchors_.size());
return GRAPH_FAILED;
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());
}
std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size());
if (op_->AddInputDesc(name, input_op_desc->GetOutputDesc(0)) != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "add input desc failed.");
return GRAPH_FAILED;
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());

return GRAPH_SUCCESS;
}


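Node::AddLinkFrom above now reuses an existing in-data anchor when the target index (or named input) is already present, and only appends a new anchor otherwise, so relinking the same input no longer grows the anchor list. A simplified standalone sketch of that reuse-or-append pattern:

#include <memory>
#include <vector>

struct Anchor { size_t index; };

// Reuse the anchor at `index` when it already exists; otherwise append a new
// one at the back, as the updated AddLinkFrom does.
Anchor *GetOrCreateAnchor(std::vector<std::unique_ptr<Anchor>> &anchors, size_t index) {
  if (index < anchors.size()) {
    return anchors[index].get();
  }
  anchors.push_back(std::unique_ptr<Anchor>(new Anchor{anchors.size()}));
  return anchors.back().get();
}
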
+ 1
- 1
metadef/graph/op_desc.cc View File

@@ -675,7 +675,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr OpDesc::GetI
return nullptr;
}
if (inputs_desc_[index]->IsValid() != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "inputsDesc[%u] is InValid", index);
GELOGW("inputsDesc[%u] is InValid", index);
return nullptr;
} else {
return inputs_desc_[static_cast<size_t>(index)];


+ 14
- 8
metadef/graph/operator.cc View File

@@ -1504,7 +1504,9 @@ class GraphBuilderImpl {
GE_CHK_BOOL_EXEC(dst_anchor != nullptr, return GRAPH_FAILED, "GetInDataAnchor failed.");

auto ret = GraphUtils::AddEdge(src_anchor, dst_anchor);
GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return GRAPH_FAILED, "AddEdge failed.");
GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return GRAPH_FAILED,
"from node[%s][%d] to node[%s][%d]AddEdge failed.", src_node_ptr->GetName().c_str(),
src_anchor->GetIdx(), dst_node_info->second->GetName().c_str(), dst_anchor->GetIdx());
}
}
auto out_control_anchor = src_node_ptr->GetOutControlAnchor();
@@ -1536,19 +1538,23 @@ inline bool HasSameNameNode(const ComputeGraphPtr &compute_graph) {
for (const auto &graph : compute_graph->GetAllSubgraphs()) {
std::set<string> node_names;
for (auto const &node : graph->GetDirectNode()) {
node_names.insert(node->GetName());
}
if (node_names.size() != graph->GetDirectNodesSize()) {
return true;
auto result = node_names.insert(node->GetName());
if (!result.second) {
GELOGE(GRAPH_FAILED, "graph %s has same name node%s", graph->GetName().c_str(), node->GetName().c_str());
return true;
}
}
}

std::set<string> node_names;
for (auto const &node : compute_graph->GetDirectNode()) {
node_names.insert(node->GetName());
auto result = node_names.insert(node->GetName());
if (!result.second) {
GELOGE(GRAPH_FAILED, "graph %s has same name node%s", compute_graph->GetName().c_str(), node->GetName().c_str());
return true;
}
}
return node_names.size() != compute_graph->GetDirectNodesSize();
return false;
}

ComputeGraphPtr GraphUtils::CreateGraphFromOperator(const string &name, const vector<ge::Operator> &inputs) {


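HasSameNameNode above now reports a duplicate as soon as std::set::insert fails, instead of comparing set size with node count afterwards, which also lets it log the offending name. A standalone sketch of that early-exit check:

#include <set>
#include <string>
#include <vector>

// set::insert returns {iterator, false} for a repeated name, so the first
// duplicate can be reported immediately without a second pass.
bool HasDuplicate(const std::vector<std::string> &names) {
  std::set<std::string> seen;
  for (const auto &name : names) {
    if (!seen.insert(name).second) {
      return true;
    }
  }
  return false;
}
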
+ 7
- 1
metadef/graph/shape_refiner.cc View File

@@ -51,6 +51,9 @@ graphStatus ReverseBrushWhileBodySubGraph(const ConstNodePtr &node) {
for (const auto &node_sub : sub_graph_body->GetAllNodes()) {
for (size_t i = 0; i < node_sub->GetAllInDataAnchorsSize(); i++) {
auto input_desc = node_sub->GetOpDesc()->MutableInputDesc(i);
GE_IF_BOOL_EXEC(input_desc == nullptr,
GELOGW("Get null input by index %zu from node %s ", i, node_sub->GetName().c_str());
continue);
(void)input_desc->SetUnknownDimNumShape();
}
for (size_t i = 0; i < node_sub->GetAllOutDataAnchorsSize(); i++) {
@@ -376,10 +379,13 @@ graphStatus UpdateOpInputDesc(const ConstNodePtr &node_ptr) {
continue;
}
int peer_out_idx = peer_out_data_anchor->GetIdx();
auto in_desc = node_ptr->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_idx));
auto peer_out_desc = peer_out_data_node->GetOpDesc()->MutableOutputDesc(static_cast<uint32_t>(peer_out_idx));

// check shape and dtype continuity. do not stop process
auto in_desc = node_ptr->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_idx));
if (in_desc == nullptr) {
continue;
}
auto in_shape = in_desc->GetShape().GetDims();
auto in_dtype = in_desc->GetDataType();
auto peer_out_shape = peer_out_desc->GetShape().GetDims();


+ 14
- 3
metadef/graph/utils/ge_ir_utils.cc View File

@@ -264,11 +264,11 @@ void OnnxUtils::AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const
return;
}
// Input describes
auto size_in = op_desc->GetInputsSize();
auto size_in = op_desc->GetAllInputsSize();
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, "input_desc_nums", &size_in);
if (size_in > 0) {
for (uint32_t i = 0; i < size_in; i++) {
auto input_desc = op_desc->GetInputDescPtr(i);
auto input_desc = op_desc->GetInputDescPtrDfault(i);
if (input_desc != nullptr) {
auto data_type = TypeUtils::DataTypeToSerialString(input_desc->GetDataType());
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, "input_desc_dtype:" + std::to_string(i),
@@ -480,9 +480,20 @@ void OnnxUtils::AddAttrProtoFromNodeMembers(const NodePtr &node, onnx::NodeProto
if (!recv_list.empty()) {
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "recv_event_id_list", &recv_list);
}
// 2.Attributes added from node's op_(message OpDef)
auto op_desc = node->op_;
if (op_desc != nullptr) {
// for input_name_idx_ in opdesc
auto input_name_2_indexs = op_desc->GetAllInputName();
::google::protobuf::RepeatedPtrField<::std::string> input_names;
::google::protobuf::RepeatedField<::google::protobuf::int64> input_indexes;
for (const auto &input_name_2_index : input_name_2_indexs) {
std::string input_name = input_name_2_index.first;
input_names.Add(std::move(input_name));
input_indexes.Add(input_name_2_index.second);
}
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRINGS, "_input_name_key", input_names);
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "_input_name_value", input_indexes);
// 2.Attributes added from node's op_(message OpDef)
// Input and out describes
AddAttrProtoForOpInAndOutDesc(node_proto, op_desc);
// Others


+ 1
- 2
metadef/graph/utils/graph_utils.cc View File

@@ -1470,8 +1470,7 @@ graphStatus GraphUtils::CopyTensorAttrs(const OpDescPtr &dst_desc, const NodePtr
for (uint32_t i = 0; i < src_node->GetAllInDataAnchorsSize(); ++i) {
auto input_desc = dst_desc->MutableInputDesc(i);
if (input_desc == nullptr) {
GELOGE(GRAPH_FAILED, "Param dst node not valid");
return GRAPH_FAILED;
continue;
}
input_desc->CopyAttrsFrom(src_desc->GetInputDesc(i));
}


+ 0
- 1
metadef/graph/utils/op_desc_utils.cc View File

@@ -513,7 +513,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector<GeTensorPtr> OpDescUtils::
}
return MutableWeights(*node);
}

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
OpDescUtils::SetWeights(ge::Node &node, const vector<ge::GeTensorPtr> &weights) {
GE_CHK_BOOL_EXEC(node.GetOpDesc() != nullptr, return GRAPH_PARAM_INVALID, "node.GetOpDesc is nullptr!");


+ 1
- 0
metadef/inc/graph/debug/ge_attr_define.h View File

@@ -142,6 +142,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_DIMS;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_GRAPH_HAS_BEEN_ADDED;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SESSION_GRAPH_ID;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PARENT_GRAPH_NAME;



+ 375
- 0
src/ge/client/ge_prof.cc View File

@@ -0,0 +1,375 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "ge/ge_prof.h"
#include "ge/ge_api.h"
#include "init/gelib.h"
#include "common/debug/log.h"
#include "framework/common/debug/ge_log.h"
#include "common/profiling/profiling_manager.h"
#include "graph/load/graph_loader.h"
#include "toolchain/prof_acl_api.h"

using std::map;
using std::string;
using std::vector;

namespace {
const uint32_t kMaxDeviceNum = 64;
const std::string PROFILING_INIT = "prof_init";
const std::string PROFILING_FINALIZE = "prof_finalize";
const std::string PROFILING_START = "prof_start";
const std::string PROFILING_STOP = "prof_stop";
const std::string DEVICES_NUMS = "devNums";
const std::string DEVICE_ID_LIST = "devIdList";
const std::string AICORE_METRICS = "aicoreMetrics";

const std::map<ge::ProfilingAicoreMetrics, std::string> kProfAicoreMetricsToString = {
{ge::kAicoreArithmaticThroughput, "AICORE_ARITHMATIC_THROUGHPUT"},
{ge::kAicorePipeline, "AICORE_PIPELINE"},
{ge::kAicoreSynchronization, "AICORE_SYNCHRONIZATION"},
{ge::kAicoreMemory, "AICORE_MEMORY"},
{ge::kAicoreInternalMemory, "AICORE_INTERNAL_MEMORY"},
{ge::kAicoreStall, "AICORE_STALL"},
{ge::kAicoreMetricsAll, "AICORE_METRICS_ALL"}};

const std::map<uint64_t, uint64_t> kDataTypeConfigMapping = {{ge::kProfAcl, PROF_ACL_API},
{ge::kProfTaskTime, PROF_TASK_TIME},
{ge::kProfAiCoreMetrics, PROF_AICORE_METRICS},
{ge::kProfAicpuTrace, PROF_AICPU_TRACE},
{ge::kProfModelExecute, PROF_MODEL_EXECUTE},
{ge::kProfRuntimeApi, PROF_RUNTIME_API},
{ge::kProfRuntimeTrace, PROF_RUNTIME_TRACE},
{ge::kProfScheduleTimeline, PROF_SCHEDULE_TIMELINE},
{ge::kProfScheduleTrace, PROF_SCHEDULE_TRACE},
{ge::kProfAiVectorCoreMetrics, PROF_AIVECTORCORE_METRICS},
{ge::kProfSubtaskTime, PROF_SUBTASK_TIME},
{ge::kProfTrainingTrace, PROF_TRAINING_TRACE},
{ge::kProfHcclTrace, PROF_HCCL_TRACE},
{ge::kProfDataProcess, PROF_DATA_PROCESS},
{ge::kProfTaskTrace, PROF_TASK_TRACE},
{ge::kProfModelLoad, PROF_MODEL_LOAD}};
} // namespace

static bool g_graph_prof_init_ = false;
static std::mutex g_prof_mutex_;

namespace ge {
struct aclgrphProfConfig {
ProfConfig config;
};

Status aclgrphProfInit(const char *profiler_path, uint32_t length) {
GELOGT(TRACE_INIT, "Graph prof init start");

std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}

std::lock_guard<std::mutex> lock(g_prof_mutex_);
if (g_graph_prof_init_) {
GELOGW("Multi graph profiling initializations.");
return GE_PROF_MULTI_INIT;
}

Status ret = CheckPath(profiler_path, length);
if (ret != SUCCESS) {
GELOGE(ret, "Profiling config path is invalid.");
return ret;
}
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof init failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}

ret = ProfInit(profiler_path);
if (ret != SUCCESS) {
GELOGE(ret, "ProfInit init fail");
return ret;
}

GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_INIT;
command.module_index = kProfModelLoad | kProfTrainingTrace;
ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command %s failed, config = %s", PROFILING_INIT.c_str(), profiler_path);
return ret;
}
if (!g_graph_prof_init_) {
g_graph_prof_init_ = true;
GELOGI("Profiling init successfully.");
}

GELOGI("Successfully execute GraphProfInit.");
return SUCCESS;
}

Status aclgrphProfFinalize() {
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}
std::lock_guard<std::mutex> lock(g_prof_mutex_);
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}

if (!g_graph_prof_init_) {
GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
return GE_PROF_NOT_INIT;
}
GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_FINALIZE;
Status ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command %s failed.", PROFILING_FINALIZE.c_str());
return ret;
}

ret = ProfFinalize();
if (ret != SUCCESS) {
GELOGE(ret, "Finalize profiling failed, result = %d", ret);
}

if (ret == SUCCESS) {
g_graph_prof_init_ = false;
GELOGI("Successfully execute GraphProfFinalize.");
}
return ret;
}

bool TransProfConfigToParam(const aclgrphProfConfig *profiler_config, vector<string> &prof_config_params) {
prof_config_params.clear();
prof_config_params.emplace_back(DEVICES_NUMS);
prof_config_params.emplace_back(std::to_string(profiler_config->config.devNums));
prof_config_params.emplace_back(DEVICE_ID_LIST);
std::string devID = "";
if (profiler_config->config.devNums == 0) {
GELOGW("The device num is invalid.");
return false;
}
for (uint32_t i = 0; i < profiler_config->config.devNums; i++) {
devID.append(std::to_string(profiler_config->config.devIdList[i]));
if (i != profiler_config->config.devNums - 1) {
devID.append(",");
}
}

prof_config_params.push_back(devID);
prof_config_params.push_back(AICORE_METRICS);
auto iter =
kProfAicoreMetricsToString.find(static_cast<ProfilingAicoreMetrics>(profiler_config->config.aicoreMetrics));
if (iter == kProfAicoreMetricsToString.end()) {
GELOGW("The prof aicore metrics is invalid.");
return false;
}
prof_config_params.push_back(iter->second);
return true;
}
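As a worked example of the flattening above, a config with devNums = 2, devIdList = {0, 1} and aicoreMetrics = kAicorePipeline produces the parameter vector {"devNums", "2", "devIdList", "0,1", "aicoreMetrics", "AICORE_PIPELINE"}, which is later handed to CommandHandle as cmd_params.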

bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) {
if (deviceid_list == nullptr) {
GELOGE(PARAM_INVALID, "deviceIdList is nullptr");
return false;
}
if (device_nums == 0 || device_nums > kMaxDeviceNum) {
GELOGE(PARAM_INVALID, "The device nums is invalid.");
return false;
}

// real device num
int32_t dev_count = 0;
rtError_t rt_err = rtGetDeviceCount(&dev_count);
if (rt_err != RT_ERROR_NONE) {
GELOGE(INTERNAL_ERROR, "Get the Device count fail.");
return false;
}

if (device_nums > static_cast<uint32_t>(dev_count)) {
GELOGE(PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count);
return false;
}

std::unordered_set<uint32_t> record;
for (size_t i = 0; i < device_nums; ++i) {
uint32_t dev_id = deviceid_list[i];
if (dev_id >= static_cast<uint32_t>(dev_count)) {
GELOGE(PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count);
return false;
}
if (record.count(dev_id) > 0) {
GELOGE(PARAM_INVALID, "Device id %u is duplicatedly set", dev_id);
return false;
}
record.insert(dev_id);
}
return true;
}

aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums,
ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events,
uint64_t data_type_config) {
if (!isProfConfigValid(deviceid_list, device_nums)) {
return nullptr;
}
aclgrphProfConfig *config = new (std::nothrow) aclgrphProfConfig();
if (config == nullptr) {
GELOGE(INTERNAL_ERROR, "new aclgrphProfConfig fail");
return nullptr;
}
config->config.devNums = device_nums;
if (memcpy_s(config->config.devIdList, sizeof(config->config.devIdList), deviceid_list,
device_nums * sizeof(uint32_t)) != EOK) {
GELOGE(INTERNAL_ERROR, "copy devID failed. size = %u", device_nums);
delete config;
return nullptr;
}

config->config.aicoreMetrics = static_cast<ProfAicoreMetrics>(aicore_metrics);
uint64_t data_type = 0;
for (auto &iter : kDataTypeConfigMapping) {
if ((iter.first & data_type_config) == iter.first) {
data_type |= iter.second;
}
}
config->config.dataTypeConfig = data_type;
GELOGI("Successfully create prof config.");
return config;
}
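As a concrete example of the mask translation above, passing data_type_config = kProfTaskTime | kProfTrainingTrace carries the matching PROF_TASK_TIME and PROF_TRAINING_TRACE bits into dataTypeConfig; requested bits that have no entry in kDataTypeConfigMapping are simply dropped.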

Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config) {
if (profiler_config == nullptr) {
GELOGE(PARAM_INVALID, "destroy profilerConfig failed, profilerConfig must not be nullptr");
return PARAM_INVALID;
}

delete profiler_config;
GELOGI("Successfully destroy prof config.");
return SUCCESS;
}

Status aclgrphProfStart(aclgrphProfConfig *profiler_config) {
if (profiler_config == nullptr) {
GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
return FAILED;
}
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}

std::lock_guard<std::mutex> lock(g_prof_mutex_);
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}
if (!g_graph_prof_init_) {
GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
return GE_PROF_NOT_INIT;
}

Status ret = ProfStartProfiling(&profiler_config->config);
if (ret != SUCCESS) {
GELOGE(ret, "Start profiling failed, prof result = %d", ret);
return FAILED;
}

std::vector<string> prof_params;
if (!TransProfConfigToParam(profiler_config, prof_params)) {
GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
return PARAM_INVALID;
}

GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_START;
command.cmd_params = prof_params;
command.module_index = profiler_config->config.dataTypeConfig;
ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command failed");
return FAILED;
}

GELOGI("Successfully execute GraphProfStartProfiling.");

return SUCCESS;
}

Status aclgrphProfStop(aclgrphProfConfig *profiler_config) {
if (profiler_config == nullptr) {
GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
return FAILED;
}
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}

std::lock_guard<std::mutex> lock(g_prof_mutex_);
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}
if (!g_graph_prof_init_) {
GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
return GE_PROF_NOT_INIT;
}

Status ret = ProfStopProfiling(&profiler_config->config);
if (ret != SUCCESS) {
GELOGE(ret, "Stop profiling failed, prof result = %d", ret);
return ret;
}

std::vector<string> prof_params;
if (!TransProfConfigToParam(profiler_config, prof_params)) {
GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
return PARAM_INVALID;
}

GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_STOP;
command.cmd_params = prof_params;
command.module_index = profiler_config->config.dataTypeConfig;
ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command failed");
return FAILED;
}

GELOGI("Successfully execute GraphProfStopProfiling.");
return SUCCESS;
}
} // namespace ge
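Taken together, the new file exposes a create-config / start / stop lifecycle for graph profiling. A minimal usage sketch, assuming GE itself has already been initialized beforehand; the output path and the single-device list below are illustrative placeholders, not part of this change:

#include <cstring>

#include "ge/ge_prof.h"

ge::Status RunWithGraphProfiling() {
  const char prof_path[] = "./prof_out";  // illustrative result directory
  ge::Status ret = ge::aclgrphProfInit(prof_path, static_cast<uint32_t>(std::strlen(prof_path)));
  if (ret != ge::SUCCESS) {
    return ret;
  }

  uint32_t dev_ids[] = {0};  // profile device 0 only
  ge::aclgrphProfConfig *cfg = ge::aclgrphProfCreateConfig(
      dev_ids, 1U, ge::kAicorePipeline, nullptr,
      ge::kProfTaskTime | ge::kProfTrainingTrace);
  if (cfg == nullptr) {
    (void)ge::aclgrphProfFinalize();
    return ge::FAILED;
  }

  ret = ge::aclgrphProfStart(cfg);
  if (ret == ge::SUCCESS) {
    // ... build, load and run graphs here ...
    (void)ge::aclgrphProfStop(cfg);
  }

  (void)ge::aclgrphProfDestroyConfig(cfg);
  return ge::aclgrphProfFinalize();
}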

+ 10
- 8
third_party/fwkacllib/inc/ops/aipp.h

@@ -25,14 +25,16 @@

namespace ge {
/**
*@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC), image normalization (by subtracting the mean value or multiplying a factor), image cropping (by specifying the crop start and cropping the image to the size required by the neural network), and much more.
*@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC),
image normalization (by subtracting the mean value or multiplying a factor), image cropping
(by specifying the crop start and cropping the image to the size required by the neural network), and much more. \n

*@par Inputs:
*@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer.
*@li params: Dynamic AIPP configuration parameters of type uint8.
*@li params: Dynamic AIPP configuration parameters of type uint8. \n

*@par Attributes:
*aipp_config_path: A required string, specifying the path of the AIPP configuration file
*aipp_config_path: A required string, specifying the path of the AIPP configuration file. \n

*@par Outputs:
*features: The AIPP-processed output tensor of type float16 or uint8.
@@ -47,17 +49,17 @@ REG_OP(Aipp)
.OP_END_FACTORY_REG(Aipp)

/**
*@brief Performs this op is for dynamic aipp.If you set aipp-mode to dynamic \n
in aipp config file, framework will auto add one input node to graph at last.
*@brief Performs this op is for dynamic aipp.If you set aipp-mode to dynamic
in aipp config file, framework will auto add one input node to graph at last. \n

*@par Inputs:
*data: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer.
*data: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer. \n

*@par Attributes:
*index: specify aipp serial num
*index: specify aipp serial num \n

*@par Outputs:
*out: The AIPP-processed output tensor of all types.
*out: The AIPP-processed output tensor of all types. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator AippData.


+ 227
- 210
third_party/fwkacllib/inc/ops/array_ops.h
File diff suppressed because it is too large


+ 44
- 44
third_party/fwkacllib/inc/ops/audio_ops.h

@@ -26,29 +26,29 @@
namespace ge {

/**
*@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of \n
taking the DCT-II of a log-magnitude mel-scale spectrogram.
*@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of
taking the DCT-II of a log-magnitude mel-scale spectrogram . \n

*@par Inputs:
*Input "spectrogram" is a 3D tensor. Input "sample_rate" is a scalar. \n
*@par Inputs:
*Input "spectrogram" is a 3D tensor. Input "sample_rate" is a scalar.
* @li spectrogram: A 3D float tensor.
* @li sample_rate: The MFCC sample rate.
* @li sample_rate: The MFCC sample rate . \n

*@par Attributes:
*@par Attributes:
*@li upper_frequency_limit: The highest frequency for calculation.
*@li lower_frequency_limit: The lowest frequency for calculation.
*@li filterbank_channel_count: Resolution of the Mel bank.
*@li dct_coefficient_count: Number of output channels to produce \n
per time slice.
*@li dct_coefficient_count: Number of output channels to produce
per time slice . \n

*@par Outputs:
*y: A Tensor of type float32.
*@par Outputs:
*y: A Tensor of type float32 . \n

*@attention Constraints: \n
*Mfcc runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*Mfcc runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Mfcc.
*Compatible with the TensorFlow operator Mfcc . \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -64,26 +64,26 @@ REG_OP(Mfcc)
.OP_END_FACTORY_REG(Mfcc)

/**
*@brief Decodes and generates spectrogram using wav float tensor.
*@brief Decodes and generates spectrogram using wav float tensor . \n

*@par Inputs:
*Input "x" is a 2D matrix. \n
* x: A float tensor. Float representation of audio data.
*@par Inputs:
*Input "x" is a 2D matrix.
* x: A float tensor. Float representation of audio data . \n

*@par Attributes:
*@par Attributes:
*@li window_size: Size of the spectrogram window.
*@li stride: Size of the spectrogram stride.
*@li magnitude_squared: If true, uses squared magnitude.
*@li magnitude_squared: If true, uses squared magnitude . \n

*@par Outputs:
*spectrogram: A 3D float Tensor.
*@par Outputs:
*spectrogram: A 3D float Tensor . \n

*@attention Constraints: \n
*AudioSpectrogram runs on the Ascend AI CPU, which delivers \n
poor performance.
*@attention Constraints:
*AudioSpectrogram runs on the Ascend AI CPU, which delivers
poor performance . \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator AudioSpectrogram.
*Compatible with the TensorFlow operator AudioSpectrogram . \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -98,26 +98,26 @@ REG_OP(AudioSpectrogram)
.OP_END_FACTORY_REG(AudioSpectrogram)

/**
*@brief Decodes a 16-bit WAV file into a float tensor.
*@brief Decodes a 16-bit WAV file into a float tensor . \n

*@par Inputs:
*contents: A Tensor of type string. The WAV-encoded audio, usually from a file.
*@par Inputs:
*contents: A Tensor of type string. The WAV-encoded audio, usually from a file . \n

*@par Attributes:
*@li desired_channels: An optional int. Defaults to "-1". \n
*@par Attributes:
*@li desired_channels: An optional int. Defaults to "-1".
Number of sample channels wanted.
*@li desired_samples: An optional int. Defaults to "-1". \n
Length of audio requested.
*@li desired_samples: An optional int. Defaults to "-1".
Length of audio requested . \n

*@par Outputs:
*@par Outputs:
*@li *audio: A Tensor of type float32.
*@li *sample_rate: A Tensor of type int32.
*@li *sample_rate: A Tensor of type int32 . \n

*@attention Constraints: \n
*DecodeWav runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*DecodeWav runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator DecodeWav.
*Compatible with the TensorFlow operator DecodeWav . \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -132,21 +132,21 @@ REG_OP(DecodeWav)
.OP_END_FACTORY_REG(DecodeWav)

/**
*@brief Encode audio data using the WAV file format.
*@brief Encode audio data using the WAV file format . \n

*@par Inputs:
*Including: \n
*Including:
* @li audio: A Tensor of type DT_FLOAT.
* @li sample_rate: A Tensor of type DT_INT32.
* @li sample_rate: A Tensor of type DT_INT32 . \n

*@par Outputs:
*contents: A Tensor of type DT_STRING.
*contents: A Tensor of type DT_STRING . \n

*@attention Constraints:\n
*EncodeWav runs on the Ascend AI CPU, which delivers poor performance.\n
*@attention Constraints:
*EncodeWav runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with tensorflow Operator EncodeWav.
*Compatible with tensorflow Operator EncodeWav . \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.


+ 41
- 40
third_party/fwkacllib/inc/ops/batch_ops.h

@@ -26,35 +26,36 @@
namespace ge {

/**
*@brief Creates batches of tensors in "x_tensors".
*@brief Creates batches of tensors in "x_tensors" . \n

*@par Inputs:
*Input "x_tensors" is a list or a dictionary of tensors. \n
*x_tensors: The list or dictionary of tensors to enqueue.
*@par Inputs:
*Input "x_tensors" is a list or a dictionary of tensors.
*x_tensors: The list or dictionary of tensors to enqueue .
It's a dynamic input \n

*@par Attributes:
*@li num_batch_threads: The number of threads enqueuing "x_tensors". \n
*@par Attributes:
*@li num_batch_threads: The number of threads enqueuing "x_tensors".
The batching will be nondeterministic if "num_batch_threads" > 1.
*@li max_batch_size: The maximum batch size pulled from the queue.
*@li max_enqueued_batches: The maximum number of batches pulled from the queue.
*@li batch_timeout_micros: The batch processing timeout, in microseconds.
*@li allowed_batch_sizes: The allowed batch size pulled from the queue.
*@li grad_timeout_micros: The gradient batch processing timeout, \n
*@li grad_timeout_micros: The gradient batch processing timeout,
in microseconds.
*@li container: If non-empty, this queue is placed in the given container. \n
*@li container: If non-empty, this queue is placed in the given container.
Otherwise, a default container is used.
*@li shared_name: If set, this queue will be shared under the given name \n
*@li shared_name: If set, this queue will be shared under the given name
across multiple sessions.
*@li batching_queue: The queue resource container.
*@li batching_queue: The queue resource container . \n

*@par Outputs:
*@par Outputs:
*@li y_index: A Tensor. The index of a BatchTensor. Must be in row-major order.
*@li y_id: A Tensor. The ID of a BatchTensor. Must be in row-major order.
*@li y_tensors: A list or dictionary of tensors with \n
the same types as "x_tensors".
*@li y_tensors: A list or dictionary of tensors with
the same types as "x_tensors" . It's a dynamic output. \n

*@attention Constraints: \n
*Batch runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*Batch runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Batch.
@@ -79,26 +80,26 @@ REG_OP(Batch)
.OP_END_FACTORY_REG(Batch)

/**
*@brief Reverses the operation of Batch for a single output Tensor.
*@brief Reverses the operation of Batch for a single output Tensor . \n

*@par Inputs:
*Input "x_tensors" is a list or a dictionary of tensors. \n
*@par Inputs:
*Input "x_tensors" is a list or a dictionary of tensors.
* @li x_tensors: The list or dictionary of tensors to enqueue.
* @li index: The matching "batch_index" obtained from Batch.
* @li id: The "id" scalar emitted by Batch.
* @li id: The "id" scalar emitted by Batch . \n

*@par Attributes:
*@par Attributes:
*@li timeout_micros: The unbatch processing timeout, in microseconds.
*@li container: If non-empty, this queue is placed in the given container. \n
*@li container: If non-empty, this queue is placed in the given container.
Otherwise, a default container is used.
*@li shared_name: If set, this queue will be shared under the given name \n
across multiple sessions.
*@li shared_name: If set, this queue will be shared under the given name
across multiple sessions . \n

*@par Outputs:
*y_tensor: A list or dictionary of tensors with the same types as "x_tensors".
*@par Outputs:
*y_tensor: A list or dictionary of tensors with the same types as "x_tensors" . \n

*@attention Constraints: \n
*Unbatch runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*Unbatch runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Unbatch.
@@ -117,27 +118,27 @@ REG_OP(Unbatch)
.OP_END_FACTORY_REG(Unbatch)

/**
*@brief Acts like Batch but using the given "batch_index" index of batching \n
things as they become available.
*@brief Acts like Batch but using the given "batch_index" index of batching
things as they become available . \n

*@par Inputs:
*Input "x_input" is a list or a dictionary of tensors. \n
*@par Inputs:
*Input "x_input" is a list or a dictionary of tensors.
* @li x_input: The input to the Unbatch operation.
* @li index: The batch_index given to the Unbatch operation.
* @li id: The "id" scalar emitted by Batch.
* @li grad: The downstream gradient.
* @li grad: The downstream gradient . \n

*@par Attributes:
*@li container: If non-empty, this queue is placed in the given container. \n
*@par Attributes:
*@li container: If non-empty, this queue is placed in the given container.
Otherwise, a default container is used.
*@li shared_name: If set, this queue will be shared under the given name \n
across multiple sessions.
*@li shared_name: If set, this queue will be shared under the given name
across multiple sessions . \n

*@par Outputs:
*y_grad: The return value, either an empty tensor or the batched gradient.
*@par Outputs:
*y_grad: The return value, either an empty tensor or the batched gradient . \n

*@attention Constraints: \n
*UnbatchGrad runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*UnbatchGrad runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator UnbatchGrad.


+ 10
- 10
third_party/fwkacllib/inc/ops/bitwise_ops.h

@@ -26,20 +26,20 @@
namespace ge {

/**
*@brief Element-wise computes the bitwise right-shift of x and y.
*@brief Element-wise computes the bitwise right-shift of x and y . \n

*@par Inputs:
*Input "x" is a k-dimensional tensor. Inputs "num_lower" and "num_upper" \n
*@par Inputs:
*Input "x" is a k-dimensional tensor. Inputs "num_lower" and "num_upper"
are 0D scalars.
* @li x: A Tensor. Must be one of the following types: int8, int16, int32, \n
int64, uint8, uint16, uint32, uint64. \n
* @li y: A Tensor. Has the same type as "x". \n
* @li x: A Tensor. Must be one of the following types: int8, int16, int32,
int64, uint8, uint16, uint32, uint64.
* @li y: A Tensor. Has the same type as "x". \n

*@par Outputs:
* z: A Tensor. Has the same type as "x". \n
*@par Outputs:
* z: A Tensor. Has the same type as "x". \n

*@attention Constraints: \n
*Unique runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*Unique runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator RightShift.


+ 14
- 14
third_party/fwkacllib/inc/ops/boosted_trees_ops.h

@@ -26,28 +26,28 @@
namespace ge {

/**
*@brief Bucketizes each feature based on bucket boundaries.
*@brief Bucketizes each feature based on bucket boundaries . \n

*@par Inputs:
*Input "float_values" is a 1D tensor. Input "bucket_boundaries" is \n
a list of 1D tensors.
* @li float_values: A list of rank 1 tensors each containing float \n
*@par Inputs:
*Input "float_values" is a 1D tensor. Input "bucket_boundaries" is
a list of 1D tensors. It's a dynamic input.
* @li float_values: A list of rank 1 tensors each containing float
values for a single feature.
* @li bucket_boundaries: A list of rank 1 tensors each containing \n
the bucket boundaries for a single feature.
* @li bucket_boundaries: A list of rank 1 tensors each containing
the bucket boundaries for a single feature . It's a dynamic input. \n

*@par Attributes:
*@li num_features: Number of features \n
*@par Attributes:
*@li num_features: Number of features

*@par Outputs:
*@li y: A list of rank 1 tensors each containing the bucketized values for \n
a single feature.
*@par Outputs:
*@li y: A list of rank 1 tensors each containing the bucketized values for
a single feature . \n

*@attention Constraints: \n
*@attention Constraints:
*BoostedTreesBucketize runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator BoostedTreesBucketize.
*Compatible with the TensorFlow operator BoostedTreesBucketize . \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.


+ 154
- 154
third_party/fwkacllib/inc/ops/candidate_sampling_ops.h

@@ -26,44 +26,44 @@
namespace ge {

/**
*@brief Generates labels for candidate sampling with \n
a learned unigram distribution.
*@brief Generates labels for candidate sampling with
a learned unigram distribution. \n

*@par Inputs:
*Input "true_classes" is a 2D matrix. \n
*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
the IDs of the "num_true" "target_classes" in the corresponding original label.
*@par Inputs:
*Input "true_classes" is a 2D matrix.
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label. \n

*@par Attributes:
*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection, \n
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique.
*This requires some approximation to estimate the post-rejection \n
*This requires some approximation to estimate the post-rejection
sampling probabilities.
*@li range_max: The sampler will sample integers from the interval \n
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" are set to be non-zero.
*@li seed2: A second seed to avoid seed collision.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each \n
*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing \n
the number of times each candidate is expected to occur in a batch of sampled \n
*@li true_expected_count: A "batch_size * num_true" matrix, representing
the number of times each candidate is expected to occur in a batch of sampled
candidates. If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", \n
*@li sampled_expected_count: A vector of length "num_sampled",
for each sampled candidate.
*representing the number of times the candidate is expected to occur \n
*representing the number of times the candidate is expected to occur
in a batch of sampled candidates.
* If "unique" is true, then this is a probability. \n
* If "unique" is true, then this is a probability.

*@attention Constraints: \n
*ThreadUnsafeUnigramCandidateSampler runs on the Ascend AI CPU, \n
which delivers poor performance.
*@attention Constraints:
*ThreadUnsafeUnigramCandidateSampler runs on the Ascend AI CPU,
which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler.
*Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -82,44 +82,44 @@ REG_OP(ThreadUnsafeUnigramCandidateSampler)
.OP_END_FACTORY_REG(ThreadUnsafeUnigramCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a learned \n
unigram distribution.
*@brief Generates labels for candidate sampling with a learned
unigram distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
*Input "true_classes" is a 2D matrix.
*Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection, \n
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique.
*This requires some approximation to estimate the post-rejection \n
*This requires some approximation to estimate the post-rejection
sampling probabilities.
*@li range_max: The sampler will sample integers from the interval \n
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" are set to be non-zero.
*@li seed2: A second seed to avoid seed collision.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", \n
*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled",
in which each element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing the \n
number of times each candidate is expected to occur \n
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
number of times each candidate is expected to occur
in a batch of sampled candidates.
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", for each \n
*@li sampled_expected_count: A vector of length "num_sampled", for each
sampled candidate representing the number of times.
* the candidate is expected to occur in a batch of sampled candidates. \n
*If "unique" is true, then this is a probability.
* the candidate is expected to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability. \n

*@attention Constraints: \n
*UniformCandidateSampler runs on the Ascend AI CPU, \n
which delivers poor performance.
*@attention Constraints:
*UniformCandidateSampler runs on the Ascend AI CPU,
which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator UniformCandidateSampler.
*Compatible with the TensorFlow operator UniformCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -138,56 +138,56 @@ REG_OP(UniformCandidateSampler)
.OP_END_FACTORY_REG(UniformCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a learned \n
unigram distribution.
*@brief Generates labels for candidate sampling with a learned
unigram distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection, \n
so that all sampled candidates in a batch are unique. This requires \n
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique. This requires
some approximation to estimate the post-rejection sampling probabilities.
*@li range_max: The sampler will sample integers from the interval [0, range_max).
*@li vocab_file: Each valid line in this file (which should have a \n
CSV-like format) corresponds to a valid word ID. \n
*@li vocab_file: Each valid line in this file (which should have a
CSV-like format) corresponds to a valid word ID.
*IDs are in sequential order, starting from num_reserved_ids.
*@li distortion: The distortion is used to skew the unigram probability \n
distribution. Each weight is first raised to the distortion's power before \n
*@li distortion: The distortion is used to skew the unigram probability
distribution. Each weight is first raised to the distortion's power before
adding to the internal unigram distribution.
*@li num_reserved_ids: Optionally some reserved IDs can be added in the range \n
[0, ..., num_reserved_ids) by the users. \n
*@li num_reserved_ids: Optionally some reserved IDs can be added in the range
[0, ..., num_reserved_ids) by the users.
* One use case is that a special unknown word token is used as ID 0.
*@li num_shards: A sampler can be used to sample from a subset of the \n
*@li num_shards: A sampler can be used to sample from a subset of the
original range. in order to speed up the whole computation through parallelism.
*@li shard: A sampler can be used to sample from a subset of the original \n
*@li shard: A sampler can be used to sample from a subset of the original
range in order to speed up the whole computation through parallelism.
*@li unigrams: A list of unigram counts or probabilities, one per ID in \n
*@li unigrams: A list of unigram counts or probabilities, one per ID in
sequential order.
*@li seed: If either "seed" or "seed2" are set to be non-zero.
*@li seed2: A second seed to avoid seed collision.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each \n
*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing the \n
number of times each candidate is expected to occur in a batch of sampled \n
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
number of times each candidate is expected to occur in a batch of sampled
candidates. If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", \n
for each sampled candidate representing the number of times the candidate is \n
expected to occur in a batch of sampled candidates. \n
If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled",
for each sampled candidate representing the number of times the candidate is
expected to occur in a batch of sampled candidates.
If "unique" is true, then this is a probability. \n

*@attention Constraints: \n
* FixedUnigramCandidateSampler runs on the Ascend AI CPU, \n
which delivers poor performance.
*@attention Constraints:
* FixedUnigramCandidateSampler runs on the Ascend AI CPU,
which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator FixedUnigramCandidateSampler.
*Compatible with the TensorFlow operator FixedUnigramCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -212,43 +212,43 @@ REG_OP(FixedUnigramCandidateSampler)
.OP_END_FACTORY_REG(FixedUnigramCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a learned \n
unigram distribution.
*@brief Generates labels for candidate sampling with a learned
unigram distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection, \n
so that all sampled candidates in a batch are unique. \n
*This requires some approximation to estimate the post-rejection \n
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique.
*This requires some approximation to estimate the post-rejection
sampling probabilities.
*@li range_max: The sampler will sample integers from the interval \n
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" are set to be non-zero.
*@li seed2: A second seed to avoid seed collision.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each \n
*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing \n
the number of times each candidate is expected to occur in a batch of sampled candidates. \n
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", for each \n
sampled candidate representing the number of times the candidate is expected \n
to occur in a batch of sampled candidates. \n
*@li true_expected_count: A "batch_size * num_true" matrix, representing
the number of times each candidate is expected to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", for each
sampled candidate representing the number of times the candidate is expected
to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability. \n

*@attention Constraints: \n
*LearnedUnigramCandidateSampler runs on the Ascend AI CPU, which delivers \n
poor performance.
*@attention Constraints:
*LearnedUnigramCandidateSampler runs on the Ascend AI CPU, which delivers
poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator LearnedUnigramCandidateSampler.
*Compatible with the TensorFlow operator LearnedUnigramCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -267,42 +267,42 @@ REG_OP(LearnedUnigramCandidateSampler)
.OP_END_FACTORY_REG(LearnedUnigramCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a log-uniform \n
distribution.
*@brief Generates labels for candidate sampling with a log-uniform
distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
the IDs of the "num_true" "target_classes" in the corresponding original label. \n
* Input "true_classes" is a 2D matrix.
*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection, so that all \n
sampled candidates in a batch are unique. This requires some approximation \n
*@li unique: If "unique" is true, samples with rejection, so that all
sampled candidates in a batch are unique. This requires some approximation
to estimate the post-rejection sampling probabilities.
*@li range_max: The sampler will sample integers from the interval \n
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" are set to be non-zero.
*@li seed2: A second seed to avoid seed collision.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each \n
*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing \n
the number of times each candidate is expected to occur in a batch of sampled \n
*@li true_expected_count: A "batch_size * num_true" matrix, representing
the number of times each candidate is expected to occur in a batch of sampled
candidates. If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", for each \n
sampled candidate representing the number of times the candidate is expected \n
to occur in a batch of sampled candidates. \n
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", for each
sampled candidate representing the number of times the candidate is expected
to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability. \n

*@attention Constraints: \n
*LogUniformCandidateSampler runs on the Ascend AI CPU, which delivers \n
poor performance.
*@attention Constraints:
*LogUniformCandidateSampler runs on the Ascend AI CPU, which delivers
poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator LogUniformCandidateSampler.
*Compatible with the TensorFlow operator LogUniformCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -321,38 +321,38 @@ REG_OP(LogUniformCandidateSampler)
.OP_END_FACTORY_REG(LogUniformCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a learned \n
unigram distribution.
*@brief Generates labels for candidate sampling with a learned
unigram distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
the IDs of the "num_true" "target_classes" in the corresponding original label. \n
* Input "true_classes" is a 2D matrix.
*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection, \n
so that all sampled candidates in a batch are unique. This requires some \n
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique. This requires some
approximation to estimate the post-rejection sampling probabilities.
*@li seed: If either "seed" or "seed2" are set to be non-zero.
*@li seed2: A second seed to avoid seed collision.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", \n
*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled",
in which each element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing the \n
number of times each candidate is expected to occur in a batch of sampled candidates. \n
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
number of times each candidate is expected to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", for each \n
sampled candidate representing the number of times the candidate is expected \n
to occur in a batch of sampled candidates. If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", for each
sampled candidate representing the number of times the candidate is expected
to occur in a batch of sampled candidates. If "unique" is true, then this is a probability. \n

*@attention Constraints: \n
*AllCandidateSampler runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*AllCandidateSampler runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator AllCandidateSampler.
*Compatible with the TensorFlow operator AllCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -370,31 +370,31 @@ REG_OP(AllCandidateSampler)
.OP_END_FACTORY_REG(AllCandidateSampler)

/**
*@brief Computes the "ids" of the positions in "sampled_candidates" that \n
match "true_labels".
*@brief Computes the "ids" of the positions in "sampled_candidates" that
match "true_labels". \n

*@par Inputs:
* @li Input "true_classes" is a 2D matrix. \n
* @li true_classes: The "true_classes" output of UnpackSparseLabels. \n
* @li sampled_candidates: The "sampled_candidates" output of CandidateSampler. \n
*@par Inputs:
* @li Input "true_classes" is a 2D matrix.
* @li true_classes: The "true_classes" output of UnpackSparseLabels.
* @li sampled_candidates: The "sampled_candidates" output of CandidateSampler. \n

*@par Attributes:
*@par Attributes:
*@li num_true: Number of true labels per context.
*@li seed: If either "seed" or "seed2" are set to be non-zero.
*@li seed2: A second seed to avoid seed collision.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@par Outputs:
* @li indices: A vector of indices corresponding to rows of "true_candidates".
* @li ids: A vector of IDs of positions in "sampled_candidates" that match a \n
* @li ids: A vector of IDs of positions in "sampled_candidates" that match a
"true_label" for the row with the corresponding index in indices.
* @li weights: A vector of the same length as "indices" and "ids", in which \n
each element is -FLOAT_MAX.
* @li weights: A vector of the same length as "indices" and "ids", in which
each element is -FLOAT_MAX. \n

*@attention Constraints: \n
*ComputeAccidentalHits runs on the Ascend AI CPU, which delivers poor performance. \n
*@attention Constraints:
*ComputeAccidentalHits runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator ComputeAccidentalHits.
*Compatible with the TensorFlow operator ComputeAccidentalHits. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.


+ 4
- 4
third_party/fwkacllib/inc/ops/condtake_ops.h

@@ -26,17 +26,17 @@

namespace ge {
/**
*@brief Take elements from data if specific condition is satisfied on mask.
*@brief Take elements from data if specific condition is satisfied on mask. \n

*@par Inputs:
*@li data: input tensor from which to take elements, High-dimension input would \n
*@li data: input tensor from which to take elements, High-dimension input would
first be flattened.
*@li mask: condition param; must be the same shape with data.
*@li mask: condition param; must be the same shape with data. \n

*@par Attributes:
*@li mode:convert by convert in Mode.
*@li val:convert by <class 'float'>
*@li eps:convert by <class 'float'> (default: 1e-06)
*@li eps:convert by <class 'float'> (default: 1e-06) \n

*@par Outputs:
*@li out_data: the elements taken


+ 91
- 91
third_party/fwkacllib/inc/ops/control_flow_ops.h

@@ -27,21 +27,21 @@
namespace ge {

/**
*@brief Forwards the value of an available tensor from input "x" to output "y". \n
* Merge waits for at least one of the input tensors to become available. \n
* It is usually combined with Switch to implement branching. \n
* Merge forwards the first tensor to become available to output "y", \n
* and sets "value_index" the index of the tensor in inputs.
*@brief Forwards the value of an available tensor from input "x" to output "y".
* Merge waits for at least one of the input tensors to become available.
* It is usually combined with Switch to implement branching.
* Merge forwards the first tensor to become available to output "y",
* and sets "value_index" the index of the tensor in inputs . \n

*@par Inputs:
*x: The input tensors, one of which will become available. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The input tensors, one of which will become available.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool . It's a dynamic input. \n

*@par Outputs:
*@li y: The available tensor. Has the same type as "x".
*@li value_index: A scalar of type int32, for the index of the chosen input \n
* tensor.
*@li value_index: A scalar of type int32, for the index of the chosen input
* tensor . \n

*@see Switch()

@@ -59,21 +59,21 @@ REG_OP(Merge)
.OP_END_FACTORY_REG(Merge)

/**
*@brief Forwards the value of an available tensor from input "x" to output "y". \n
* Merge waits for at least one of the input tensors to become available. \n
* It is usually combined with Switch to implement branching. \n
* Merge forwards the first tensor to become available to output "y", \n
* and sets "value_index" the index of the tensor in inputs.
*@brief Forwards the value of an available tensor from input "x" to output "y".
* Merge waits for at least one of the input tensors to become available.
* It is usually combined with Switch to implement branching.
* Merge forwards the first tensor to become available to output "y",
* and sets "value_index" the index of the tensor in inputs . \n

*@par Inputs:
*x: The input tensors, one of which will become available. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The input tensors, one of which will become available.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool . It's a dynamic input. \n

*@par Outputs:
*@li y: The available tensor. Has the same type as "x".
*@li value_index: A scalar of type int32, for the index of the chosen input \n
* tensor.
*@li value_index: A scalar of type int32, for the index of the chosen input
* tensor . \n

*@see Switch() | Merge()

@@ -91,21 +91,21 @@ REG_OP(RefMerge)
.OP_END_FACTORY_REG(RefMerge)

/**
*@brief Forwards "data" to the output port determined by "pred". \n
* If "pred" is "true", the data input is forwarded to "output_true". \n
* Otherwise, the data is forwarded to "output_false".
*@brief Forwards "data" to the output port determined by "pred".
* If "pred" is "true", the data input is forwarded to "output_true".
* Otherwise, the data is forwarded to "output_false" . \n

*@par Inputs:
*@li data: The tensor to be forwarded. \ n
* Must be one of the following types: float16, float32, float64, \n
* Must be one of the following types: float16, float32, float64,
* int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*@li pred: A boolean scalar. The output port that will receive data.
*@li pred: A boolean scalar. The output port that will receive data . \n

*@par Outputs:
*@li output_false: If "pred" is "false", data will be forwarded to this output. \n
*@li output_false: If "pred" is "false", data will be forwarded to this output.
* Has the same type as "data".
*@li output_true: If "pred" is "true", data will be forwarded to this output. \n
* Has the same type as "data".
*@li output_true: If "pred" is "true", data will be forwarded to this output.
* Has the same type as "data" . \n

*@see Merge()

@@ -126,21 +126,21 @@ REG_OP(Switch)
.OP_END_FACTORY_REG(Switch)

/**
*@brief Forwards "data" to the output port determined by "pred". \n
* If "pred" is "true", the data input is forwarded to "output_true". \n
* Otherwise, the data is forwarded to "output_false".
*@brief Forwards "data" to the output port determined by "pred".
* If "pred" is "true", the data input is forwarded to "output_true".
* Otherwise, the data is forwarded to "output_false" . \n

*@par Inputs:
*@li data: The ref tensor to be forwarded. \n
* Must be one of the following types: float16, float32, float64, \n
*@li data: The ref tensor to be forwarded.
* Must be one of the following types: float16, float32, float64,
* int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*@li pred: A boolean scalar. The output port that will receive data.
*@li pred: A boolean scalar. The output port that will receive data . \n

*@par Outputs:
*@li output_false: If "pred" is "false", data will be forwarded to this output. \n
*@li output_false: If "pred" is "false", data will be forwarded to this output.
* Has the same type as "data".
*@li output_true: If "pred" is "true", data will be forwarded to this output. \n
* Has the same type as "data".
*@li output_true: If "pred" is "true", data will be forwarded to this output.
* Has the same type as "data" . \n

*@see Merge() | Switch()

@@ -161,16 +161,16 @@ REG_OP(RefSwitch)
.OP_END_FACTORY_REG(RefSwitch)

/**
*@brief Forwards "data" to the output port determined by "pred_value".
*@brief Forwards "data" to the output port determined by "pred_value" . \n

*@par Inputs:
*@li data: The tensor to be forwarded. \ n
* Must be one of the following types: float16, float32, float64, \n
* Must be one of the following types: float16, float32, float64,
* int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*@li pred_value: A int64 tensor which determines the output port that will receive data.
*@li pred_value: A int64 tensor which determines the output port that will receive data . \n

*@par Outputs:
*output: The output tensors, one of which will become available. \n
*output: The output tensors, one of which will become available.
* Has the same type as "data".
*/
REG_OP(SwitchN)
@@ -184,24 +184,24 @@ REG_OP(SwitchN)
.OP_END_FACTORY_REG(SwitchN)

/**
*@brief Creates or finds a child frame, and makes "x" available to the child \n
* frame. This op is used together with Exit to create loops in the graph. \n
* The Executor uses the unique "frame_name" to identify frames. \n
* If "is_constant" is "true", output "y" is a constant in the child \n
* frame; otherwise it may be changed in the child frame.
*@brief Creates or finds a child frame, and makes "x" available to the child
* frame. This op is used together with Exit to create loops in the graph.
* The Executor uses the unique "frame_name" to identify frames.
* If "is_constant" is "true", output "y" is a constant in the child
* frame; otherwise it may be changed in the child frame . \n

*@par Inputs:
*x: The tensor to be made available to the child frame. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The tensor to be made available to the child frame.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n

*@par Attributes:
*@li frame_name: A required string. The name of the child frame.
*@li is_constant: A required bool. If true, the output is constant in \n
* the child frame.
*@li is_constant: A required bool. If true, the output is constant in
* the child frame . \n

*@par Outputs:
*y: A Tensor. Has the same type as "x".
*y: A Tensor. Has the same type as "x" . \n

*@see Exit()

@@ -220,24 +220,24 @@ REG_OP(Enter)
.OP_END_FACTORY_REG(Enter)

/**
*@brief Creates or finds a child frame, and makes "x" available to the child \n
* frame. This op is used together with Exit to create loops in the graph. \n
* The Executor uses the unique "frame_name" to identify frames. \n
* If "is_constant" is "true", output "y" is a constant in the child \n
* frame; otherwise it may be changed in the child frame.
*@brief Creates or finds a child frame, and makes "x" available to the child
* frame. This op is used together with Exit to create loops in the graph.
* The Executor uses the unique "frame_name" to identify frames.
* If "is_constant" is "true", output "y" is a constant in the child
* frame; otherwise it may be changed in the child frame. \n

*@par Inputs:
*x: The tensor to be made available to the child frame. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The tensor to be made available to the child frame.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Attributes:
*@li frame_name: A required string. The name of the child frame.
*@li is_constant: A required bool. If true, the output is constant in \n
* the child frame.
*@li is_constant: A required bool. If true, the output is constant in
* the child frame. \n

*@par Outputs:
*y: A tensor. Has the same type as "x".
*y: A tensor. Has the same type as "x" . \n

*@see Exit() | Enter()

@@ -256,14 +256,14 @@ REG_OP(RefEnter)
.OP_END_FACTORY_REG(RefEnter)

/**
*@brief Forwards the input to the output. This op represents the loop \n
* termination condition.
*@brief Forwards the input to the output. This op represents the loop
* termination condition. \n

*@par Inputs:
*x: A boolean scalar. The condition of the Switch op.
*x: A boolean scalar. The condition of the Switch op. \n

*@par Outputs:
*y: The tensor "x".
*y: The tensor "x" . \n

*@see Switch()

@@ -276,15 +276,15 @@ REG_OP(LoopCond)
.OP_END_FACTORY_REG(LoopCond)

/**
*@brief Makes the input available to the next iteration.
*@brief Makes the input available to the next iteration. \n

*@par Inputs:
*x: The tensor to be made available to the next iteration. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The tensor to be made available to the next iteration.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x".
*y: A Tensor. Has the same type as "x" . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator NextIteration.
@@ -299,15 +299,15 @@ REG_OP(NextIteration)
.OP_END_FACTORY_REG(NextIteration)

/**
*@brief Makes the input available to the next iteration.
*@brief Makes the input available to the next iteration. \n

*@par Inputs:
*x: The tensor to be made available to the next iteration. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The tensor to be made available to the next iteration.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Outputs:
*y: A tensor. Has the same type as "x".
*y: A tensor. Has the same type as "x" . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator RefNextIteration.
@@ -322,15 +322,15 @@ REG_OP(RefNextIteration)
.OP_END_FACTORY_REG(RefNextIteration)

/**
*@brief Exits the current frame to its parent frame.
*@brief Exits the current frame to its parent frame. \n

*@par Inputs:
*x: The tensor to be made available to the parent frame. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The tensor to be made available to the parent frame.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x".
*y: A Tensor. Has the same type as "x" . \n

*@see Enter()

@@ -347,15 +347,15 @@ REG_OP(Exit)
.OP_END_FACTORY_REG(Exit)

/**
*@brief Exits the current frame to its parent frame.
*@brief Exits the current frame to its parent frame. \n

*@par Inputs:
*x: The tensor to be made available to the parent frame. \n
* Must be one of the following types: float16, float32, float64, int8, \n
* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*x: The tensor to be made available to the parent frame.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Outputs:
*y: A tensor. Has the same type as "x".
*y: A tensor. Has the same type as "x" . \n

*@see Enter() | Exit()

@@ -372,9 +372,9 @@ REG_OP(RefExit)
.OP_END_FACTORY_REG(RefExit)
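The Enter, LoopCond, NextIteration and Exit comments above describe the frame-based loop primitives. A minimal illustrative sketch of how they compose into a counted loop, written as plain C++ rather than graph construction (the function and variable names are hypothetical):

#include <cstdint>

int64_t RunCountedLoop(int64_t x, int64_t limit) {
  int64_t value = x;            // Enter: "x" becomes available inside the child frame
  while (true) {
    bool pred = value < limit;  // LoopCond: the boolean loop condition
    if (!pred) {
      break;                    // Switch routes the value toward Exit when pred is false
    }
    value = value + 1;          // loop body; NextIteration feeds the result to the next pass
  }
  return value;                 // Exit: the value leaves the frame to the parent frame
}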

/**
*@brief Only useful as a placeholder for control edges. \n
* It is similar to a no-op that always produces a live control output \n
* even when some control inputs are dead.
*@brief Only useful as a placeholder for control edges.
* It is similar to a no-op that always produces a live control output
* even when some control inputs are dead. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator ControlTrigger.
@@ -389,7 +389,7 @@ REG_OP(ControlTrigger)
* Three inputs, including:
*@li x: A one-dimensional tensor of type int32, specifying the queried shape; max size is 8.
*@li data_seq: A one-dimensional tensor of type int32, specifying the mapped table to be queried.
*@li level_index: A one-dimensional tensor of type int32, specifying the secondary index.
*@li level_index: A one-dimensional tensor of type int32, specifying the secondary index. \n

*@par Outputs:
*@li y: A Tensor with shape [batch, 8], of type int32, specifying the index of the shape in the map.


+ 31
- 31
third_party/fwkacllib/inc/ops/ctc_ops.h View File

@@ -27,29 +27,29 @@
namespace ge {

/**
*@brief Calculates the CTC Loss (log probability) for each batch entry. \n
Also calculates the gradient.
*@brief Calculates the CTC Loss (log probability) for each batch entry.
Also calculates the gradient. \n

*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
*@li labels_indices: The indices of a `SparseTensor<int32, 2>`. \n
`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for \n
*@li labels_indices: The indices of a `SparseTensor<int32, 2>`.
`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
`(batch b, time t)`.
*@li labels_values: The values (labels) associated with the given batch and time.
*@li sequence_length: A vector containing sequence lengths (batch).
*@li sequence_length: A vector containing sequence lengths (batch). \n

*@par Outputs:
*@li loss: A vector (batch) containing log-probabilities.
*@li gradient: The gradient of `loss`. 3-D, shape: `(max_time x \n
batch_size x num_classes)`.
*@li gradient: The gradient of `loss`. 3-D, shape: `(max_time x
batch_size x num_classes)`. \n

*@par Attributes:
*@li preprocess_collapse_repeated: Scalar, if true then repeated labels are collapsed prior to \n
*@li preprocess_collapse_repeated: Scalar, if true then repeated labels are collapsed prior to
the CTC calculation. If not specified, defaults to false.
*@li ctc_merge_repeated: Scalar. If set to false, *during* CTC calculation \n
repeated non-blank labels will not be merged and are interpreted as \n
individual labels. This is a simplified version of CTC. \n
If not specified, defaults to true
*@li ctc_merge_repeated: Scalar. If set to false, *during* CTC calculation
repeated non-blank labels will not be merged and are interpreted as
individual labels. This is a simplified version of CTC.
If not specified, defaults to true. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow CTCLoss operator.
@@ -67,24 +67,24 @@ REG_OP(CTCLoss)
.OP_END_FACTORY_REG(CTCLoss)
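As a reading aid for the labels_indices/labels_values description above, the following hypothetical helper (plain C++, not a GE API) builds that SparseTensor-style encoding from dense per-batch label sequences; labels_indices[i] == {b, t} means labels_values[i] holds the label id for (batch b, time t).

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

void BuildSparseLabels(const std::vector<std::vector<int32_t>> &dense_labels,  // [batch][time]
                       std::vector<std::pair<int32_t, int32_t>> &labels_indices,
                       std::vector<int32_t> &labels_values) {
  for (std::size_t b = 0; b < dense_labels.size(); ++b) {
    for (std::size_t t = 0; t < dense_labels[b].size(); ++t) {
      labels_indices.push_back({static_cast<int32_t>(b), static_cast<int32_t>(t)});
      labels_values.push_back(dense_labels[b][t]);
    }
  }
}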

/**
*@brief Performs greedy decoding on the logits given in inputs.
*@brief Performs greedy decoding on the logits given in inputs. \n

*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`.
*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n

*@par Attributes:
*@li merge_repeated: If True, merge repeated classes in output.
*@li merge_repeated: If True, merge repeated classes in output. \n

*@par Outputs:
*@li decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`,\n
*@li decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`,
of a `SparseTensor<int64, 2>`. The rows store: [batch, time].
*@li decoded_values: Values vector, size: `(total_decoded_outputs)`,\n
*@li decoded_values: Values vector, size: `(total_decoded_outputs)`,
of a `SparseTensor<int64, 2>`. The vector stores the decoded classes.
*@li decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor.\n
*@li decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor.
Values are: `[batch_size, max_decoded_length]`.
*@li log_probability: Matrix, size `(batch_size x 1)`, containing sequence\n
log-probabilities.
*@li log_probability: Matrix, size `(batch_size x 1)`, containing sequence
log-probabilities. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow CTCGreedyDecoder operator.
@@ -100,27 +100,27 @@ REG_OP(CTCGreedyDecoder)
.OP_END_FACTORY_REG(CTCGreedyDecoder)
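The greedy decoding and merge_repeated behaviour documented above can be summarized with a small illustrative sketch (plain C++, assuming the conventional CTC blank label is the last class index; this is not the operator's implementation):

#include <cstddef>
#include <vector>

std::vector<int> CtcGreedyDecode(const std::vector<std::vector<float>> &logits,  // [time][num_classes]
                                 std::size_t sequence_length,
                                 bool merge_repeated) {
  const int blank = logits.empty() ? 0 : static_cast<int>(logits[0].size()) - 1;
  std::vector<int> decoded;
  int prev = -1;
  for (std::size_t t = 0; t < sequence_length && t < logits.size(); ++t) {
    // Pick the class with the highest logit at this time step.
    int best = 0;
    for (std::size_t c = 1; c < logits[t].size(); ++c) {
      if (logits[t][c] > logits[t][best]) best = static_cast<int>(c);
    }
    // Optionally merge repeated classes, then drop blanks.
    if (!(merge_repeated && best == prev) && best != blank) decoded.push_back(best);
    prev = best;
  }
  return decoded;
}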

/**
*@brief Performs beam search decoding on the logits given in input.
*@brief Performs beam search decoding on the logits given in input. \n

*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`.
*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n

*@par Attributes:
*@li merge_repeated: If True, merge repeated classes in output.
*@li merge_repeated: If True, merge repeated classes in output. \n

*@par Outputs:
*@li decoded_indices: A list (length: top_paths) of indices matrices. Matrix j,\n
size `(total_decoded_outputs[j] x 2)`, has indices of a\n
*@li decoded_indices: A list (length: top_paths) of indices matrices. Matrix j,
size `(total_decoded_outputs[j] x 2)`, has indices of a
`SparseTensor<int64, 2>`. The rows store: [batch, time].
*@li decoded_values: A list (length: top_paths) of values vectors. Vector j,\n
size `(length total_decoded_outputs[j])`, has the values of a\n
*@li decoded_values: A list (length: top_paths) of values vectors. Vector j,
size `(length total_decoded_outputs[j])`, has the values of a
`SparseTensor<int64, 2>`. The vector stores the decoded classes for beam j.
*@li decoded_shape: A list (length: top_paths) of shape vector. Vector j,\n
size `(2)`, stores the shape of the decoded `SparseTensor[j]`.\n
*@li decoded_shape: A list (length: top_paths) of shape vector. Vector j,
size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
Its values are: `[batch_size, max_decoded_length[j]]`.
*@li log_probability: A matrix, shaped: `(batch_size x top_paths)`. The\n
sequence log-probabilities.
*@li log_probability: A matrix, shaped: `(batch_size x top_paths)`. The
sequence log-probabilities. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow CTCBeamSearchDecoder operator.


+ 553
- 551
third_party/fwkacllib/inc/ops/data_flow_ops.h
File diff suppressed because it is too large
View File


+ 360
- 350
third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
File diff suppressed because it is too large
View File


+ 85
- 103
third_party/fwkacllib/inc/ops/functional_ops.h View File

@@ -25,40 +25,27 @@
#include "graph/operator.h"

namespace ge {
REG_OP(SymbolicGradient)
.DYNAMIC_INPUT(input, TensorType::ALL())
.DYNAMIC_OUTPUT(output, TensorType::ALL())
.GRAPH(f)
.OP_END_FACTORY_REG(SymbolicGradient)

REG_OP(RemoteCall)
.INPUT(target, DT_STRING)
.DYNAMIC_INPUT(args, TensorType::ALL())
.DYNAMIC_OUTPUT(output, TensorType::ALL())
.GRAPH(f)
.OP_END_FACTORY_REG(RemoteCall)

/**
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n
* If "cond" means True, the selected subgraph is "then_branch". \n
* Otherwise, the selected subgraph is "else_branch".
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
* If "cond" means True, the selected subgraph is "then_branch".
* Otherwise, the selected subgraph is "else_branch" . \n

*@par Inputs:
*@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n
* it will be converted to a boolean according to the following rule: \n
* if "cond" is a numerical scalar, non-zero means True and zero means False; \n
* if "cond" is a string scalar, non-empty means True and empty means False; \n
*@li cond: A Tensor. If "cond" is not a scalar of boolean type,
* it will be converted to a boolean according to the following rule:
* if "cond" is a numerical scalar, non-zero means True and zero means False;
* if "cond" is a string scalar, non-empty means True and empty means False;
* if "cond" is not a scalar, non-empty means True and empty means False.
*@li input: The input tensors.
*@li input: The input tensors. It's a dynamic input. \n

*@par Graphs:
*@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n
*@li then_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what else_branch returns.
*@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n
* whose types are the same as what then_branch returns.
*@li else_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what then_branch returns. \n

*@par Outputs:
*output: The output tensors returned by either then_branch(input) or else_branch(input).
*output: The output tensors returned by either then_branch(input) or else_branch(input). \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator _If.
@@ -72,26 +59,26 @@ REG_OP(_If)
.OP_END_FACTORY_REG(_If)
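The "cond" conversion rule quoted above (also used by StatelessIf and If below) amounts to the following checks, shown with hypothetical helper functions rather than GE types:

#include <string>
#include <vector>

bool CondFromNumericalScalar(double value) { return value != 0.0; }             // non-zero means True
bool CondFromStringScalar(const std::string &value) { return !value.empty(); }  // non-empty means True
template <typename T>
bool CondFromNonScalar(const std::vector<T> &elements) { return !elements.empty(); }  // non-empty means True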

/**
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n
* If "cond" means True, the selected subgraph is "then_branch". \n
* Otherwise, the selected subgraph is "else_branch".
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
* If "cond" means True, the selected subgraph is "then_branch".
* Otherwise, the selected subgraph is "else_branch" . \n

*@par Inputs:
*@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n
* it will be converted to a boolean according to the following rule: \n
* if "cond" is a numerical scalar, non-zero means True and zero means False; \n
* if "cond" is a string scalar, non-empty means True and empty means False; \n
*@li cond: A Tensor. If "cond" is not a scalar of boolean type,
* it will be converted to a boolean according to the following rule:
* if "cond" is a numerical scalar, non-zero means True and zero means False;
* if "cond" is a string scalar, non-empty means True and empty means False;
* if "cond" is not a scalar, non-empty means True and empty means False.
*@li input: The input tensors.
*@li input: The input tensors. It's a dynamic input. \n

*@par Graphs:
*@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n
*@li then_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what else_branch returns.
*@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n
* whose types are the same as what then_branch returns.
*@li else_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what then_branch returns. \n

*@par Outputs:
*output: The output tensors returned by either then_branch(input) or else_branch(input).
*output: The output tensors returned by either then_branch(input) or else_branch(input). \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator StatelessIf.
@@ -105,26 +92,26 @@ REG_OP(StatelessIf)
.OP_END_FACTORY_REG(StatelessIf)

/**
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n
* If "cond" means True, the selected subgraph is "then_branch". \n
* Otherwise, the selected subgraph is "else_branch".
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
* If "cond" means True, the selected subgraph is "then_branch".
* Otherwise, the selected subgraph is "else_branch" . \n

*@par Inputs:
*@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n
* it will be converted to a boolean according to the following rule: \n
* if "cond" is a numerical scalar, non-zero means True and zero means False; \n
* if "cond" is a string scalar, non-empty means True and empty means False; \n
*@li cond: A Tensor. If "cond" is not a scalar of boolean type,
* it will be converted to a boolean according to the following rule:
* if "cond" is a numerical scalar, non-zero means True and zero means False;
* if "cond" is a string scalar, non-empty means True and empty means False;
* if "cond" is not a scalar, non-empty means True and empty means False.
*@li input: The input tensors.
*@li input: The input tensors. It's a dynamic input. \n

*@par Graphs:
*@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n
*@li then_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what else_branch returns.
*@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n
* whose types are the same as what then_branch returns.
*@li else_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what then_branch returns. \n

*@par Outputs:
*output: The output tensors returned by either then_branch(input) or else_branch(input).
*output: The output tensors returned by either then_branch(input) or else_branch(input). \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator If.
@@ -138,18 +125,18 @@ REG_OP(If)
.OP_END_FACTORY_REG(If)

/**
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
*@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n

*@par Inputs:
*@li branch_index: An int32 scalar which determines the selected subgraph.
*@li input: The input tensors, which will be passed to the subgraph.
*@li input: The input tensors, which will be passed to the subgraph. It's a dynamic input. \n

*@par Graphs:
*branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors, \n
* whose types are the same as what every other subgraph returns.
*branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors,
* whose types are the same as what every other subgraph returns. \n

*@par Outputs:
*output: The output tensors returned by one of branches.
*output: The output tensors returned by one of the branches. It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator Case.
@@ -162,25 +149,25 @@ REG_OP(Case)
.OP_END_FACTORY_REG(Case)
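A minimal illustrative sketch of the Case dispatch described above, using hypothetical std::function-based types in place of subgraphs (how the real op handles an out-of-range branch_index is not modelled here):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

using Tensors = std::vector<float>;
using Branch = std::function<Tensors(const Tensors &)>;

Tensors RunCase(int32_t branch_index, const Tensors &input, const std::vector<Branch> &branches) {
  // branch_index selects the subgraph; every branch must return the same tensor types.
  return branches.at(static_cast<std::size_t>(branch_index))(input);
}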

/**
*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False.
*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n

*@par Inputs:
*input: The input tensors.
*input: The input tensors. It's a dynamic input. \n

*@par Graphs:
*@li cond: A subgraph takes 'input' and returns a tensor. \n
* If the tensor is not a scalar of boolean type, \n
* it will be converted to a boolean according to the following rule: \n
* if it is a numerical scalar, non-zero means True and zero means False; \n
* if it is a string scalar, non-empty means True and empty means False; \n
*@li cond: A subgraph takes 'input' and returns a tensor.
* If the tensor is not a scalar of boolean type,
* it will be converted to a boolean according to the following rule:
* if it is a numerical scalar, non-zero means True and zero means False;
* if it is a string scalar, non-empty means True and empty means False;
* if it is not a scalar, non-empty means True and empty means False.
*@li body: A subgraph takes 'input' and returns a another list of tensors.
*@li body: A subgraph that takes 'input' and returns another list of tensors. \n

*@par Attributes:
*parallel_iterations: An optional int, default as 10.
*parallel_iterations: An optional int. Defaults to 10. \n

*@par Outputs:
*output: The output tensors returned by "body". Has the same type as "input".
*output: The output tensors returned by "body". Has the same type as "input" . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator _While.
@@ -193,25 +180,25 @@ REG_OP(_While)
.OP_END_FACTORY_REG(_While)
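The _While/While/StatelessWhile family documented here repeatedly runs "body" while "cond" evaluates to True on the current tensors; an illustrative sketch with hypothetical types (not GE graph APIs):

#include <functional>
#include <vector>

using Tensors = std::vector<float>;

Tensors RunWhile(Tensors input,
                 const std::function<bool(const Tensors &)> &cond,
                 const std::function<Tensors(const Tensors &)> &body) {
  while (cond(input)) {
    input = body(input);  // the body's outputs have the same types as its inputs
  }
  return input;
}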

/**
*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False.
*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n

*@par Inputs:
*input: The input tensors.
*input: The input tensors. It's a dynamic input. \n

*@par Graphs:
*@li cond: A subgraph takes 'input' and returns a tensor. \n
* If the tensor is not a scalar of boolean type, \n
* it will be converted to a boolean according to the following rule: \n
* if it is a numerical scalar, non-zero means True and zero means False; \n
* if it is a string scalar, non-empty means True and empty means False; \n
*@li cond: A subgraph takes 'input' and returns a tensor.
* If the tensor is not a scalar of boolean type,
* it will be converted to a boolean according to the following rule:
* if it is a numerical scalar, non-zero means True and zero means False;
* if it is a string scalar, non-empty means True and empty means False;
* if it is not a scalar, non-empty means True and empty means False.
*@li body: A subgraph takes 'input' and returns a another list of tensors.
*@li body: A subgraph that takes 'input' and returns another list of tensors. \n

*@par Attributes:
*parallel_iterations: An optional int, default as 10.
*parallel_iterations: An optional int. Defaults to 10. \n

*@par Outputs:
*output: The output tensors returned by "body". Has the same type as "input".
*output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator While.
@@ -225,25 +212,25 @@ REG_OP(While)
.OP_END_FACTORY_REG(While)

/**
*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False.
*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n

*@par Inputs:
*input: The input tensors.
*input: The input tensors. It's a dynamic input. \n

*@par Graphs:
*@li cond: A subgraph takes 'input' and returns a tensor. \n
* If the tensor is not a scalar of boolean type, \n
* it will be converted to a boolean according to the following rule: \n
* if it is a numerical scalar, non-zero means True and zero means False; \n
* if it is a string scalar, non-empty means True and empty means False; \n
*@li cond: A subgraph takes 'input' and returns a tensor.
* If the tensor is not a scalar of boolean type,
* it will be converted to a boolean according to the following rule:
* if it is a numerical scalar, non-zero means True and zero means False;
* if it is a string scalar, non-empty means True and empty means False;
* if it is not a scalar, non-empty means True and empty means False.
*@li body: A subgraph takes 'input' and returns a another list of tensors.
*@li body: A subgraph that takes 'input' and returns another list of tensors. \n

*@par Attributes:
*parallel_iterations: An optional int, default as 10.
*parallel_iterations: An optional int. Defaults to 10. \n

*@par Outputs:
*output: The output tensors returned by "body". Has the same type as "input".
*output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator StatelessWhile.
@@ -257,19 +244,19 @@ REG_OP(StatelessWhile)
.OP_END_FACTORY_REG(StatelessWhile)

/**
*@brief Cyclic execute the "body" subgraph until the first input of For op exceed upper bound.
*@brief Cyclic execute the "body" subgraph until the first input of For op exceed upper bound . \n

*@par Inputs:
*@li start: An int32 scalar. The lower bound.
*@li limit: An int32 scalar. The upper bound.
*@li delta: An int32 scalar. The step size.
*@li input: The input tensors, which will be passed to "body".
*@li input: The input tensors, which will be passed to "body" . It's a dynamic input. \n

*@par Graphs:
*body: A subgraph takes 'input' and returns a another list of tensors.
*body: A subgraph that takes 'input' and returns another list of tensors. \n

*@par Outputs:
*output: The output tensors returned by "body". Has the same type as "input".
*output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator For.
@@ -284,21 +271,21 @@ REG_OP(For)
.OP_END_FACTORY_REG(For)
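For the For op above, an illustrative sketch of the counter semantics (hypothetical types; a positive "delta" is assumed, and the counter value itself is not modelled as a body argument here):

#include <cstdint>
#include <functional>
#include <vector>

using Tensors = std::vector<float>;

Tensors RunFor(int32_t start, int32_t limit, int32_t delta, Tensors input,
               const std::function<Tensors(const Tensors &)> &body) {
  for (int32_t i = start; i < limit; i += delta) {  // counter defined by start, limit and delta
    input = body(input);  // each iteration feeds the previous outputs back into "body"
  }
  return input;
}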

/**
*@brief Pass the input tensors to the subgraph "f" and return the output tensors.
*@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n

*@par Inputs:
*args: The input tensors, which will be passed to "f".
*args: The input tensors, which will be passed to "f" . It's a dynamic input. \n

*@par Graphs:
*f: A subgraph takes 'args' and returns a another list of tensors.
*f: A subgraph that takes 'args' and returns another list of tensors. \n

*@par Attributes:
*@li config: An optional string, default as "".
*@li config_proto: An optional int, default as "".
*@li executor_type: An optional int, default as "".
*@li executor_type: An optional int, default as "" . \n

*@par Outputs:
*output: The output tensors returned by "f".
*output: The output tensors returned by "f" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator PartitionedCall.
@@ -313,21 +300,21 @@ REG_OP(PartitionedCall)
.OP_END_FACTORY_REG(PartitionedCall)

/**
*@brief Pass the input tensors to the subgraph "f" and return the output tensors.
*@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n

*@par Inputs:
*args: The input tensors, which will be passed to "f".
*args: The input tensors, which will be passed to "f" . It's a dynamic input. \n

*@par Graphs:
*f: A subgraph takes 'args' and returns a another list of tensors.
*f: A subgraph that takes 'args' and returns another list of tensors. \n

*@par Attributes:
*@li config: An optional string, default as "".
*@li config_proto: An optional int, default as "".
*@li executor_type: An optional int, default as "".
*@li executor_type: An optional int, default as "" . \n

*@par Outputs:
*output: The output tensors returned by "f".
*output: The output tensors returned by "f" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator StatefulPartitionedCall.
@@ -341,11 +328,6 @@ REG_OP(StatefulPartitionedCall)
.ATTR(executor_type, String, "")
.OP_END_FACTORY_REG(StatefulPartitionedCall)

REG_OP(FakeParam)
.OUTPUT(output, TensorType::ALL())
.ATTR(shape, ListInt, {})
.OP_END_FACTORY_REG(FakeParam)

} // namespace ge

#endif // GE_FUNCTIONAL_OPS_H_

+ 64
- 63
third_party/fwkacllib/inc/ops/hcom_ops.h View File

@@ -27,18 +27,18 @@ namespace ge {
/**
* @brief Outputs a tensor gathering all input tensors.
* @par Inputs:
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
* float32.
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
float32.
* @par Attributes:
* @li rank_size: A required integer identifying the number of ranks
* participating in the op.
* @li group: A required string identifying the group name of ranks
* participating in the op.
* @li rank_size: A required integer identifying the number of ranks
participating in the op.
* @li group: A required string identifying the group name of ranks
participating in the op.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
* @attention Constraints:\n
* "group" is limited to 128 characters. Use "hccl_world_group"
* as the name of a world group.
* @attention Constraints:
"group" is limited to 128 characters. Use "hccl_world_group"
as the name of a world group.
*/
REG_OP(HcomAllGather)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -50,25 +50,25 @@ REG_OP(HcomAllGather)
.OP_END_FACTORY_REG(HcomAllGather)

/**
* @brief Outputs a tensor containing the reduction across all input tensors
* passed to op.
* @brief Outputs a tensor containing the reduction across all input tensors
passed to op.
* @par Inputs:
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
* float32.
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
float32.
* @par Attributes:
* @li reduction: A required string identifying the reduction operation to
* perform.The supported operation are: "sum", "max", "min", "prod".
* @li group: A required string identifying the group name of ranks
* participating in the op.
* @li fusion: An optional integer identifying the fusion flag of the op. \n
* 0: no fusion; 1 (default): fusion; 2: fusion the ops by fusion id.
* @li reduction: A required string identifying the reduction operation to
perform. The supported operations are: "sum", "max", "min", "prod".
* @li group: A required string identifying the group name of ranks
participating in the op.
* @li fusion: An optional integer identifying the fusion flag of the op.
0: no fusion; 1 (default): fusion; 2: fuse the ops by fusion id.
* @li fusion_id: An optional integer identifying the fusion id of the op.
* The HcomAllReduce ops with the same fusion id will be fused.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
* @attention Constraints: \n
* "group" is limited to 128 characters. Use "hccl_world_group"
* as the name of a world group.
* @attention Constraints:
*"group" is limited to 128 characters. Use "hccl_world_group"
as the name of a world group.
*/
REG_OP(HcomAllReduce)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -84,18 +84,19 @@ REG_OP(HcomAllReduce)
/**
* @brief Broadcasts the input tensor in root rank to all ranks.
* @par Inputs:
* x: A list of dynamic input tensor. Must be one of the following types:
* int8, int16, int32, float16, float32.
* x: A list of dynamic input tensors. Must be one of the following types:
int8, int16, int32, float16, float32. It's a dynamic input.
* @par Attributes:
* @li root_rank: A required integer identifying the root rank in the op
* input of this rank will be broadcast to other ranks.
* @li group: A required string identifying the group name of ranks
* participating in the op.
* @li root_rank: A required integer identifying the root rank in the op.
The input of this rank will be broadcast to the other ranks.
* @li group: A required string identifying the group name of ranks
participating in the op.
* @par Outputs:
* y: A list of dynamic output tensors. Has the same type and length as "x".
* @attention Constraints:\n
* "group" is limited to 128 characters. Use "hccl_world_group"
* as the name of a world group.
* It's a dynamic output.
* @attention Constraints:
"group" is limited to 128 characters. Use "hccl_world_group"
as the name of a world group.
*/
REG_OP(HcomBroadcast)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -107,24 +108,24 @@ REG_OP(HcomBroadcast)
.OP_END_FACTORY_REG(HcomBroadcast)

/**
* @brief Performs reduction across all input tensors, scattering in equal
* blocks among ranks, each rank getting a chunk of data based on its rank
* index.
* @brief Performs reduction across all input tensors, scattering in equal
blocks among ranks, each rank getting a chunk of data based on its rank
index.
* @par Inputs:
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
* float32.
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
float32.
* @par Attributes:
* @li reduction: A required string identifying the reduction operation to
* perform. The supported operation are: "sum", "max", "min", "prod".
* @li group: A required string identifying the group name of ranks
* participating in the op.
* @li rank_size: A required integer identifying the number of ranks
* participating in the op.
* @li reduction: A required string identifying the reduction operation to
perform. The supported operations are: "sum", "max", "min", "prod".
* @li group: A required string identifying the group name of ranks
participating in the op.
* @li rank_size: A required integer identifying the number of ranks
participating in the op.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
* @attention Constraints:\n
* "group" is limited to 128 characters. Use "hccl_world_group"
* as the name of a world group.
* @attention Constraints:
"group" is limited to 128 characters. Use "hccl_world_group"
as the name of a world group.
*/
REG_OP(HcomReduceScatter)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -139,19 +140,19 @@ REG_OP(HcomReduceScatter)
/**
* @brief Sends the input tensor to destination rank.
* @par Inputs:
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
* float32.
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
float32.
* @par Attributes:
* @li sr_tag: A required integer identifying the send/recv message tag. The
* message will be received by the HcomReceive op with the same "sr_tag".
* @li sr_tag: A required integer identifying the send/recv message tag. The
message will be received by the HcomReceive op with the same "sr_tag".
* @li dest_rank: A required integer identifying the destination rank.
* @li group: A string identifying the group name of ranks participating in
* the op.
* @li group: A string identifying the group name of ranks participating in
the op.
* @par Outputs:
* None.
* @attention Constraints:\n
* @li "group" is limited to 128 characters. Use
* "hccl_world_group" as the name of a world group.
* @attention Constraints:
@li "group" is limited to 128 characters. Use
"hccl_world_group" as the name of a world group.
* @li Operators HcomSend and HcomReceive have the same "sr_tag".
* @see HcomReceive
*/
@@ -169,20 +170,20 @@ REG_OP(HcomSend)
* @par Inputs:
* None.
* @par Attributes:
* @li sr_tag: A required integer identifying the send/recv message tag. The
* message will be send by the HcomSend op with the same "sr_tag".
* @li sr_tag: A required integer identifying the send/recv message tag. The
message will be send by the HcomSend op with the same "sr_tag".
* @li src_rank: A required integer identifying the source rank.
* @li group: A required string identifying the group name of ranks
* participating in the op.
* @li shape: A required list identifying the shape of the tensor to be
* received.
* @li dtype: A required integer identifying the type of the tensor to be
* received. The supported types are: int8, int16, int32, float16, float32.
* @li shape: A required list identifying the shape of the tensor to be
received.
* @li dtype: A required integer identifying the type of the tensor to be
received. The supported types are: int8, int16, int32, float16, float32.
* @par Outputs:
* y: A tensor with type identified in "dtype".
* @attention Constraints:\n
* @li "group" is limited to 128 characters. Use
* "hccl_world_group" as the name of a world group.
* @attention Constraints:
@li "group" is limited to 128 characters. Use
"hccl_world_group" as the name of a world group.
* @li Operators HcomSend and HcomReceive have the same "sr_tag".
* @li "shape" should be same as the input tensor of HcomSend.
* @li "dtype" should be same as the input tensor of HcomSend.

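As a single-process reading aid for the collective semantics documented in this header (plain C++ helpers for illustration, no HCCL calls): each inner vector below stands for one rank's tensor, and the "sum" reduction is used as the example.

#include <cassert>
#include <cstddef>
#include <vector>

// AllGather: every rank ends up with the concatenation of all ranks' tensors.
std::vector<float> AllGather(const std::vector<std::vector<float>> &per_rank) {
  std::vector<float> gathered;
  for (const auto &chunk : per_rank) gathered.insert(gathered.end(), chunk.begin(), chunk.end());
  return gathered;
}

// AllReduce ("sum"): element-wise reduction across ranks, replicated to every rank.
std::vector<float> AllReduceSum(const std::vector<std::vector<float>> &per_rank) {
  assert(!per_rank.empty());
  std::vector<float> reduced(per_rank.front().size(), 0.0f);
  for (const auto &tensor : per_rank)
    for (std::size_t i = 0; i < tensor.size(); ++i) reduced[i] += tensor[i];
  return reduced;
}

// ReduceScatter ("sum"): reduce as above, then rank "rank" keeps only its equal block.
std::vector<float> ReduceScatterSum(const std::vector<std::vector<float>> &per_rank, std::size_t rank) {
  const std::vector<float> reduced = AllReduceSum(per_rank);
  const std::size_t block = reduced.size() / per_rank.size();
  return std::vector<float>(reduced.begin() + rank * block, reduced.begin() + (rank + 1) * block);
}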

+ 13
- 13
third_party/fwkacllib/inc/ops/hvd_ops.h View File

@@ -28,10 +28,10 @@ namespace ge {
* @brief Outputs a tensor gathering all input tensors.
* @par Inputs:
* x: A tensor. Must be one of the following types: uint8, int8, uint16, int16, int32,
* int64, float16, bool.
int64, float16, bool.
* @par Attributes:
* @li rank_size: A required integer identifying the number of ranks
* participating in the op.
* @li rank_size: A required integer identifying the number of ranks
participating in the op.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
*/
@@ -44,13 +44,13 @@ REG_OP(HorovodAllgather)
.OP_END_FACTORY_REG(HorovodAllgather)

/**
* @brief Outputs a tensor containing the reduction across all input tensors
* passed to op.
* @brief Outputs a tensor containing the reduction across all input tensors
passed to op.
* @par Inputs:
* x: A tensor. Must be one of the following types: int32, int64, float16, float32
* @par Attributes:
* @li reduce_op: A required int identifying the reduction operation to
* perform.The supported operation are: "sum", "max", "min", "prod".
* x: A tensor. Must be one of the following types: int32, int64, float16, float32.
* @par Attributes:
* @li reduce_op: A required int identifying the reduction operation to
perform. The supported operations are: "sum", "max", "min", "prod".
* @par Outputs:
* y: A Tensor. Has the same type as "x".
*/
@@ -63,11 +63,11 @@ REG_OP(HorovodAllreduce)
/**
* @brief Broadcasts the input tensor in root rank to all ranks.
* @par Inputs:
* x: A list of dynamic input tensor. Must be one of the following types:
* int8, int32, float16, float32.
* x: A list of dynamic input tensors. Must be one of the following types:
int8, int32, float16, float32.
* @par Attributes:
* @li root_rank: A required integer identifying the root rank in the op
* input of this rank will be broadcast to other ranks.
* @li root_rank: A required integer identifying the root rank in the op.
The input of this rank will be broadcast to the other ranks.
* @par Outputs:
* y: A list of dynamic output tensors. Has the same type and length as "x".
*/


+ 396
- 377
third_party/fwkacllib/inc/ops/image_ops.h
File diff suppressed because it is too large
View File


Some files were not shown because too many files changed in this diff
