Update GraphEngine to synchronize with latest Ascend driver software suite

commit 3f3c41fd04 · yanghaoran · 13 Apr 2020 · tags/v0.2.0-alpha

61 changed files with 1075 additions and 629 deletions
1. +1 -5 inc/common/opskernel/ops_kernel_info_types.h
2. +1 -1 inc/external/ge/ge_api_types.h
3. +0 -2 inc/framework/common/debug/ge_log.h
4. +0 -1 inc/framework/common/ge_inner_error_codes.h
5. +0 -1 inc/framework/common/helper/om_file_helper.h
6. +0 -1 inc/graph/debug/ge_attr_define.h
7. +0 -3 inc/graph/model.h
8. +1 -1 src/common/graph/ge_attr_define.cc
9. +4 -6 src/common/graph/ge_attr_value.cc
10. +2 -2 src/common/graph/model_serialize.cc
11. +0 -1 src/common/graph/op_desc.cc
12. +1 -1 src/common/graph/operator.cc
13. +0 -2 src/common/graph/opsproto/opsproto_manager.cc
14. +5 -7 src/common/graph/utils/op_desc_utils.cc
15. +12 -12 src/common/graph/utils/tensor_utils.cc
16. +1 -1 src/ge/common/op/attr_define.cc
17. +28 -3 src/ge/graph/build/memory/block_mem_assigner.cc
18. +6 -6 src/ge/graph/build/memory/graph_mem_assigner.cc
19. +36 -8 src/ge/graph/build/task_generator.cc
20. +2 -0 src/ge/graph/build/task_generator.h
21. +12 -1 src/ge/graph/load/graph_loader.cc
22. +2 -0 src/ge/graph/load/graph_loader.h
23. +70 -6 src/ge/graph/load/new_model_manager/model_manager.cc
24. +8 -0 src/ge/graph/load/new_model_manager/model_manager.h
25. +7 -3 src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc
26. +2 -5 src/ge/graph/load/output/output.cc
27. +56 -16 src/ge/graph/manager/graph_manager.cc
28. +2 -4 src/ge/graph/optimize/graph_optimize.h
29. +3 -3 src/ge/graph/passes/assert_pass.h
30. +2 -2 src/ge/graph/passes/atomic_addr_clean_pass.cc
31. +3 -0 src/ge/graph/passes/compile_nodes_pass.cc
32. +5 -7 src/ge/graph/passes/net_output_pass.cc
33. +4 -4 src/ge/graph/passes/variable_op_pass.cc
34. +3 -3 src/ge/graph/passes/variable_prepare_op_pass.cc
35. +0 -2 src/ge/graph/preprocess/graph_preprocess.h
36. +4 -4 src/ge/model/ge_model.h
37. +1 -2 src/ge/session/inner_session.h
38. +0 -3 tests/ut/common/graph/CMakeLists.txt
39. +0 -3 tests/ut/ge/CMakeLists.txt
40. +4 -3 third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
41. +0 -10 third_party/fwkacllib/inc/hccl/base.h
42. +2 -2 third_party/fwkacllib/inc/mmpa/mmpa_api.h
43. +1 -2 third_party/fwkacllib/inc/ops/all_ops.h
44. +0 -28 third_party/fwkacllib/inc/ops/atomic_addr_clean_ops.h
45. +7 -45 third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
46. +77 -0 third_party/fwkacllib/inc/ops/math_ops.h
47. +0 -108 third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
48. +26 -0 third_party/fwkacllib/inc/ops/nn_calculation_ops.h
49. +100 -0 third_party/fwkacllib/inc/ops/nn_detect_ops.h
50. +250 -0 third_party/fwkacllib/inc/ops/nn_norm_ops.h
51. +0 -268 third_party/fwkacllib/inc/ops/nn_other_ops.h
52. +238 -0 third_party/fwkacllib/inc/ops/nn_training_ops.h
53. +24 -0 third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h
54. +31 -0 third_party/fwkacllib/inc/ops/reduce_ops.h
55. +3 -3 third_party/fwkacllib/inc/ops/rnn.h
56. +26 -0 third_party/fwkacllib/inc/ops/rpn_ops.h
57. +0 -22 third_party/fwkacllib/inc/runtime/event.h
58. +0 -2 third_party/fwkacllib/inc/runtime/kernel.h
59. +0 -2 third_party/fwkacllib/inc/runtime/mem.h
60. +1 -2 third_party/fwkacllib/inc/runtime/rt_model.h
61. +1 -0 third_party/fwkacllib/inc/tdt/data_common.h

inc/common/opskernel/ops_kernel_info_types.h (+1 -5)

@@ -26,7 +26,6 @@
using std::string;

namespace ge {
- /*lint -e148*/
struct RunContext {
rtModel_t model;
rtStream_t stream;
@@ -40,8 +39,6 @@ struct RunContext {
std::vector<rtEvent_t> graphEventList; // all events of graph, order by ge event id(0,1,...)
};

- /*lint +e148*/
-
struct Task {
uint32_t id;
uint16_t type;
@@ -50,8 +47,7 @@ struct Task {
};

struct OpInfo {
- string engine; // which engin
- /*lint -e148*/
+ string engine; // which engin
string opKernelLib; // which opsKernelStore
int computeCost; // compute cost
bool flagPartial; // whether to support is related to shape


inc/external/ge/ge_api_types.h (+1 -1)

@@ -98,7 +98,7 @@ const std::string OUTPUT_NODE_NAME = "ge.outputNodeName";
// its value should be "0" or "1", default value is "0"
const std::string COMPRESS_FLAG = "ge.compressFlag";

- const std::string ATUO_PRECISION_FLAG = "ge.exec.auto_mix_precision";
+ const std::string PRECISION_MODE = "ge.exec.precision_mode";

// Configure single op flag for FE
// its value should be "0" or "1", default value is "0"


inc/framework/common/debug/ge_log.h (+0 -2)

@@ -44,8 +44,6 @@ inline bool IsLogEnable(int module_name, int log_level) noexcept {
return false;
}

- /*lint --emacro((773),GE_TIMESTAMP_START)*/
- /*lint -esym(773,GE_TIMESTAMP_START)*/
#define GE_TIMESTAMP_START(stage) uint64_t startUsec_##stage = ge::GetCurrentTimestap()

#define GE_TIMESTAMP_END(stage, stage_name) \


inc/framework/common/ge_inner_error_codes.h (+0 -1)

@@ -14,7 +14,6 @@
* limitations under the License.
*/

- /*lint -e* */
#ifndef INC_FRAMEWORK_COMMON_GE_INNER_ERROR_CODES_H_
#define INC_FRAMEWORK_COMMON_GE_INNER_ERROR_CODES_H_



inc/framework/common/helper/om_file_helper.h (+0 -1)

@@ -88,5 +88,4 @@ class OmFileSaveHelper {
OmFileContext context_;
};
} // namespace ge
- /*lint +e148*/
#endif // INC_FRAMEWORK_COMMON_HELPER_OM_FILE_HELPER_H_

inc/graph/debug/ge_attr_define.h (+0 -1)

@@ -774,4 +774,3 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DYNAMIC_
} // namespace ge

#endif // INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_
- /*lint +e618*/

inc/graph/model.h (+0 -3)

@@ -31,8 +31,6 @@ using std::map;
using std::string;
using std::vector;

- /*lint -e148*/
-
class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Model : public AttrHolder {
public:
Model();
@@ -91,7 +89,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Model : public AttrHolder {
std::string platform_version_{""};
Graph graph_;
};
- /*lint +e148*/
} // namespace ge
using ModelPtr = std::shared_ptr<ge::Model>;



src/common/graph/ge_attr_define.cc (+1 -1)

@@ -124,7 +124,7 @@ const std::string ATTR_NAME_BROACAST_REAL_DIM_CNT = "broacast_real_dim_cnt";
const std::string ATTR_NAME_DIM_ALIGN = "dim_align";
const std::string ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE = "original_type";

- const std::string ATTR_NAME_SESSION_GRAPH_ID = "session_graph_id";
+ const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";

const std::string ATTR_NAME_AUTOMIC_ADD_START = "automic_add_addr_start";
const std::string ATTR_NAME_AUTOMIC_ADD_MEM_SIZE = "automic_add_mem_size";


src/common/graph/ge_attr_value.cc (+4 -6)

@@ -34,7 +34,7 @@ namespace ge {
GeAttrValue::NamedAttrs::NamedAttrs() { named_attrs_.InitDefault(); }

GeAttrValue::NamedAttrs::NamedAttrs(const ProtoMsgOwner &owner, proto::NamedAttrs *proto_msg)
- : named_attrs_(owner, proto_msg) {} // lint !e1744
+ : named_attrs_(owner, proto_msg) {}

void GeAttrValue::NamedAttrs::SetName(const std::string &name) {
auto proto_msg = named_attrs_.GetProtoMsg();
@@ -239,7 +239,7 @@ ATTR_VALUE_SET_GET_IMP(GeAttrValue::STR)
ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::STR>)
ATTR_VALUE_SET_GET_IMP(GeAttrValue::INT)
ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::INT>)
- ATTR_VALUE_SET_GET_IMP(GeAttrValue::FLOAT) // lint !e524
+ ATTR_VALUE_SET_GET_IMP(GeAttrValue::FLOAT)
ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::FLOAT>)
ATTR_VALUE_SET_GET_IMP(GeAttrValue::BOOL)
ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::BOOL>)
@@ -253,11 +253,9 @@ ATTR_VALUE_SET_GET_IMP(GeAttrValue::BYTES)
ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::BYTES>)
ATTR_VALUE_SET_GET_IMP(GeAttrValue::NAMED_ATTRS)
ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::NAMED_ATTRS>)
- /*lint -e665*/
ATTR_VALUE_SET_GET_IMP(vector<vector<int64_t>>)
- /*lint +e665*/
- ATTR_VALUE_SET_GET_IMP(vector<DataType>) // lint !e665
- ATTR_VALUE_SET_GET_IMP(GeAttrValue::DATA_TYPE) // lint !e665
+ ATTR_VALUE_SET_GET_IMP(vector<DataType>)
+ ATTR_VALUE_SET_GET_IMP(GeAttrValue::DATA_TYPE)

#undef ATTR_VALUE_SET_GET_IMP



src/common/graph/model_serialize.cc (+2 -2)

@@ -265,13 +265,13 @@ bool ModelSerializeImp::HandleNodeNameRef() {
item.dst_node_name.c_str(), item.dst_in_index);
return false;
}
- GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed."); // lint !e737
+ GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed.");
} else {
// Control edge
auto src_anchor = src_node_it->second->GetOutControlAnchor();
auto dst_anchor = item.dst_node->GetInControlAnchor();
if (src_anchor != nullptr && dst_anchor != nullptr) {
- GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed."); // lint !e737
+ GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed.");
}
}
}


src/common/graph/op_desc.cc (+0 -1)

@@ -32,7 +32,6 @@ using std::shared_ptr;
using std::string;
using std::vector;

- /*lint -save -e521 -e681 -e732 -e737*/
namespace ge {
const std::string ATTR_NAME_ID = "id";



src/common/graph/operator.cc (+1 -1)

@@ -421,7 +421,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator OpDescUtils::CreateOpera
return Operator("default");
}
OperatorKeeper::GetInstance().CheckInOperator(operator_impl_ptr);
- return operator_impl_ptr->ToOperator(); // lint !e514
+ return operator_impl_ptr->ToOperator();
}

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescPtr OpDescUtils::GetOpDescFromOperator(const Operator &oprt) {


src/common/graph/opsproto/opsproto_manager.cc (+0 -2)

@@ -33,9 +33,7 @@ OpsProtoManager *OpsProtoManager::Instance() {
}

bool OpsProtoManager::Initialize(const std::map<std::string, std::string> &options) {
- /*lint -e1561*/
auto proto_iter = options.find("ge.opsProtoLibPath");
- /*lint +e1561*/
if (proto_iter == options.end()) {
GELOGW("ge.opsProtoLibPath option not set, return.");
return false;


src/common/graph/utils/op_desc_utils.cc (+5 -7)

@@ -30,7 +30,6 @@

using std::vector;

- /*lint -e512 -e737 -e752*/
namespace ge {
const char OP_DESC_QUANT_PARAMS[] = "quantize_factor";
static const int CONST_OP_NORMAL_WEIGHT_SIZE = 1;
@@ -135,11 +134,11 @@ graphStatus OpDescUtils::GetQuantizeFactorParams(const OpDesc &op_desc, Quantize
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
OpDescUtils::SetQuantizeFactorParams(const OpDescPtr &op_desc, const QuantizeFactorParams &quant) {
GE_CHK_BOOL_EXEC_INFO(op_desc != nullptr, return GRAPH_FAILED, "op_desc is nullptr");
- return op_desc->SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant)); // lint !e732
+ return op_desc->SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant));
}

graphStatus OpDescUtils::SetQuantizeFactorParams(OpDesc &op_desc, const QuantizeFactorParams &quant) {
- return op_desc.SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant)); // lint !e732
+ return op_desc.SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant));
}

GeTensorPtr OpDescUtils::MutableWeights(OpDesc &op_desc) {
@@ -164,7 +163,7 @@ graphStatus OpDescUtils::SetWeights(OpDesc &op_desc, const GeTensorPtr weight) {
GELOGE(GRAPH_FAILED, "weight is null");
return GRAPH_FAILED;
}
- return AttrUtils::SetTensor(&op_desc, ATTR_NAME_WEIGHTS, weight) ? GRAPH_SUCCESS : GRAPH_FAILED; // lint !e737
+ return AttrUtils::SetTensor(&op_desc, ATTR_NAME_WEIGHTS, weight) ? GRAPH_SUCCESS : GRAPH_FAILED;
}

graphStatus OpDescUtils::SetWeights(OpDescPtr op_desc, const GeTensorPtr weight) {
@@ -230,7 +229,7 @@ size_t OpDescUtils::GetNonConstInputsSize(const ge::Node &node) {
continue;
}
}
- return input_num; // lint !e712
+ return input_num;
} else {
GE_IF_BOOL_EXEC(
node.GetInDataNodes().size() < GetConstInputs(node).size(),
@@ -335,7 +334,7 @@ bool OpDescUtils::IsNonConstInput(const ge::Node &node, const size_t index) {
bool ret = false;
if (index < node.GetAllInDataAnchors().size()) {
if (NodeUtils::IsAnchorStatusSet(node)) {
- ret = (ge::AnchorUtils::GetStatus(node.GetInDataAnchor(static_cast<int>(index))) == ANCHOR_DATA); // lint !e712
+ ret = (ge::AnchorUtils::GetStatus(node.GetInDataAnchor(static_cast<int>(index))) == ANCHOR_DATA);
} else {
for (const auto &anchor : node.GetAllInDataAnchors()) {
if (anchor->GetIdx() != static_cast<int>(index)) {
@@ -574,4 +573,3 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDescUtils::ClearWei
return GRAPH_SUCCESS;
}
} // namespace ge
- /*lint +e512 +e737 +e752*/

src/common/graph/utils/tensor_utils.cc (+12 -12)

@@ -286,10 +286,10 @@ static graphStatus CalcTensorElementCnt(const std::vector<int64_t> &dims, Format

const string type_str = TypeUtils::DataTypeToSerialString(data_type);
if (graph_status == GRAPH_SUCCESS) {
- GELOGI(
- "CalcTensorElementCnt end, format=%d(%s),"
- " data_type=%d(%s), element_cnt=%ld.",
- format, format_str.c_str(), data_type, type_str.c_str(), element_cnt);
+ GELOGD(
+ "CalcTensorElementCnt end, format=%d(%s),"
+ " data_type=%d(%s), element_cnt=%ld.",
+ format, format_str.c_str(), data_type, type_str.c_str(), element_cnt);
} else {
GELOGE(GRAPH_FAILED, "CalcTensorElementCnt failed, format=%d(%s), data_type=%d(%s).", format, format_str.c_str(),
data_type, type_str.c_str());
@@ -329,10 +329,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus TensorUtils::CalcTens
// Support unknown shape
if (element_cnt < 0) {
mem_size = kMemSizeUnknownShape;
- GELOGI(
- "element_cnt is unknown. "
- "format=%d(%s), data_type=%d(%s), mem_size=%ld",
- format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
+ GELOGD(
+ "element_cnt is unknown. "
+ "format=%d(%s), data_type=%d(%s), mem_size=%ld",
+ format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
return GRAPH_SUCCESS;
}
auto type_size_int64 = static_cast<int64_t>(type_size);
@@ -343,10 +343,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus TensorUtils::CalcTens
}
mem_size = element_cnt * type_size_int64;

- GELOGI(
- "CalcTensorMemSize end, "
- "format=%d(%s), data_type=%d(%s), mem_size=%ld",
- format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
+ GELOGD(
- "CalcTensorMemSize end, "
- "format=%d(%s), data_type=%d(%s), mem_size=%ld",
- format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
return GRAPH_SUCCESS;
}



src/ge/common/op/attr_define.cc (+1 -1)

@@ -108,7 +108,7 @@ const std::string ATTR_NAME_NAN_OPT = "nan_opt";
const std::string ATTR_NAME_AIPP = "aipp";
const std::string NEW_AIPP_CONV_OP = "new_conv_op_for_aipp";

- const std::string ATTR_NAME_SESSION_GRAPH_ID = "session_graph_id";
+ const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";

const std::string ATTR_NAME_MULTISHAPE_BATCHLIST = "multi_shape_batchlist";
const std::string ATTR_NAME_MULTISHAPE_BATCHLIST_SIZE = "multi_shape_batchlist_size";


src/ge/graph/build/memory/block_mem_assigner.cc (+28 -3)

@@ -402,6 +402,31 @@ bool IsOutputBlock(const ge::InDataAnchorPtr &in_data_anchor) {
return false;
}

// current node's output uses previous node's output memory
bool IsReferencePreviousNodeOutputMemory(const ge::NodePtr &node, uint32_t output_index) {
// Get the reference type of the node, default is false
bool is_ref = false;
// If GetBool fail, is_ref is false.
auto op_desc = node->GetOpDesc();
if (op_desc == nullptr) {
return false;
}
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_REFERENCE, is_ref);
if (!is_ref) {
return false;
}
const string &output_name = op_desc->GetOutputNameByIndex(output_index);
for (const auto &input_name : op_desc->GetAllInputNames()) {
if (!input_name.empty() && output_name == input_name) {
int input_index = op_desc->GetInputIndexByName(input_name);
GELOGI("Reference memory:name[%s] output[%s][%u] ref to input[%s][%d] ", op_desc->GetName().c_str(),
output_name.c_str(), output_index, input_name.c_str(), input_index);
return true;
}
}
return false;
}

void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory) {
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null.");
GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory");
@@ -489,7 +514,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
if (output_op_desc != nullptr) {
GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS, GELOGI("Get size failed"));
}
- if ((size == 0) || CheckIsZeroMemNodeType(n->GetType())) {
+ if ((size == 0) || CheckIsZeroMemNodeType(n->GetType()) || IsReferencePreviousNodeOutputMemory(n, i)) {
zero_memory_list_.emplace_back(n, kOutput, i);
continue;
}
@@ -607,11 +632,11 @@ void BlockMemAssigner::MergeDynamicBatchBlocks() {
std::sort(it->second.begin(), it->second.end(), CompareBlockMaxSize);
}
if (it_max != dynamic_batch_blocks.end()) {
GELOGI("MergeDynamicBatch %s block counts %zu", it_max->first.c_str(), it_max->second.size());
GELOGD("MergeDynamicBatch %s block counts %zu", it_max->first.c_str(), it_max->second.size());
}
for (it = dynamic_batch_blocks.begin(); it != dynamic_batch_blocks.end(); ++it) {
if (it != it_max) {
GELOGI("MergeDynamicBatch from %s to %s", it->first.c_str(), it_max->first.c_str());
GELOGD("MergeDynamicBatch from %s to %s", it->first.c_str(), it_max->first.c_str());
MergeBlocks(it_max->second, it->second);
}
}


src/ge/graph/build/memory/graph_mem_assigner.cc (+6 -6)

@@ -296,7 +296,7 @@ Status GraphMemoryAssigner::ReAssignVirtualConcatMemory() {
}
output_list.at(0) = memory_offset_[0].mem_offset_;
n->GetOpDesc()->SetOutputOffset(output_list);
GELOGI("Set Concat %s output offset to %zu.", n->GetOpDesc()->GetName().c_str(), memory_offset_[0].mem_offset_);
GELOGD("Set Concat %s output offset to %zu.", n->GetOpDesc()->GetName().c_str(), memory_offset_[0].mem_offset_);

size_t extra_memory_size = 0;
for (const auto &in_data_anchor : n->GetAllInDataAnchors()) {
@@ -401,7 +401,7 @@ Status GraphMemoryAssigner::ReAssignMergeMemory() {
data_output_offset = output_list[index];
max_output_size = tmp_output_size;
}
GELOGI("merge=%s, input=%s, size=%ld, offset=%ld, max_size=%ld", n->GetName().c_str(),
GELOGD("merge=%s, input=%s, size=%ld, offset=%ld, max_size=%ld", n->GetName().c_str(),
src_node->GetName().c_str(), tmp_output_size, data_output_offset, max_output_size);
}

@@ -541,7 +541,7 @@ Status GraphMemoryAssigner::AssignReferenceMemory(const ge::NodePtr &node) {
GE_CHECK_NOTNULL(peer_out_op_desc);
output_list[out_data_anchor->GetIdx()] = peer_out_op_desc->GetOutputOffset()[peer_out_anchor_index];
} else {
GELOGI("Reference output : origin %s name[%s] output[%d] offset is [%ld] stream_id[%ld]",
GELOGD("Reference output : origin %s name[%s] output[%d] offset is [%ld] stream_id[%ld]",
node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId());
}
@@ -576,7 +576,7 @@ bool GraphMemoryAssigner::CheckInputIsSupportAtomic(const ge::NodePtr &node) {
Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node) {
auto op_desc = node->GetOpDesc();
GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(ge::FAILED, "op_desc is null."); return ge::FAILED);
GELOGI("Begin to assign atomic output memory, node = %s.", op_desc->GetName().c_str());
GELOGD("Begin to assign atomic output memory, node = %s.", op_desc->GetName().c_str());

vector<int64_t> atomic_output_index;
// If GetListInt fail, atomic_output_index is empty.
@@ -620,7 +620,7 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node) {

// If you have already assigned an atomic address, skip it, and you don't need to reassign it.
if (is_assigned_mem) {
- GELOGI(
+ GELOGD(
"[IMAS]Atomic output : we have assigned atomic memory as the input of next node in "
"ReAssignContinuousMemory function.");
continue;
@@ -822,7 +822,7 @@ Status GraphMemoryAssigner::SetLoopGraphAtomicAttr(const ge::NodePtr &node, int6
continue;
}

GELOGI("SetLoopGraphAtomicAttr, node is %s, op type is %s.", peer_out_node_desc->GetName().c_str(),
GELOGD("SetLoopGraphAtomicAttr, node is %s, op type is %s.", peer_out_node_desc->GetName().c_str(),
peer_out_node_desc->GetType().c_str());

if (peer_out_node_desc->GetType() == ATOMICADDRCLEAN) {


src/ge/graph/build/task_generator.cc (+36 -8)

@@ -398,23 +398,26 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi
if (op_kernel_lib_name.empty()) {
continue;
}
- if (op_desc->GetName() == bp_point_str) {
- last_bp = current_idx;
- GELOGI("Last bp name %s, idx %u", op_desc->GetName().c_str(), last_bp);
- }

if (op_desc->GetType() == NETOUTPUT) {
iter_end = current_idx;
GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end);
}
- if (op_desc->GetName() == fp_point_str) {
- first_fp = current_idx;
- GELOGI("First fp name %s, idx %u", op_desc->GetName().c_str(), first_fp);
- }

if (op_desc->GetType() == HCOMALLREDUCE) {
ar_ppoint.emplace_back(current_idx);
GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
}

+ if (first_fp == 0 && IsProfPoint(op_desc, fp_point_str)) {
+ first_fp = current_idx;
+ GELOGI("First fp name %s, idx %u", op_desc->GetName().c_str(), first_fp);
+ }

+ if (IsProfPoint(op_desc, bp_point_str)) {
+ last_bp = current_idx;
+ GELOGI("Last bp name %s, idx %u", op_desc->GetName().c_str(), last_bp);
+ }
}
ppoint.fp_index = first_fp;
ppoint.bp_index = last_bp;
@@ -526,4 +529,29 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
}
return SUCCESS;
}

bool TaskGenerator::IsProfPoint(const OpDescPtr &op, const std::string &name) {
if (op == nullptr) {
return false;
}

if (op->GetName() == name) {
return true;
}

std::vector<std::string> original_op_names;
bool ret = AttrUtils::GetListStr(op, ge::ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_op_names);
if (!ret) {
return false;
}

for (auto &origin_name : original_op_names) {
if (origin_name == name) {
return true;
}
}

return false;
}

} // namespace ge

src/ge/graph/build/task_generator.h (+2 -0)

@@ -99,6 +99,8 @@ class TaskGenerator {
std::vector<uint32_t> &ar_ppoint, uint32_t node_index,
std::vector<domi::TaskDef> &task_def_list);

static bool IsProfPoint(const OpDescPtr &op, const std::string &name);

uint8_t *var_mem_base_ = nullptr;
uint64_t var_mem_size_ = 0;
};


src/ge/graph/load/graph_loader.cc (+12 -1)

@@ -336,7 +336,7 @@ Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model
auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
Status ret =
- model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize);
+ model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize);
if (ret != SUCCESS) {
GELOGE(ret, "Load model failed, model_id:%u.", model_id);
return ret;
@@ -428,4 +428,15 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) {
GELOGI("GetMemoryInfo free[%zu], total[%zu], return free[%ld]", free_mem, total_mem, free);
return SUCCESS;
}

Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) {
auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->DestroyAicpuKernel(session_id, model_id);
if (ret != SUCCESS) {
GELOGE(ret, "Destroy aicpu kernel failed.");
return ret;
}
return SUCCESS;
}
} // namespace ge

src/ge/graph/load/graph_loader.h (+2 -0)

@@ -73,6 +73,8 @@ class GraphLoader {
static Status ExecuteModel(uint32_t model_id, rtStream_t stream, bool async_mode, const InputData &input_data,
OutputData &output_data);

static Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id);

private:
static Status LoadModelOnline(uint32_t &model_id, std::shared_ptr<ge::Model> &model,
const std::shared_ptr<ModelListener> &listener);


src/ge/graph/load/new_model_manager/model_manager.cc (+70 -6)

@@ -18,7 +18,6 @@

#include <string>

#include "cce/aicpu_engine_struct.h"
#include "common/l2_cache_optimize.h"
#include "common/profiling/profiling_manager.h"
#include "common/properties_manager.h"
@@ -41,17 +40,43 @@ std::shared_ptr<ModelManager> ModelManager::GetInstance() {

ModelManager::ModelManager() { max_model_id_ = 0; }

- static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t session_id) {
+ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id) {
STR_FWK_OP_KERNEL param_base = {};
void *devicebase = nullptr;
+ void *aicpu_kernel_addr = nullptr;
const uint32_t kKernelType = 0;
param_base.fwkKernelType = kKernelType;
- param_base.fwkKernelBase.fwk_kernel.opType = opType;
+ param_base.fwkKernelBase.fwk_kernel.opType = op_type;
param_base.fwkKernelBase.fwk_kernel.sessionID = session_id;
+ if (op_type == aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY) {
+ std::vector<uint64_t> v_aicpu_kernel;
+ std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
+ auto iter = model_aicpu_kernel_.find(model_key);
+ if (iter != model_aicpu_kernel_.end()) {
+ GELOGD("kernel destroy session_id %lu, model_id %u.", session_id, model_id);
+ v_aicpu_kernel = model_aicpu_kernel_.at(model_key);
+ // Insert size of aicpu kernel vector in the first element
+ v_aicpu_kernel.insert(v_aicpu_kernel.begin(), v_aicpu_kernel.size());
+
+ auto kernel_size = sizeof(uint64_t) * (v_aicpu_kernel.size());
+ rtError_t rt_ret = rtMalloc(&aicpu_kernel_addr, kernel_size, RT_MEMORY_HBM);
+ GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
+ return RT_FAILED;)
+
+ rt_ret = rtMemcpy(aicpu_kernel_addr, kernel_size, v_aicpu_kernel.data(), kernel_size, RT_MEMCPY_HOST_TO_DEVICE);
+ GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret);
+ GE_CHK_RT(rtFree(aicpu_kernel_addr)); return FAILED;)
+ uint64_t kernel_id_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(aicpu_kernel_addr));
+ param_base.fwkKernelBase.fwk_kernel.kernelID = kernel_id_addr;
+ // Remove model key from map
+ model_aicpu_kernel_.erase(iter);
+ }
+ }

rtError_t rt_ret = rtMalloc(&(devicebase), sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "malloc device memory failed.");
+ GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
return FAILED;
}

@@ -59,6 +84,7 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
rtMemcpy(devicebase, sizeof(STR_FWK_OP_KERNEL), &param_base, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "memory copy to device failed.");
+ GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
GE_CHK_RT(rtFree(devicebase));
return FAILED;
}
@@ -67,6 +93,7 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
rt_ret = rtStreamCreate(&stream, 0);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "create stream failed.");
+ GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
GE_CHK_RT(rtFree(devicebase));
return FAILED;
}
@@ -74,6 +101,7 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
rt_ret = rtKernelLaunchEx(devicebase, sizeof(STR_FWK_OP_KERNEL), 0, stream);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "rtKernelLaunchEx failed.");
+ GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
GE_CHK_RT(rtFree(devicebase));
GE_CHK_RT(rtStreamDestroy(stream));
return FAILED;
@@ -81,11 +109,20 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
rt_ret = rtStreamSynchronize(stream);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "rtStreamSynchronize failed.");
+ GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
GE_CHK_RT(rtFree(devicebase));
GE_CHK_RT(rtStreamDestroy(stream));
return FAILED;
}

+ if (aicpu_kernel_addr != nullptr) {
+ rt_ret = rtFree(aicpu_kernel_addr);
+ if (rt_ret != RT_ERROR_NONE) {
+ GELOGE(rt_ret, "free memory failed.");
+ GE_CHK_RT(rtFree(devicebase));
+ GE_CHK_RT(rtStreamDestroy(stream));
+ return FAILED;
+ }
+ }
rt_ret = rtFree(devicebase);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "free memory failed.");
@@ -107,7 +144,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
GELOGI("The session: %lu not created.", session_id);
return;
} else {
- Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id);
+ Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0);
if (ret != SUCCESS) {
GELOGW("The session: %lu destroy failed.", session_id);
} else {
@@ -117,9 +154,36 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
}
}

ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) {
GELOGD("destroy aicpu kernel in session_id %lu, model_id %u.", session_id, model_id);
std::lock_guard<std::mutex> lock(sess_ids_mutex_);
std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) {
Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id);
if (ret != SUCCESS) {
GELOGE(FAILED, "Destroy aicpu kernel failed.");
return FAILED;
}
}
return SUCCESS;
}

ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id) {
std::vector<uint64_t> v_aicpu_kernel;
std::lock_guard<std::mutex> lock(sess_ids_mutex_);
std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) {
v_aicpu_kernel = model_aicpu_kernel_.at(model_key);
}
v_aicpu_kernel.push_back(kernel_id);
model_aicpu_kernel_[model_key] = v_aicpu_kernel;
return SUCCESS;
}

ModelManager::~ModelManager() {
std::lock_guard<std::mutex> lock(map_mutex_);
model_map_.clear();
model_aicpu_kernel_.clear();

GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0)));
}
@@ -687,7 +751,7 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) {
auto it = sess_ids_.find(session_id);
// never been created by any model
if (it == sess_ids_.end()) {
- Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_CREATE, session_id);
+ Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_CREATE, session_id, 0);
if (ret == SUCCESS) {
(void)sess_ids_.insert(session_id);
GELOGI("The session: %lu create success.", session_id);


src/ge/graph/load/new_model_manager/model_manager.h (+8 -0)

@@ -24,6 +24,7 @@
#include <memory>
#include <set>
#include <vector>
#include "cce/aicpu_engine_struct.h"
#include "common/types.h"
#include "common/ge_types.h"
#include "common/ge_inner_error_codes.h"
@@ -199,12 +200,18 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
///
std::shared_ptr<DavinciModel> GetModel(uint32_t id);

ge::Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id);

ge::Status CreateAicpuSession(uint64_t session_id);

static ge::Status GetModelMemAndWeightSize(const ModelData &model, size_t &mem_size, size_t &weight_size);

void DestroyAicpuSession(uint64_t session_id);

ge::Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id);

ge::Status CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id);

private:
///
/// @ingroup domi_ome
@@ -233,6 +240,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
void GenModelId(uint32_t *id);

std::map<uint32_t, std::shared_ptr<DavinciModel>> model_map_;
std::map<std::string, std::vector<uint64_t>> model_aicpu_kernel_;
std::vector<uint32_t> free_model_id_;
uint32_t max_model_id_;
std::mutex map_mutex_;


src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc (+7 -3)

@@ -120,9 +120,13 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
GELOGI("session_id: %lu", session_id);
GE_CHECK_NOTNULL(ModelManager::GetInstance());
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS,
- GELOGE(ret, "CreateAicpuSession error.");
- return ret;)
-
+ GELOGE(FAILED, "CreateAicpuSession error.");
+ return FAILED;)
+ // 4.1 Collect aicpu kernel
+ uint64_t kernel_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID;
+ GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(), kernel_id) != SUCCESS,
+ GELOGE(FAILED, "CreateAicpuKernel error.");
+ return FAILED;)
// 5. Return result
rtError_t rt_ret = rtMalloc(&kernel_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc error: 0x%X", rt_ret); return FAILED;)


src/ge/graph/load/output/output.cc (+2 -5)

@@ -84,8 +84,6 @@ Status Output::Init() {
}
}

GELOGI("Init output:%lu, %lu, %lu", input_num_, v_input_size_.size(), v_input_data_addr_.size());

return SUCCESS;
}

@@ -125,11 +123,10 @@ Status Output::SetDataBuf(DataBuffer &data_buf, uint32_t &data_count, size_t i,
}

if (data_buf.isDataSupportMemShare && support_mem_share) {
GELOGI("No need to copy input data, user's output data buffer can be shared.");
GELOGD("No need to copy input data, user's output data buffer can be shared.");
} else {
// Copy result to Databuf
uint32_t size = v_input_size_[i];
GELOGI("Tensor data size before: %u", size);

graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(*tensor_desc, size);
if (graph_status != ge::GRAPH_SUCCESS) {
@@ -142,7 +139,7 @@ Status Output::SetDataBuf(DataBuffer &data_buf, uint32_t &data_count, size_t i,
GELOGE(rt_ret, "rtmemcpy error");
return FAILED;
}
GELOGI("Tensor data size: %u data_buflength: %u", size, data_buf.length);
GELOGD("Tensor data size: %u data_buflength: %u", size, data_buf.length);
}

++data_count;


src/ge/graph/manager/graph_manager.cc (+56 -16)

@@ -34,6 +34,7 @@
#include "framework/common/ge_inner_error_codes.h"
#include "framework/common/ge_types.h"
#include "graph/common/transop_util.h"
#include "graph/debug/ge_attr_define.h"
#include "graph/ge_context.h"
#include "graph/ge_global_options.h"
#include "graph/ge_local_context.h"
@@ -1078,28 +1079,60 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const std::vector
GELOGI("[GraphManager] CheckpointHandle, outputsSize=%zu.", outputs.size());
std::vector<InputOutputDescInfo> outputs_desc = graph_executor_.GetOutputsDesc();
GELOGI("[GraphManager] CheckpointHandle, outputsDescSize=%zu.", outputs_desc.size());
+ // find graph
+ GraphNodePtr graph_node = nullptr;
+ Status ret = GetGraphNode(graph_id, graph_node);
+ if (ret != SUCCESS) {
+ GELOGE(ret, "[CheckpointHandle] graph not exist, graph_id = %u.", graph_id);
+ return ret;
+ }
+ ComputeGraphPtr compute_graph_ptr = GraphUtils::GetComputeGraph(*(graph_node->GetGraph()));
std::map<string, Tensor> save_results;
- for (size_t i = 0; i < outputs_desc.size(); ++i) {
- std::string desc_name = outputs_desc.at(i).name;
- auto index = desc_name.find_last_of("_");
- if (index != std::string::npos) {
- desc_name = desc_name.substr(0, index);
- index = desc_name.find_first_of("_");
- if (index != std::string::npos) {
- desc_name = desc_name.substr(index + 1);
- index = desc_name.find_first_of("_");
- if (index != std::string::npos) {
- desc_name = desc_name.substr(index + 1);
+ NodePtr netoutput = nullptr;
+ for (const auto &node : compute_graph_ptr->GetDirectNode()) {
+ if (node->GetType() == kNetOutput) {
+ netoutput = node;
+ break;
+ }
+ }
+ if (netoutput == nullptr) {
+ GELOGE(FAILED, "Netoutput is null.");
+ return FAILED;
+ }
+ for (const auto &in : netoutput->GetAllInDataAnchors()) {
+ std::string desc_name;
+ auto out_anchor = in->GetPeerOutAnchor();
+ if (out_anchor == nullptr) {
+ GELOGE(FAILED, "out_anchor is null.");
+ return FAILED;
+ }
+ ge::NodePtr peer_node = out_anchor->GetOwnerNode();
+ // find the variable node in graph
+ while (peer_node != nullptr && peer_node->GetType() != kVariable) {
+ if (peer_node->GetAllInDataAnchors().size() != 1) {
+ GELOGE(FAILED, "More than one prior nodes of peer_node %s in checkpoint Graph.", peer_node->GetName().c_str());
+ return FAILED;
+ }
+ auto peer_node_in = peer_node->GetAllInDataAnchors().at(0);
+ auto peer_node_out_anchor = peer_node_in->GetPeerOutAnchor();
+ if (peer_node_out_anchor != nullptr) {
+ peer_node = peer_node_out_anchor->GetOwnerNode();
+ if (peer_node->GetType() == kVariable) {
+ break;
+ }
+ }
+ }
index = desc_name.find("_trans");
if (index != std::string::npos) {
desc_name = desc_name.substr(0, index);
+ if (peer_node == nullptr) {
+ GELOGE(FAILED, "No variable op found in one branch, checkpoint graph illegal.");
+ return FAILED;
+ }
+
+ desc_name = peer_node->GetName();
GELOGI("[GraphManager] CheckpointHandle, descName=%s.", desc_name.c_str());
- save_results.emplace(desc_name, TensorAdapter::AsTensor(outputs.at(i)));
+ if (in->GetIdx() >= static_cast<int>(outputs.size())) {
+ GELOGE(FAILED, "variable index out of range.");
+ return FAILED;
+ }
+ save_results.emplace(desc_name, TensorAdapter::AsTensor(outputs.at(in->GetIdx())));
}

if (!save_results.empty()) {
@@ -1447,6 +1480,8 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra
int64_t memory_size = ret ? value : 0;
ret = ge::AttrUtils::GetInt(ge_model, ATTR_MODEL_WEIGHT_SIZE, value);
int64_t weight_size = ret ? value : 0;
ret = ge::AttrUtils::GetInt(ge_model, MODEL_ATTR_SESSION_ID, value);
uint64_t session_id = ret ? value : 0;

int64_t free_memory = 0;
Status result = GraphLoader::GetMemoryInfo(free_memory);
@@ -1494,6 +1529,11 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra
if (result != SUCCESS) {
GELOGW("[GraphManager:] unload model failed, modelId=%u, graphId=%u.", model_id, graph_id);
}
result = GraphLoader::DestroyAicpuKernel(session_id, model_id);
if (result != SUCCESS) {
GELOGW("[GraphManager:] destroy aicpu kernel failed when dynamic memory, modelId=%u, graphId=%u.", model_id,
graph_id);
}
rt_ret = rtDeviceReset(GetContext().DeviceId());
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, modelId=%u, graphId=%u.", model_id, graph_id);


src/ge/graph/optimize/graph_optimize.h (+2 -4)

@@ -33,7 +33,6 @@
#include "graph/manager/graph_manager_utils.h"
#include "omg/omg_inner_types.h"

- /*lint -e148*/
namespace ge {
using ComputeGraphPtr = std::shared_ptr<ge::ComputeGraph>;
using GraphOptimizerPtr = std::shared_ptr<ge::GraphOptimizer>;
@@ -56,7 +55,7 @@ class GraphOptimize {

const std::map<uint32_t, std::map<string, size_t>> &GetSummaryOutputIndexes() const {
return summary_output_indexes_;
- } // lint !e1073
+ }

void ClearSummaryOutputIndexes() { summary_output_indexes_.clear(); }

@@ -81,6 +80,5 @@ class GraphOptimize {
std::map<uint32_t, std::map<string, size_t>> summary_output_indexes_ = {};
std::string func_bin_path_;
};
- }; // namespace ge
- /*lint +e148*/
+ }; // namespace ge
#endif // GE_GRAPH_OPTIMIZE_GRAPH_OPTIMIZE_H_

src/ge/graph/passes/assert_pass.h (+3 -3)

@@ -24,7 +24,7 @@
namespace ge {
class AssertPass : public BaseNodePass {
public:
Status Run(NodePtr& node) override; /*lint !e148*/
Status Run(NodePtr& node) override;

private:
///
@@ -33,7 +33,7 @@ class AssertPass : public BaseNodePass {
/// @param nodes_unused nodes to be deleted
/// @return void
///
- void CollectUnusedNode(const NodePtr& assert_node, std::vector<ge::NodePtr>& nodes_unused); /*lint !e148*/
+ void CollectUnusedNode(const NodePtr& assert_node, std::vector<ge::NodePtr>& nodes_unused);

///
/// remove unused nodes from graph
@@ -41,7 +41,7 @@ class AssertPass : public BaseNodePass {
/// @param nodes_unused nodes to be deleted
/// @return Status
///
- Status RemoveUnusedNode(std::vector<NodePtr>& nodes_unused); /*lint !e148*/
+ Status RemoveUnusedNode(std::vector<NodePtr>& nodes_unused);
};
} // namespace ge
#endif // GE_GRAPH_PASSES_ASSERT_PASS_H_

src/ge/graph/passes/atomic_addr_clean_pass.cc (+2 -2)

@@ -170,7 +170,7 @@ Status AtomicAddrCleanPass::LinkToAtomicNode(const NodePtr &atomic_node, NodePtr
atomic_node->GetName().c_str());
return INTERNAL_ERROR;
}
GELOGI("Graph add cleanAddrNode op out ctrl edge, dst node: %s.", atomic_node->GetName().c_str());
GELOGD("Graph add cleanAddrNode op out ctrl edge, dst node: %s.", atomic_node->GetName().c_str());
std::string stream_label;
if (is_loop_graph && AttrUtils::GetStr(atomic_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) {
if (!AttrUtils::SetStr(atomic_clean_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) {
@@ -228,7 +228,7 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) {
if (ret != GRAPH_SUCCESS) {
GELOGW("set attr ATOMIC_ATTR_IS_ATOMIC_NODE fail.");
}
GELOGI("Recognized atomic op %s from FE engine.", op_desc->GetName().c_str());
GELOGD("Recognized atomic op %s from FE engine.", op_desc->GetName().c_str());
return true;
}
} // namespace ge

src/ge/graph/passes/compile_nodes_pass.cc (+3 -0)

@@ -25,6 +25,8 @@
#include "graph/debug/ge_attr_define.h"
#include "graph/op_desc.h"

using domi::ImplyType;

namespace {
const char *const kAICPUEngineName = "DNN_VM_AICPU";
const char *const kAICPUKernelLibName = "aicpu_kernel";
@@ -142,6 +144,7 @@ graphStatus CompileNodesPass::CompileNodes(const std::shared_ptr<GELib> instance
// this node will go to aicpu engine ,no need compile
node->GetOpDesc()->SetOpEngineName(kAICPUEngineName);
node->GetOpDesc()->SetOpKernelLibName(kAICPUKernelLibName);
AttrUtils::SetInt(node->GetOpDesc(), ATTR_NAME_IMPLY_TYPE, static_cast<int64_t>(ImplyType::AI_CPU));
}
continue;
}


src/ge/graph/passes/net_output_pass.cc (+5 -7)

@@ -169,7 +169,6 @@ Status NetOutputPass::RemoveUnusedNode(const ge::ComputeGraphPtr &graph) {
GELOGE(INTERNAL_ERROR, "Remove node failed, node name:%s.", node->GetName().c_str());
return INTERNAL_ERROR;
}
GELOGI("Net output pass remove node:%s.", node->GetName().c_str());
}
return SUCCESS;
}
@@ -209,7 +208,7 @@ Status NetOutputPass::UpdateNetOutputDesc(const ge::NodePtr &net_output) {
GELOGE(INTERNAL_ERROR, "Update output desc failed, index:%u.", index);
return INTERNAL_ERROR;
}
GELOGI("Update desc, format:%s, data type:%s, index:%u.",
GELOGD("Update desc, format:%s, data type:%s, index:%u.",
TypeUtils::FormatToSerialString(output_in_desc.GetFormat()).c_str(),
TypeUtils::DataTypeToSerialString(output_in_desc.GetDataType()).c_str(), index);
}
@@ -234,7 +233,7 @@ Status NetOutputPass::AddCtrlEdgeForTargets(const ge::NodePtr &net_out_node) {
net_out_node->GetName().c_str(), node->GetName().c_str());
return INTERNAL_ERROR;
}
GELOGI("Add ctrl edge to netoutput node[%s] for target node [%s] success!", net_out_node->GetName().c_str(),
GELOGD("Add ctrl edge to netoutput node[%s] for target node [%s] success!", net_out_node->GetName().c_str(),
node->GetName().c_str());
}
return SUCCESS;
@@ -265,7 +264,7 @@ Status NetOutputPass::AddEdgesForNetOutput(const ge::ComputeGraphPtr &graph, con
item.second, net_input_index);
return INTERNAL_ERROR;
}
GELOGI("AddEdge to output node, src name:%s, src index:%d, dst index:%d.", src_node->GetName().c_str(), item.second,
GELOGD("AddEdge to output node, src name:%s, src index:%d, dst index:%d.", src_node->GetName().c_str(), item.second,
net_input_index);
net_input_index++;
}
@@ -417,7 +416,7 @@ Status NetOutputPass::AddCtrlEdgesBetweenLeafAndNetOutput(const ge::ComputeGraph
node->GetOutDataNodesSize() == 0 && node->GetOutControlNodes().size() == 0) {
GE_CHK_STATUS_RET(GraphUtils::AddEdge(node->GetOutControlAnchor(), net_out_node->GetInControlAnchor()),
"add edge failed");
GELOGI("Add ctrl edge success. src name :%s, dst name :%s", node->GetName().c_str(),
GELOGD("Add ctrl edge success. src name :%s, dst name :%s", node->GetName().c_str(),
net_out_node->GetName().c_str());
}
}
@@ -488,11 +487,10 @@ Status NetOutputPass::Run(ge::ComputeGraphPtr graph) {
auto it = targets_.find(src_node);
if (it != targets_.end()) {
iter = output_nodes_info.erase(iter);
GELOGI("node [%s] is in processed targets, do not add inout for netoutput!", src_node->GetName().c_str());
GELOGD("node [%s] is in processed targets, do not add inout for netoutput!", src_node->GetName().c_str());
continue;
}
AddInOutForNetOutputOp(graph, net_output_desc, src_node, src_index);
GELOGI("Add output node:%s, index:%d.", src_node->GetName().c_str(), src_index);
is_input_const.push_back(PassUtils::IsConstant(src_node));
++iter;
}


src/ge/graph/passes/variable_op_pass.cc (+4 -4)

@@ -160,7 +160,7 @@ Status VariableOpPass::Run(ge::ComputeGraphPtr graph) {

auto start_iter = fusion_road.begin();
auto end_iter = fusion_road.rbegin();
- GELOGI(
+ GELOGD(
"Trans variable data for %s from format %s to %s, shape %s to %s "
"data-type %s to %s, path len %zu success",
node->GetName().c_str(), TypeUtils::FormatToSerialString(start_iter->input.GetFormat()).c_str(),
@@ -197,7 +197,7 @@ Status VariableOpPass::DealFusion(const ge::NodePtr &var_node) {
GELOGD("Begin to fusion var %s with trans", var_node->GetName().c_str());
auto graph = var_node->GetOwnerComputeGraph();
for (auto &trans_node : var_node->GetOutDataNodes()) {
GELOGI("Remove node %s type %s when fusion with variable %s", trans_node->GetName().c_str(),
GELOGD("Remove node %s type %s when fusion with variable %s", trans_node->GetName().c_str(),
trans_node->GetType().c_str(), var_node->GetName().c_str());

if (GraphUtils::IsolateNode(trans_node, {0}) != SUCCESS) {
@@ -218,7 +218,7 @@ Status VariableOpPass::DealFusion(const ge::NodePtr &var_node) {
for (auto ref_node : iterator->second) {
GE_CHECK_NOTNULL(ref_node);
for (auto &trans_node : ref_node->GetInDataNodes()) {
GELOGI("Remove node %s type %s when fusion with variable %s", trans_node->GetName().c_str(),
GELOGD("Remove node %s type %s when fusion with variable %s", trans_node->GetName().c_str(),
trans_node->GetType().c_str(), var_node->GetName().c_str());
if (trans_node->GetOutDataNodes().size() > 1) {
GELOGD(
@@ -578,7 +578,7 @@ Status VariableOpPass::RenewVarDesc(ge::ComputeGraphPtr &graph) {
(node->GetType() == VARIABLE) || (node->GetType() == VARIABLEV2) || (node->GetType() == VARHANDLEOP);
if (is_var_node) {
if (!ge::VarManager::Instance(graph->GetSessionID())->IsVarExist(node->GetName())) {
GELOGI("var manager does not exist var node[%s]", node->GetName().c_str());
GELOGD("var manager does not exist var node[%s]", node->GetName().c_str());
continue;
}
GELOGD("var manager exist var node[%s], graph name[%s]", node->GetName().c_str(), graph->GetName().c_str());


src/ge/graph/passes/variable_prepare_op_pass.cc (+3 -3)

@@ -190,7 +190,7 @@ ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_
GELOGE(FAILED, "parameter ptr is null.");
return nullptr;
}
GELOGI("Create VarRef Op: final_writable_node: [%s] var_node: [%s]>>>>", final_writable_node->GetName().c_str(),
GELOGD("Create VarRef Op: final_writable_node: [%s] var_node: [%s]>>>>", final_writable_node->GetName().c_str(),
var_node->GetName().c_str());

static uint32_t var_ref_count = 0;
@@ -220,7 +220,7 @@ ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_

bool is_set_str = ge::AttrUtils::SetStr(var_ref_op_desc, REF_VAR_SRC_VAR_NAME, var_op_desc->GetName());
if (is_set_str) {
GELOGI("Set node [%s] REF_VAR_SRC_VAR_NAME [%s]", var_ref_node->GetName().c_str(), var_op_desc->GetName().c_str());
GELOGD("Set node [%s] REF_VAR_SRC_VAR_NAME [%s]", var_ref_node->GetName().c_str(), var_op_desc->GetName().c_str());
}
return var_ref_node;
}
@@ -229,7 +229,7 @@ int VariablePrepareOpPass::GetWritableNodeOutIndex(const NodePtr &node, int inpu
if (node == nullptr) {
return -1;
}
GELOGI("get writable node and input index %s:%d", node->GetName().c_str(), input_index);
GELOGD("get writable node and input index %s:%d", node->GetName().c_str(), input_index);
auto node_type = node->GetType();
if (node_type == ASSIGN) {
if (UpdateAssignOpDesc(node) != SUCCESS) {


src/ge/graph/preprocess/graph_preprocess.h (+0 -2)

@@ -37,7 +37,6 @@
#include "omg/omg_inner_types.h"
#include "runtime/context.h"

- /*lint -e148*/
namespace ge {
class GraphPrepare {
public:
@@ -73,5 +72,4 @@ class GraphPrepare {
GraphManagerOptions options_;
};
} // namespace ge
- /*lint +e148*/
#endif // GE_GRAPH_PREPROCESS_GRAPH_PREPROCESS_H_

src/ge/model/ge_model.h (+4 -4)

@@ -74,12 +74,12 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder
private:
void Init();

- ProtoAttrMapHelper attrs_; /*lint !e148*/
+ ProtoAttrMapHelper attrs_;

- Graph graph_; /*lint !e148*/
- std::shared_ptr<domi::ModelTaskDef> task_; /*lint !e148*/
+ Graph graph_;
+ std::shared_ptr<domi::ModelTaskDef> task_;
TBEKernelStore tbe_kernal_store_;
- Buffer weights_buffer_; /*lint !e148*/
+ Buffer weights_buffer_;

std::string name_;
uint32_t version_ = {0};


src/ge/session/inner_session.h (+1 -2)

@@ -35,8 +35,7 @@ class InnerSession {

Status AddGraph(uint32_t graph_id, const Graph &graph);

Status AddGraph(uint32_t graph_id, const Graph &graph,
const std::map<std::string, std::string> &options); /*lint !e148*/
Status AddGraph(uint32_t graph_id, const Graph &graph, const std::map<std::string, std::string> &options);

Status RunGraph(uint32_t graph_id, const std::vector<Tensor> &inputs, std::vector<Tensor> &outputs);



tests/ut/common/graph/CMakeLists.txt (+0 -3)

@@ -35,11 +35,8 @@ include_directories(${GE_SOURCE_DIR}/inc/external)
include_directories(${GE_SOURCE_DIR}/inc/external/graph)
include_directories(${GE_SOURCE_DIR}/inc/graph)
include_directories(${GE_SOURCE_DIR}/inc/common)
- include_directories(${GE_SOURCE_DIR}/inc/ops)
include_directories(${GE_SOURCE_DIR}/third_party/securec/include)
- include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops)
include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc)
- include_directories(/usr/local/HiAI/opp/op_proto/built-in/inc)
include_directories(${CMAKE_BINARY_DIR})
include_directories(${CMAKE_BINARY_DIR}/proto/ge)



tests/ut/ge/CMakeLists.txt (+0 -3)

@@ -43,13 +43,10 @@ include_directories(${GE_SOURCE_DIR}/inc/external/graph)
include_directories(${GE_SOURCE_DIR}/inc/graph)
include_directories(${GE_SOURCE_DIR}/inc/framework)
include_directories(${GE_SOURCE_DIR}/inc/common)
- include_directories(${GE_SOURCE_DIR}/inc/ops)
include_directories(${GE_SOURCE_DIR}/third_party/securec/include)
- include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops)
include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc)
include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/cce)
include_directories(${GE_SOURCE_DIR}/tests/ut/ge)
- include_directories(/usr/local/HiAI/opp/op_proto/built-in/inc)
include_directories(${CMAKE_BINARY_DIR})
include_directories(${CMAKE_BINARY_DIR}/proto/ge)



third_party/fwkacllib/inc/cce/fwk_adpt_struct.h (+4 -3)

@@ -44,9 +44,10 @@ enum FWKAdptAPIRetCode {
// Notice: add new operate type need check with OMM, and make sure append to the end line.
enum FWKOperateType {
FWK_ADPT_SESSION_CREATE = 0,
- FWK_ADPT_KERNEL_RUN = 1,
- FWK_ADPT_SESSION_DESTROY = 2,
- FWK_ADPT_SINGLE_OP_RUN = 3
+ FWK_ADPT_KERNEL_RUN,
+ FWK_ADPT_KERNEL_DESTROY,
+ FWK_ADPT_SESSION_DESTROY,
+ FWK_ADPT_SINGLE_OP_RUN
};

// API Parameter Structure


third_party/fwkacllib/inc/hccl/base.h (+0 -10)

@@ -63,9 +63,6 @@ typedef enum tagHcclResult {
HCCL_E_RESERVED /**< reserved */
} hcclResult_t;

- /* handle to communicator */
- typedef void *hcclComm_t;
-
/**
* @brief HCCL Reduction opperation
*/
@@ -88,13 +85,6 @@ typedef enum tagHcclDataType {
HCCL_DATA_TYPE_RESERVED /**< reserved */
} hcclDataType_t;

- const s32 HCCL_TAG_ANY = -1;
- const u32 BASE_UNIQUE_ID_BYTES = 27;
- #define HCCL_UNIQUE_ID_BYTES (BASE_UNIQUE_ID_BYTES + 5 + 16 + 128)
- typedef struct {
- char internal[HCCL_UNIQUE_ID_BYTES];
- } hcclUniqueId;
-
const u32 HCCL_MAX_SEGMENT_NUM = 8; // The max number of gradient segments.

/**


third_party/fwkacllib/inc/mmpa/mmpa_api.h (+2 -2)

@@ -20,7 +20,7 @@
#define LINUX 0
#define WIN 1

- #if(OS_TYPE == LINUX) //lint !e553
+ #if(OS_TYPE == LINUX)

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
@@ -84,7 +84,7 @@
#endif


- #if(OS_TYPE == WIN) //lint !e553
+ #if(OS_TYPE == WIN)
#include <winsock2.h>
#include <winsock.h>
#include "Windows.h"


third_party/fwkacllib/inc/ops/all_ops.h (+1 -2)

@@ -40,7 +40,6 @@
#include "nn_detect_ops.h"
#include "nn_norm_ops.h"
#include "nn_ops.h"
#include "nn_other_ops.h"
#include "nn_pooling_ops.h"
#include "nn_training_ops.h"
#include "nonlinear_fuc_ops.h"
@@ -62,5 +61,5 @@
#include "outfeed_ops.h"
#include "stateless_random_ops.h"
#include "dvpp_ops.h"
#include "basic_lstm_cell.h"
#include "rnn.h"
#endif // BUILT_IN_OP_PROTO_INC_ALL_OPS_H_

third_party/fwkacllib/inc/ops/atomic_addr_clean_ops.h (+0 -28)

@@ -1,28 +0,0 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef GE_OP_ATOMICADDRCLEAN_H
#define GE_OP_ATOMICADDRCLEAN_H

#include "../../../inc/external/graph/operator_reg.h"

namespace ge{
REG_OP(AtomicAddrClean)
.ATTR(automic_add_mem_size, ListInt, {})
.OP_END()
} // namespace ge

#endif // GE_OP_ATOMICADDRCLEAN_H

third_party/fwkacllib/inc/ops/elewise_calculation_ops.h (+7 -45)

@@ -2543,41 +2543,6 @@ REG_OP(Bias)
.ATTR(bias_from_blob, Bool, true)
.OP_END_FACTORY_REG(Bias)

/**
* @brief Computes the gradient for Local Response Normalization.

* @par Inputs:
* @li grads: A 4D Tensor of type float16 or float32.
* @li x: A 4D Tensor of type float16 or float32.
* @li y: A 4D Tensor of type float16 or float32.

* @par Attributes:
* @li depth_radius: An optional int, specifying the half-width of the
* normalization window. Defaults to "5".
* @li bias: An optional float32. An offset, usually > 0 to avoid dividing by 0.
* Defaults to "1".
* @li alpha: An optional float32. A scaling factor, usually positive.
* Defaults to "1".
* @li beta: An optional float32. An exponent. Defaults to "0.5".

* @par Outputs:
* z: A Tensor. Has the same type and shape as "grads".

* @attention Constraints:
* "x" and "y" must have the same shape and type as "grads".
*/

REG_OP(LRNGrad)
.INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(z, TensorType({DT_FLOAT16,DT_FLOAT}))
.ATTR(depth_radius, Int, 5)
.ATTR(bias, Float, 1.0)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.5)
.OP_END_FACTORY_REG(LRNGrad)

REG_OP(ConfusionMulGrad)
.INPUT(input0, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -2588,16 +2553,13 @@ REG_OP(ConfusionMulGrad)
.ATTR(keep_dims, Bool, false)
.OP_END_FACTORY_REG(ConfusionMulGrad)

- REG_OP(LRN)
- .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
- .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
- .ATTR(depth_radius, Int, 5)
- .ATTR(bias, Float, 1.0)
- .ATTR(alpha, Float, 1.0)
- .ATTR(beta, Float, 0.5)
- .ATTR(norm_region, String, "ACROSS_CHANNELS")
- .OP_END_FACTORY_REG(LRN)

+ REG_OP(FusedMulAddNL2loss)
+ .INPUT(x1, TensorType::NumberType())
+ .INPUT(x2, TensorType::NumberType())
+ .INPUT(x3, TensorType::NumberType())
+ .OUTPUT(y1, TensorType::NumberType())
+ .OUTPUT(y2, TensorType::NumberType())
+ .OP_END_FACTORY_REG(FusedMulAddNL2loss)
} // namespace ge

#endif // GE_OP_ELEWISE_CALCULATION_OPS_H

third_party/fwkacllib/inc/ops/math_ops.h (+77 -0)

@@ -110,6 +110,83 @@ REG_OP(GetNext)
.ATTR(output_num, Int, 1)
.ATTR(channel_name, String, "")
.OP_END_FACTORY_REG(GetNext)
/**
*@brief: Computes the Gauss error function of `x` element-wise.

*@par Inputs:\n
*x: A Tensor of type float16 or float32.

*@par Outputs:
*y: A Tensor. Has the same type as "x".
*/
REG_OP(Erf)
.INPUT(x, TensorType::FloatingDataType())
.OUTPUT(y, TensorType::FloatingDataType())
.OP_END_FACTORY_REG(Erf)

/**
*@brief Computes the Gauss complementary error function of "x" element-wise.

*@par Inputs:\n
*x: A Tensor of type float16 or float32.

*@par Outputs:
*y: A Tensor. Has the same type as "x".
*/
REG_OP(Erfc)
.INPUT(x, TensorType::FloatingDataType())
.OUTPUT(y, TensorType::FloatingDataType())
.OP_END_FACTORY_REG(Erfc)
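
For reference, the element-wise behavior of Erf and Erfc matches the C++ standard library's std::erf and std::erfc. A minimal host-side sketch of what these operators compute (function name is illustrative, not part of the header; the real operators run as NPU kernels):

// Host-side sketch of the element-wise semantics of Erf/Erfc.
#include <cmath>
#include <vector>

std::vector<float> ErfReference(const std::vector<float> &x, bool complementary) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = complementary ? std::erfc(x[i]) : std::erf(x[i]);
  }
  return y;
}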

/**
*@brief Returns a rank-1 histogram counting the number of entries in `values` \n
* that fall into each bin. The bins are of equal width, determined by the \n
* arguments 'value_range' and 'nbins'. \n

*@par Inputs:
*Three inputs, including: \n
*@li x: A Tensor of type float32, float16, or int32.
*@li range: A Tensor of type float32, float16, or int32.
*@li nbins: A Tensor of type int32.

*@par Attributes:
* dtype: An optional attribute. Defaults to "int32".

*@par Outputs:
*y: A Tensor of type int32.
*/
REG_OP(HistogramFixedWidth)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(range, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(nbins, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_INT32}))
.ATTR(dtype, String, "int32")
.OP_END_FACTORY_REG(HistogramFixedWidth)

/**
*@brief Returns a rank-1 histogram counting the number of entries in `values` \n
* that fall into each bin. The bins are of equal width, determined by the \n
* arguments 'value_range' and 'nbins'. \n

*@par Inputs:
*Two inputs, including: \n
*@li x: A Tensor of type float32, float16, or int32.
*@li range: A Tensor of type float32, float16, or int32.

*@par Attributes:
*@li dtype: An optional attribute. Defaults to "int32".
*@li nbins: A required attribute of type int32.

*@par Outputs:
*y: A Tensor of type int32.
*/
REG_OP(HistogramFixedWidthD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(range, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_INT32}))
.REQUIRED_ATTR(nbins, Int)
.ATTR(dtype, String, "int32")
.OP_END_FACTORY_REG(HistogramFixedWidthD)
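
A minimal C++ sketch of the fixed-width binning both variants describe, assuming the usual convention that values below range[0] land in the first bin and values at or above range[1] in the last (function name and the clamping assumption are illustrative, not taken from the header):

// Illustrative host-side sketch of fixed-width histogram binning.
// Assumes range = {lo, hi}; out-of-range values are clamped to the edge bins.
#include <cstdint>
#include <vector>

std::vector<int32_t> HistogramFixedWidthReference(const std::vector<float> &x,
                                                  float lo, float hi, int32_t nbins) {
  std::vector<int32_t> y(nbins, 0);
  const float width = (hi - lo) / nbins;
  for (float v : x) {
    int32_t bin = static_cast<int32_t>((v - lo) / width);
    if (bin < 0) bin = 0;               // below range[0] -> first bin
    if (bin >= nbins) bin = nbins - 1;  // at/above range[1] -> last bin
    ++y[bin];
  }
  return y;
}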
} // namespace ge

#endif // GE_OP_MATH_OPS_H_

+ 0
- 108
third_party/fwkacllib/inc/ops/matrix_calculation_ops.h View File

@@ -287,26 +287,6 @@ REG_OP(ScatterMax)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterMax)

REG_OP(SparseApplyAdagrad)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(lr, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyAdagrad)

REG_OP(SparseApplyAdagradD)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.REQUIRED_ATTR(lr, Float)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyAdagradD)

REG_OP(ScatterUpdate)
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType({DT_INT32}))
@@ -315,94 +295,6 @@ REG_OP(ScatterUpdate)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterUpdate)

/**
* @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme.
* That is for rows we have grad for, we update var, accum and linear

* @par Inputs:
* Ten inputs, including:
* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
* Should be a Variable Tensor.
* @li accum: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li linear: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li grad: A Tensor of the same type as "var", for the gradient.
* @li indices: A vector of indices into the first dimension of var and accum.
* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
* @li l2_shrinkage: A Tensor of the same type as "var", for L2 shrinkage regularization. Must be a scalar.
* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.

* @par Attributes:
* use_locking: An optional bool. Defaults to "False".
* If "True", updating of the "var" and "accum" tensors will be
* protected by a lock; otherwise the behavior is undefined,
* but may exhibit less contention.

* @par Outputs:
* var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyFtrlV2)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(linear, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(lr, TensorType({DT_FLOAT}))
.INPUT(l1, TensorType({DT_FLOAT}))
.INPUT(l2, TensorType({DT_FLOAT}))
.INPUT(l2_shrinkage, TensorType({DT_FLOAT}))
.INPUT(lr_power, TensorType({DT_FLOAT}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyFtrlV2)

/**
* @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme.
* That is for rows we have grad for, we update var, accum and linear

* @par Inputs:
* Ten inputs, including:
* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
* Should be a Variable Tensor.
* @li accum: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li linear: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li grad: A Tensor of the same type as "var", for the gradient.
* @li indices: A vector of indices into the first dimension of var and accum.

* @par Attributes:
* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
* @li l2_shrinkage: A Tensor of the same type as "var", for L2 shrinkage regularization. Must be a scalar.
* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
* @li use_locking: An optional bool. Defaults to "False".
* If "True", updating of the "var" and "accum" tensors will be
* protected by a lock; otherwise the behavior is undefined,
* but may exhibit less contention.

* @par Outputs:
* var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyFtrlV2D)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(linear, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.REQUIRED_ATTR(lr, Float)
.REQUIRED_ATTR(l1, Float)
.REQUIRED_ATTR(l2, Float)
.REQUIRED_ATTR(l2_shrinkage, Float)
.REQUIRED_ATTR(lr_power, Float)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyFtrlV2D)

} // namespace ge

#endif // GE_OP_MATRIX_CALCULATION_OPS_H

+ 26
- 0
third_party/fwkacllib/inc/ops/nn_calculation_ops.h View File

@@ -407,6 +407,32 @@ REG_OP(Conv2DBackpropInputD)
.ATTR(dilations, ListInt, {1, 1, 1, 1})
.OP_END_FACTORY_REG(Conv2DBackpropInputD)

/**
*@brief Computes the Deconvolution with respect to the input.
*@par Inputs:
* Two inputs:
* @li x: A Tensor. Must have the same type as "filter". 4D with shape\n
* [batch, out_height, out_width, out_channels]\n
* or [batch, out_channels, out_height, out_width]. Gradients with respect\n
* to the output of the convolution.
* @li filter: A Tensor of type float16.
* 4D with shape [filter_height, filter_width, in_channels, out_channels],\n
* or [out_channels, filter_height, filter_width, in_channels], \n
* or [out_channels, in_channel, filter_height, filter_width].
* One optional input:
* @li bias: An optional tensor of type int8
*@par Attributes:
* Three attributes:
* @li strides: A tuple or list of 2 integers. The stride of the sliding window\n
* for H/W dimension.
* @li pads: A tuple or list of 4 integers. The [top, bottom, left, right] \n
* padding on the feature map
* @li dilations: A tuple or list of 4 integers. The dilation factor for each\n
* dimension of input. Must be [1, 1, 1, 1].
*@par Outputs:
* y: A Tensor. Has the same type as "filter". 4D tensor with shape\n
* [batch, height, width, channels] or [batch, channels, height, width].
*/
REG_OP(Deconvolution)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))


+ 100
- 0
third_party/fwkacllib/inc/ops/nn_detect_ops.h View File

@@ -22,6 +22,23 @@

namespace ge {

/**
*@brief Generates bounding boxes based on "rois" and "deltas". It is a customized FasterRcnn operator.

*@par Inputs:
* Two inputs, including: \n
*@li rois: Regions of interest (ROIs) generated by the region proposal network (RPN). A 2D Tensor of type float32 with shape (N, 4). "N" indicates the number of ROIs, and the value "4" refers to "x0", "x1", "y0", and "y1".
*@li deltas: Absolute variation between the ROIs generated by the RPN and ground truth boxes. A 2D Tensor of type float32 with shape (N, 4). "N" indicates the number of deltas, and the value "4" refers to "dx", "dy", "dw", and "dh".

*@par Attributes:
*@li means: A list of floats. Defaults to [0, 0, 0, 0]. "deltas" = "deltas" x "stds" + "means".
*@li stds: A list of floats. Defaults to [1.0, 1.0, 1.0, 1.0]. "deltas" = "deltas" x "stds" + "means".
*@li max_shape: Shape [h, w], specifying the size of the image transferred to the network. Used to ensure that the bbox shape after conversion does not exceed "max_shape".
*@li wh_ratio_clip: Defaults to "16/1000". The values of "dw" and "dh" fall within (-wh_ratio_clip, wh_ratio_clip).

*@par Outputs:
*bboxes: Bboxes generated based on "rois" and "deltas". Have the same format and type as "rois".
*/
REG_OP(BoundingBoxDecode)
.INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(deltas, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -32,6 +49,21 @@ REG_OP(BoundingBoxDecode)
.ATTR(wh_ratio_clip, Float, 0.016)
.OP_END_FACTORY_REG(BoundingBoxDecode)
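
The header only states the normalization "deltas" = "deltas" x "stds" + "means"; the decode step itself is assumed here to follow the usual FasterRcnn delta-to-bbox convention, sketched below for one box (names, the (x0, y0, x1, y1) coordinate order, and the +1 width convention are illustrative assumptions):

// Sketch of FasterRcnn-style delta decoding for a single box (assumed semantics).
#include <algorithm>
#include <cmath>

struct Box { float x0, y0, x1, y1; };

Box DecodeOneBox(const Box &roi, const float delta[4], const float means[4],
                 const float stds[4], float max_h, float max_w, float wh_ratio_clip) {
  // Undo the normalization documented above: deltas = deltas * stds + means.
  float dx = delta[0] * stds[0] + means[0];
  float dy = delta[1] * stds[1] + means[1];
  float dw = delta[2] * stds[2] + means[2];
  float dh = delta[3] * stds[3] + means[3];
  const float max_ratio = std::abs(std::log(wh_ratio_clip));
  dw = std::min(std::max(dw, -max_ratio), max_ratio);
  dh = std::min(std::max(dh, -max_ratio), max_ratio);
  // Shift the box center and rescale its width/height.
  float cx = (roi.x0 + roi.x1) * 0.5f, cy = (roi.y0 + roi.y1) * 0.5f;
  float w = roi.x1 - roi.x0 + 1.0f, h = roi.y1 - roi.y0 + 1.0f;
  cx += dx * w; cy += dy * h;
  w *= std::exp(dw); h *= std::exp(dh);
  Box out{cx - (w - 1.0f) * 0.5f, cy - (h - 1.0f) * 0.5f,
          cx + (w - 1.0f) * 0.5f, cy + (h - 1.0f) * 0.5f};
  // Clip to max_shape = [h, w].
  out.x0 = std::min(std::max(out.x0, 0.0f), max_w - 1.0f);
  out.y0 = std::min(std::max(out.y0, 0.0f), max_h - 1.0f);
  out.x1 = std::min(std::max(out.x1, 0.0f), max_w - 1.0f);
  out.y1 = std::min(std::max(out.y1, 0.0f), max_h - 1.0f);
  return out;
}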

/**
*@brief Computes the coordinate variations between bboxes and ground truth boxes. It is a customized FasterRcnn operator.

*@par Inputs:
* Two inputs, including: \n
*@li anchor_box: Anchor boxes. A 2D Tensor of float32 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
*@li ground_truth_box: Ground truth boxes. A 2D Tensor of float32 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".

*@par Attributes:
*@li means: A list of floats. Defaults to [0, 0, 0, 0]. "deltas" = "deltas" x "stds" + "means".
*@li stds: A list of floats. Defaults to [1.0, 1.0, 1.0, 1.0]. "deltas" = "deltas" x "stds" + "means".

*@par Outputs:
*delats: A 2D Tensor of type float32 with shape (N, 4), specifying the variations between all anchor boxes and ground truth boxes.
*/
REG_OP(BoundingBoxEncode)
.INPUT(anchor_box, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(ground_truth_box, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -40,12 +72,43 @@ REG_OP(BoundingBoxEncode)
.ATTR(stds, ListFloat, {1.0, 1.0, 1.0, 1.0})
.OP_END_FACTORY_REG(BoundingBoxEncode)

/**
*@brief Judges whether the bounding box is valid. It is a customized FasterRcnn operator.

*@par Inputs:
* Two inputs, including: \n
*@li bbox_tensor: Bounding box. A 2D Tensor of type float16 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
*@li img_metas: Valid boundary values of the image. A 1D Tensor of type float16 with shape (16,).

*@par Outputs:
*valid_tensor: A Tensor of type int8 with shape (N, 1), specifying whether an input anchor is inside the image. "1" indicates valid, while "0" indicates invalid.

*@attention Constraints:
* 16 "img_metas" values are input. The first three numbers (height, width, ratio) are valid, specifying the valid boundary (height x ratio, width x ratio).
*/
REG_OP(CheckValid)
.INPUT(bbox_tensor, TensorType({DT_FLOAT16}))
.INPUT(img_metas, TensorType({DT_FLOAT16}))
.OUTPUT(valid_tensor, TensorType({DT_INT8}))
.OP_END_FACTORY_REG(CheckValid)

/**
*@brief Computes the intersection over union (iou) or the intersection over foreground (iof) based on the ground-truth and predicted regions.

*@par Inputs:
* Two inputs, including: \n
*@li bboxes: Bounding boxes, a 2D Tensor of type float16 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
*@li gtboxes: Ground-truth boxes, a 2D Tensor of type float16 with shape (M, 4). "M" indicates the number of ground truth boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".

*@par Attributes:
*mode: An optional string specifying the computation mode. Must be either "iou" or "iof". Defaults to "iou".

*@par Outputs:
*overlap: A 2D Tensor of type float16 with shape [M, N], specifying the IoU or IoF ratio.

*@attention Constraints:
* Only computation of float16 data is supported. To avoid overflow, the input length and width are scaled by 0.2 internally.
*/
REG_OP(Iou)
.INPUT(bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(gtboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -53,6 +116,25 @@ REG_OP(Iou)
.ATTR(mode, String, "iou")
.OP_END_FACTORY_REG(Iou)
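
The header does not spell out the overlap formula; a scalar sketch under the common convention (an assumption) follows, where IoU divides the intersection area by the union and IoF divides it by the ground-truth (foreground) area. Coordinates are reordered to (x0, y0, x1, y1) for readability; the function name is illustrative:

// Sketch of a single IoU/IoF overlap value (assumed conventional definition).
#include <algorithm>
#include <string>

float Overlap(const float b[4], const float g[4], const std::string &mode) {
  // b and g are (x0, y0, x1, y1); the operator documents the order x0, x1, y0, y1.
  float ix = std::max(0.0f, std::min(b[2], g[2]) - std::max(b[0], g[0]));
  float iy = std::max(0.0f, std::min(b[3], g[3]) - std::max(b[1], g[1]));
  float inter = ix * iy;
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  float area_g = (g[2] - g[0]) * (g[3] - g[1]);
  float denom = (mode == "iof") ? area_g : area_b + area_g - inter;
  return denom > 0.0f ? inter / denom : 0.0f;
}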

/**
*@brief Performs the backpropagation of ROIAlign for training scenarios.

*@par Inputs:
* Three inputs, including: \n
*@li ydiff: A 5HD gradient input of type float32.
*@li rois: ROI position. A 2D Tensor of float32 with shape (N, 5). "N" indicates the number of ROIs, the value "5" indicates the indexes of images where the ROIs are located, "x0", "x1", "y0", and "y1".
*@li rois_n: An optional input, specifying the number of valid ROIs. This parameter is reserved.

*@par Attributes:
*@li xdiff_shape: A required list of 4 ints, obtained based on the shape of "features" of ROIAlign.
*@li pooled_width: A required attribute of type int, specifying the W dimension.
*@li pooled_height: A required attribute of type int, specifying the H dimension.
*@li spatial_scale: A required attribute of type float, specifying the scaling ratio of "features" to the original image.
*@li sample_num: An optional attribute of type int, specifying the horizontal and vertical sampling frequency of each output. If this attribute is set to "0", the sampling frequency is equal to the rounded up value of "rois", which is a floating point number. Defaults to "2".

*@par Outputs:
*xdiff: Gradient added to input "features". Has the same 5HD shape as input "features".
*/
REG_OP(ROIAlignGrad)
.INPUT(ydiff, TensorType({DT_FLOAT}))
.INPUT(rois, TensorType({DT_FLOAT}))
@@ -65,6 +147,24 @@ REG_OP(ROIAlignGrad)
.ATTR(sample_num, Int, 2)
.OP_END_FACTORY_REG(ROIAlignGrad)

/**
*@brief Obtains the ROI feature matrix from the feature map. It is a customized FasterRcnn operator.

*@par Inputs:
* Three inputs, including: \n
*@li features: A 5HD Tensor of type float32.
*@li rois: ROI position. A 2D Tensor of float32 with shape (N, 5). "N" indicates the number of ROIs, the value "5" indicates the indexes of images where the ROIs are located, "x0", "x1", "y0", and "y1".
*@li rois_n: An optional input, specifying the number of valid ROIs. This parameter is reserved.

*@par Attributes:
*@li spatial_scale: A required attribute of type float, specifying the scaling ratio of "features" to the original image.
*@li pooled_height: A required attribute of type int, specifying the H dimension.
*@li pooled_width: A required attribute of type int, specifying the W dimension.
*@li sample_num: An optional attribute of type int, specifying the horizontal and vertical sampling frequency of each output. If this attribute is set to "0", the sampling frequency is equal to the rounded up value of "rois", which is a floating point number. Defaults to "2".

*@par Outputs:
*output: Outputs the feature sample of each ROI position. The format is 5HD. The axis N is the number of input ROIs. Axes H, W, and C are consistent with the values of "pooled_height", "pooled_width", and "features", respectively.
*/
REG_OP(ROIAlign)
.INPUT(features, TensorType({DT_FLOAT}))
.INPUT(rois, TensorType({DT_FLOAT}))


+ 250
- 0
third_party/fwkacllib/inc/ops/nn_norm_ops.h View File

@@ -236,6 +236,256 @@ REG_OP(ConfusionSoftmaxGrad)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.OP_END_FACTORY_REG(ConfusionSoftmaxGrad)

/**
*@brief LayerNorm operator interface implementation.
* Calculated from x, gamma, and beta as:
* mean = np.mean(x, reduce_axis, keepdims=True)
* variance = np.mean(np.power((x - mean),2), reduce_axis, keepdims=True)
* y = gamma*((x - mean) / np.sqrt(variance + epsilon)) + beta

*@par Inputs:
*Three inputs, including:
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32.
* @li beta: A Tensor. Must be one of the following types: float16, float32.

*@par Attributes:
* @li begin_norm_axis: An optional attribute of type int32. Defaults to "0".
* @li begin_params_axis: An optional attribute of type int32. Defaults to "0".
* @li epsilon: An optional float32, added to the variance to avoid division by zero. Defaults to "0.0000001".

*@par Outputs:
*Three outputs, including:
* @li y: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNorm)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(begin_norm_axis, Int, 0)
.ATTR(begin_params_axis, Int, 0)
.ATTR(epsilon, Float, 0.0000001)
.OP_END_FACTORY_REG(LayerNorm)
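
A host-side sketch of the normalization described by the formulas above, flattened to the common case of normalizing over the last axis of a [rows, cols] buffer (function name and the 2D flattening are illustrative):

// Illustrative layer normalization over the last axis, following the
// mean/variance/epsilon formulas in the comment above.
#include <cmath>
#include <vector>

void LayerNormReference(const std::vector<float> &x, const std::vector<float> &gamma,
                        const std::vector<float> &beta, int rows, int cols,
                        float epsilon, std::vector<float> &y) {
  y.resize(x.size());
  for (int r = 0; r < rows; ++r) {
    const float *row = &x[r * cols];
    float mean = 0.0f, var = 0.0f;
    for (int c = 0; c < cols; ++c) mean += row[c];
    mean /= cols;
    for (int c = 0; c < cols; ++c) var += (row[c] - mean) * (row[c] - mean);
    var /= cols;
    float inv_std = 1.0f / std::sqrt(var + epsilon);
    for (int c = 0; c < cols; ++c)
      y[r * cols + c] = gamma[c] * (row[c] - mean) * inv_std + beta[c];
  }
}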

/**
*@brief LayerNormGrad operator interface implementation.
* Calculated from dy, x, variance, mean, and gamma as:
* pd_xl = data_dy*data_gamma
* pd_var = np.sum(((-0.5)*pd_xl*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-1.5))),
* reduce_axis, keepdims=True)
* pd_mean = np.sum(((-1.0)*pd_xl
* np.power((data_variance + EPSLON), (-0.5))),
* reduce_axis, keepdims=True)
* + pd_var*(1.0/m)
* np.sum(((-2.0)*(data_x - data_mean)), reduce_axis, keepdims=True)
* pd_x = pd_xl*np.power((data_variance + EPSLON), (-0.5)) +
* pd_var*(2.0/m)*(data_x - data_mean) + pd_mean*(1.0/m)
* pd_gamma = np.sum((data_dy*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-0.5))), param_axis, keepdims=True)
* pd_beta = np.sum(data_dy, param_axis, keepdims=True)

*@par Inputs:
*Five inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32.

*@par Outputs:
*Three outputs, including:
* @li pd_x: A Tensor. Must be one of the following types: float16, float32.
* @li pd_gamma: A Tensor. Must be one of the following types: float16, float32.
* @li pd_beta: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNormGrad)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(LayerNormGrad)

/**
*@brief LayerNormXBackprop operator interface implementation.
* Calculated from dy, x, variance, mean, and gamma as:
* pd_xl = data_dy*data_gamma
* pd_var = np.sum(((-0.5)*pd_xl*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-1.5))),
* reduce_axis, keepdims=True)
* pd_mean = np.sum(((-1.0)*pd_xl
* np.power((data_variance + EPSLON), (-0.5))),
* reduce_axis, keepdims=True)
* + pd_var*(1.0/m)
* np.sum(((-2.0)*(data_x - data_mean)), reduce_axis, keepdims=True)
* pd_x = pd_xl*np.power((data_variance + EPSLON), (-0.5)) +
* pd_var*(2.0/m)*(data_x - data_mean) + pd_mean*(1.0/m)
* pd_gamma = np.sum((data_dy*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-0.5))), param_axis, keepdims=True)
* pd_beta = np.sum(data_dy, param_axis, keepdims=True)

*@par Inputs:
*Five inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32.

*@par Outputs:
*One output:
* @li pd_x: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNormXBackprop)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(LayerNormXBackprop)

/**
*@brief LayerNormBetaGammaBackprop operator interface implementation.
* Calculated from dy, x, variance, and mean as:
* pd_xl = data_dy*data_gamma
* pd_var = np.sum(((-0.5)*pd_xl*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-1.5))),
* reduce_axis, keepdims=True)
* pd_mean = np.sum(((-1.0)*pd_xl
* np.power((data_variance + EPSLON), (-0.5))),
* reduce_axis, keepdims=True)
* + pd_var*(1.0/m)
* np.sum(((-2.0)*(data_x - data_mean)), reduce_axis, keepdims=True)
* pd_x = pd_xl*np.power((data_variance + EPSLON), (-0.5)) +
* pd_var*(2.0/m)*(data_x - data_mean) + pd_mean*(1.0/m)
* pd_gamma = np.sum((data_dy*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-0.5))), param_axis, keepdims=True)
* pd_beta = np.sum(data_dy, param_axis, keepdims=True)

*@par Inputs:
*Four inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.

*@par Attributes:
* shape_gamma: A required attribute of type ListInt, specifying the shape of "gamma".

*@par Outputs:
*Two outputs, including:
* @li pd_gamma: A Tensor. Must be one of the following types: float16, float32.
* @li pd_beta: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNormBetaGammaBackprop)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(shape_gamma, ListInt)
.OP_END_FACTORY_REG(LayerNormBetaGammaBackprop)

/**
*@brief Return "output" according to the algorithm of dropout_do_mask: \n
* scale_x = x *(1 / keep_prob)
* output = select(mask == 1, scale_x, 0)

*@par Inputs:
*Three inputs, including: \n
* @li x: A mutable Tensor. Must be one of the following types:
* float16, float32
* @li mask: A mutable Tensor. Must meet all of the following rules:
* the shape of "mask" must be 1D;
* the dtype of "mask" must be uint8;
* the number of elements must satisfy:
* value = (size(x) + 128 - 1) // 128 * 128 // 8
* @li keep_prob: A mutable Tensor. Must meet all of the following rules:
* the shape of "keep_prob" must be (1,) or [1,];
* has the same type as "x".

*@par Outputs:
*y: A mutable Tensor. Has the same type as "x".
*/
REG_OP(DropOutDoMask)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mask, TensorType({DT_UINT8}))
.INPUT(keep_prob, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(DropOutDoMask)
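
The dropout_do_mask algorithm above reduces to a scale-and-select. A minimal sketch, simplified to one mask byte per element (the size formula above implies the real operator packs one mask bit per element; the function name is illustrative):

// Sketch of dropout_do_mask: scale_x = x * (1 / keep_prob); y = mask ? scale_x : 0.
#include <cstdint>
#include <vector>

std::vector<float> DropOutDoMaskReference(const std::vector<float> &x,
                                          const std::vector<uint8_t> &mask,
                                          float keep_prob) {
  std::vector<float> y(x.size());
  const float scale = 1.0f / keep_prob;
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = (mask[i] == 1) ? x[i] * scale : 0.0f;
  }
  return y;
}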

/**
*@brief Local Response Normalization.

*@par Inputs:
*One input, including:
*@li x: A 4D Tensor. Must be one of the following types: float16, float32.

*@par Attributes:
*@li depth_radius: An optional int, specifying the half-width of the
* normalization window. Defaults to "5".
*@li bias: An optional float32. An offset, usually > 0 to avoid dividing by 0.
* Defaults to "1".
*@li alpha: An optional float32. A scaling factor, usually positive.
* Defaults to "1".
*@li beta: An optional float32. An exponent. Defaults to "0.5".
*@li norm_region: An optional string. A mode option. Defaults to "ACROSS_CHANNELS".

*@par Outputs:
*y: A Tensor. Has the same data type and shape as "x".
*/
REG_OP(LRN)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.ATTR(depth_radius, Int, 5)
.ATTR(bias, Float, 1.0)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.5)
.ATTR(norm_region, String, "ACROSS_CHANNELS")
.OP_END_FACTORY_REG(LRN)
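
For orientation: across-channel LRN conventionally computes y = x / (bias + alpha * sum(x^2 over the window))^beta. The header does not state whether "alpha" is pre-divided by the window size, so this sketch follows the TensorFlow-style convention as an assumption (function name illustrative):

// Sketch of across-channel LRN for one pixel of an NCHW tensor.
#include <algorithm>
#include <cmath>
#include <vector>

void LrnOnePixel(const std::vector<float> &x, std::vector<float> &y,
                 int channels, int depth_radius, float bias, float alpha, float beta) {
  y.resize(channels);
  for (int c = 0; c < channels; ++c) {
    float sqr_sum = 0.0f;
    int lo = std::max(0, c - depth_radius);
    int hi = std::min(channels - 1, c + depth_radius);
    for (int k = lo; k <= hi; ++k) sqr_sum += x[k] * x[k];
    y[c] = x[c] / std::pow(bias + alpha * sqr_sum, beta);
  }
}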

/**
* @brief Computes the gradient for Local Response Normalization.

* @par Inputs:
* @li grads: A 4D Tensor of type float16 or float32.
* @li x: A 4D Tensor of type float16 or float32.
* @li y: A 4D Tensor of type float16 or float32.

* @par Attributes:
* @li depth_radius: An optional int, specifying the half-width of the
* normalization window. Defaults to "5".
* @li bias: An optional float32. An offset, usually > 0 to avoid dividing by 0.
* Defaults to "1".
* @li alpha: An optional float32. A scaling factor, usually positive.
* Defaults to "1".
* @li beta: An optional float32. An exponent. Defaults to "0.5".

* @par Outputs:
* z: A Tensor. Has the same type and shape as "grads".

* @attention Constraints:
* "x" and "y" must have the same shape and type as "grads".
*/
REG_OP(LRNGrad)
.INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(z, TensorType({DT_FLOAT16,DT_FLOAT}))
.ATTR(depth_radius, Int, 5)
.ATTR(bias, Float, 1.0)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.5)
.OP_END_FACTORY_REG(LRNGrad)

} // namespace ge

#endif //GE_OP_NN_NORM_OPS_H

+ 0
- 268
third_party/fwkacllib/inc/ops/nn_other_ops.h View File

@@ -1,268 +0,0 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef GE_OP_NN_OTHER_OPS_H
#define GE_OP_NN_OTHER_OPS_H
#include "../graph/operator_reg.h"

namespace ge {
REG_OP(Erf)
.INPUT(x, TensorType::FloatingDataType())
.OUTPUT(y, TensorType::FloatingDataType())
.OP_END_FACTORY_REG(Erf)

REG_OP(Erfc)
.INPUT(x, TensorType::FloatingDataType())
.OUTPUT(y, TensorType::FloatingDataType())
.OP_END_FACTORY_REG(Erfc)

/**
*@brief Returns a rank-1 histogram counting the number of entries in `values` \n
* that fall into each bin. The bins are of equal width, determined by the \n
* arguments 'value_range' and 'nbins'. \n

*@par Inputs:
*Three inputs, including: \n
*@li x: A Tensor of type float32,float16,int32.
*@li range: A Tensor of type float32,float16,int32.
*@li nbins: A Tensor of type int32.

*@par Attributes:
* dtype: An optional attribute. Defaults to "int32".

*@par Outputs:
*y: A Tensor. A Tensor of type int32.
*/
REG_OP(HistogramFixedWidth)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(range, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(nbins, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_INT32}))
.ATTR(dtype, String, "int32")
.OP_END_FACTORY_REG(HistogramFixedWidth)

/**
*@brief Returns a rank-1 histogram counting the number of entries in `values` \n
* that fall into each bin. The bins are of equal width, determined by the \n
* arguments 'value_range' and 'nbins'. \n

*@par Inputs:
*Two inputs, including: \n
*@li x: A Tensor of type float32,float16,int32.
*@li range: A Tensor of type float32,float16,int32.

*@par Attributes:
*@li dtype: An optional attribute. Defaults to "int32".
*@li nbins: A required attribute of type int32.

*@par Outputs:
*y: A Tensor. A Tensor of type int32.
*/
REG_OP(HistogramFixedWidthD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(range, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_INT32}))
.REQUIRED_ATTR(nbins, Int)
.ATTR(dtype, String, "int32")
.OP_END_FACTORY_REG(HistogramFixedWidthD)

/**
*@brief Layernorm operator interface implementation
* calculating: x, gamma, beta
* mean = np.mean(x, reduce_axis, keepdims=True)
* variance = np.mean(np.power((x - mean),2), reduce_axis, keepdims=True)
* y = gamma*((x - mean) / np.sqrt(variance + 0.001)) + beta

*@par Inputs:
*Three inputs, including:
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32.
* @li beta: A Tensor. Must be one of the following types: float16, float32.

*@par Attributes:
* @li begin_norm_axis: A required attribute, the type is int32.
* @li begin_params_axis: A required attribute,the type is int32.

*@par Outputs:
*Three outputs, including:
* @li y: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNorm)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(begin_norm_axis, Int, 0)
.ATTR(begin_params_axis, Int, 0)
.OP_END_FACTORY_REG(LayerNorm)

/**
*@brief LayerNormGrad operator interface implementation
* calculating: dy, x, variance, mean, gamma
* pd_xl = data_dy*data_gamma
* pd_var = np.sum(((-0.5)*pd_xl*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-1.5))),
* reduce_axis, keepdims=True)
* pd_mean = np.sum(((-1.0)*pd_xl
* np.power((data_variance + EPSLON), (-0.5))),
* reduce_axis, keepdims=True)
* + pd_var*(1.0/m)
* np.sum(((-2.0)*(data_x - data_mean)), reduce_axis, keepdims=True)
* pd_x = pd_xl*np.power((data_variance + EPSLON), (-0.5)) +
* pd_var*(2.0/m)*(data_x - data_mean) + pd_mean*(1.0/m)
* pd_gamma = np.sum((data_dy*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-0.5))), param_axis, keepdims=True)
* pd_beta = np.sum(data_dy, param_axis, keepdims=True)

*@par Inputs:
*Five inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32.

*@par Outputs:
*Three outputs, including:
* @li pd_x: A Tensor. Must be one of the following types: float16, float32.
* @li pd_gamma: A Tensor. Must be one of the following types: float16, float32.
* @li pd_beta: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNormGrad)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(LayerNormGrad)

/**
*@brief LayerNormXBackprop operator interface implementation
* calculating: dy, x, variance, mean, gamma
* pd_xl = data_dy*data_gamma
* pd_var = np.sum(((-0.5)*pd_xl*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-1.5))),
* reduce_axis, keepdims=True)
* pd_mean = np.sum(((-1.0)*pd_xl
* np.power((data_variance + EPSLON), (-0.5))),
* reduce_axis, keepdims=True)
* + pd_var*(1.0/m)
* np.sum(((-2.0)*(data_x - data_mean)), reduce_axis, keepdims=True)
* pd_x = pd_xl*np.power((data_variance + EPSLON), (-0.5)) +
* pd_var*(2.0/m)*(data_x - data_mean) + pd_mean*(1.0/m)
* pd_gamma = np.sum((data_dy*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-0.5))), param_axis, keepdims=True)
* pd_beta = np.sum(data_dy, param_axis, keepdims=True)

*@par Inputs:
*Five inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32.

*@par Outputs:
*One output:
* @li pd_x: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNormXBackprop)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(LayerNormXBackprop)

/**
*@brief LayerNormBetaGammaBackprop operator interface implementation
* calculating: dy, x, variance, mean
* pd_xl = data_dy*data_gamma
* pd_var = np.sum(((-0.5)*pd_xl*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-1.5))),
* reduce_axis, keepdims=True)
* pd_mean = np.sum(((-1.0)*pd_xl
* np.power((data_variance + EPSLON), (-0.5))),
* reduce_axis, keepdims=True)
* + pd_var*(1.0/m)
* np.sum(((-2.0)*(data_x - data_mean)), reduce_axis, keepdims=True)
* pd_x = pd_xl*np.power((data_variance + EPSLON), (-0.5)) +
* pd_var*(2.0/m)*(data_x - data_mean) + pd_mean*(1.0/m)
* pd_gamma = np.sum((data_dy*(data_x - data_mean)
* np.power((data_variance + EPSLON), (-0.5))), param_axis, keepdims=True)
* pd_beta = np.sum(data_dy, param_axis, keepdims=True)

*@par Inputs:
*Four inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.

*@par Outputs:
*Two outputs, including:
* @li pd_gamma: A Tensor. Must be one of the following types: float16, float32.
* @li pd_beta: A Tensor. Must be one of the following types: float16, float32.
*/
REG_OP(LayerNormBetaGammaBackprop)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(shape_gamma, ListInt)
.OP_END_FACTORY_REG(LayerNormBetaGammaBackprop)

/**
*@brief Return "output" according to the algorithm of dropout_do_mask: \n
* scale_x = x *(1 / keep_prob)
* output = select(mask == 1, scale_x, 0)

*@par Inputs:
*Three inputs, including: \n
* @li x: A mutable Tensor. Must be one of the following types:
* float16, float32
* @li mask: A mutable Tensor. Must meet all of the following rules:
* the shape of "mask" must be 1D;
* the dtype of "mask" must be uint8;
* the number of elements must satisfy:
* value = (size(x) + 128 - 1) // 128 * 128 // 8
* @li keep_prob: A mutable Tensor. Must meet all of the following rules:
* the shape of "keep_prob" must be (1,) or [1,].
* Has the same type as "x".

*@par Output:
*y: A mutable Tensor. Has the same type as "x".
*/
REG_OP(DropOutDoMask)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mask, TensorType({DT_UINT8}))
.INPUT(keep_prob, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(DropOutDoMask)

} // namespace ge

#endif // GE_OP_NN_OTHER_OPS_H

+ 238
- 0
third_party/fwkacllib/inc/ops/nn_training_ops.h View File

@@ -17,6 +17,7 @@
#ifndef GE_OP_TRAINING_OPS_H
#define GE_OP_TRAINING_OPS_H

#include "../../../inc/external/graph/operator_reg.h"
#include "../graph/operator_reg.h"
namespace ge {
/**
@@ -110,6 +111,63 @@ REG_OP(ApplyMomentum)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ApplyMomentum)

/**
*@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme.

*@par Inputs:
* Five inputs, including:
*@li var: An NCHW, NHWC, or ND Tensor of type float32.
*@li accum: An NCHW, NHWC, or ND Tensor of type float32.
*@li lr: An NCHW, NHWC, or ND Tensor of type float32.
*@li grad: An NCHW, NHWC, or ND Tensor of type float32.
*@li indices: An NCHW, NHWC, or ND Tensor of type int32.

*@par Attributes:
*@li use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock.
*@li update_slots: An optional bool. Defaults to "True". If "False", "accum" is not updated.

*@par Outputs:
*var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyAdagrad)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(lr, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyAdagrad)

/**
*@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme.

*@par Inputs:
* Four inputs, including:
*@li var: An NCHW, NHWC, or ND Tensor of type float32.
*@li accum: An NCHW, NHWC, or ND Tensor of type float32.
*@li grad: An NCHW, NHWC, or ND Tensor of type float32.
*@li indices: An NCHW, NHWC, or ND Tensor of type int32.

*@par Attributes:
*@li lr: A required float, specifying the learning rate.
*@li use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock.
*@li update_slots: An optional bool. Defaults to "True". If "False", "accum" is not updated.

*@par Outputs:
*var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyAdagradD)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.REQUIRED_ATTR(lr, Float)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyAdagradD)
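
Both variants implement the standard sparse Adagrad scheme, applied only to the rows named by "indices": accum += grad^2, then var -= lr * grad / sqrt(accum). A sketch on [rows, cols] buffers (function name illustrative):

// Sketch of the sparse Adagrad update. For each updated row i = indices[k]:
//   accum[i] += grad[k]^2;  var[i] -= lr * grad[k] / sqrt(accum[i])
#include <cmath>
#include <cstdint>
#include <vector>

void SparseApplyAdagradReference(std::vector<float> &var, std::vector<float> &accum,
                                 const std::vector<float> &grad,
                                 const std::vector<int32_t> &indices,
                                 int cols, float lr) {
  for (size_t k = 0; k < indices.size(); ++k) {
    int32_t row = indices[k];
    for (int c = 0; c < cols; ++c) {
      float g = grad[k * cols + c];
      float &a = accum[row * cols + c];
      a += g * g;
      var[row * cols + c] -= lr * g / std::sqrt(a);
    }
  }
}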


REG_OP(ApplyMomentumCCE)
.INPUT(var, TensorType::NumberType())
.INPUT(accum, TensorType::NumberType())
@@ -967,6 +1025,186 @@ REG_OP(LarsV2Update)
.ATTR(use_clip, Bool, false)
.OP_END_FACTORY_REG(LarsV2Update)

/**
* @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme.

* @par Inputs:
* Nine inputs, including:
* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
* Should be a Variable Tensor.
* @li accum: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li linear: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li grad: A Tensor of the same type as "var", for the gradient.
* @li indices: A vector of indices into the first dimension of var and accum.
* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.

* @par Attributes:
* use_locking: An optional bool. Defaults to "False".
* If "True", updating of the "var" and "accum" tensors will be
* protected by a lock; otherwise the behavior is undefined,
* but may exhibit less contention.

* @par Outputs:
* var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyFtrl)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(linear, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(lr, TensorType({DT_FLOAT}))
.INPUT(l1, TensorType({DT_FLOAT}))
.INPUT(l2, TensorType({DT_FLOAT}))
.INPUT(lr_power, TensorType({DT_FLOAT}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyFtrl)
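
The header does not spell out the Ftrl-proximal row update itself; as an assumption, the usual TensorFlow formulation is sketched below for a single element (the V2 variants further down additionally add 2 * l2_shrinkage * var to the gradient used for the linear update; the function name is illustrative):

// Sketch of one element of the assumed Ftrl-proximal update (TensorFlow-style).
#include <cmath>

void FtrlUpdateOneElement(float &var, float &accum, float &linear, float grad,
                          float lr, float l1, float l2, float lr_power) {
  float accum_new = accum + grad * grad;
  float sigma = (std::pow(accum_new, -lr_power) - std::pow(accum, -lr_power)) / lr;
  linear += grad - sigma * var;
  float quadratic = std::pow(accum_new, -lr_power) / lr + 2.0f * l2;
  var = (std::fabs(linear) > l1)
            ? (l1 * (linear > 0.0f ? 1.0f : -1.0f) - linear) / quadratic
            : 0.0f;
  accum = accum_new;
}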

/**
* @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme.

* @par Inputs:
* Five inputs, including:
* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
* Should be a Variable Tensor.
* @li accum: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li linear: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li grad: A Tensor of the same type as "var", for the gradient.
* @li indices: A vector of indices into the first dimension of var and accum.

* @par Attributes:
* @li lr: A required float, used as a scaling factor.
* @li l1: A required float, for L1 regularization.
* @li l2: A required float, for L2 regularization.
* @li lr_power: A required float, used as a scaling factor.
* @li use_locking: An optional bool. Defaults to "False".
* If "True", updating of the "var" and "accum" tensors will be
* protected by a lock; otherwise the behavior is undefined,
* but may exhibit less contention.

* @par Outputs:
* var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyFtrlD)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(linear, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.REQUIRED_ATTR(lr, Float)
.REQUIRED_ATTR(l1, Float)
.REQUIRED_ATTR(l2, Float)
.REQUIRED_ATTR(lr_power, Float)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyFtrlD)

/**
* @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme.
* That is for rows we have grad for, we update var, accum and linear

* @par Inputs:
* Ten inputs, including:
* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
* Should be a Variable Tensor.
* @li accum: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li linear: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li grad: A Tensor of the same type as "var", for the gradient.
* @li indices: A vector of indices into the first dimension of var and accum.
* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
* @li l2_shrinkage: A Tensor of the same type as "var", for L2 shrinkage regularization. Must be a scalar.
* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.

* @par Attributes:
* use_locking: An optional bool. Defaults to "False".
* If "True", updating of the "var" and "accum" tensors will be
* protected by a lock; otherwise the behavior is undefined,
* but may exhibit less contention.

* @par Outputs:
* var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyFtrlV2)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(linear, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(lr, TensorType({DT_FLOAT}))
.INPUT(l1, TensorType({DT_FLOAT}))
.INPUT(l2, TensorType({DT_FLOAT}))
.INPUT(l2_shrinkage, TensorType({DT_FLOAT}))
.INPUT(lr_power, TensorType({DT_FLOAT}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyFtrlV2)

/**
* @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme.
* That is for rows we have grad for, we update var, accum and linear

* @par Inputs:
* Five inputs, including:
* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
* Should be a Variable Tensor.
* @li accum: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li linear: A mutable Tensor of the same type as "var".
* Should be a Variable Tensor.
* @li grad: A Tensor of the same type as "var", for the gradient.
* @li indices: A vector of indices into the first dimension of var and accum.

* @par Attributes:
* @li lr: A required float, used as a scaling factor.
* @li l1: A required float, for L1 regularization.
* @li l2: A required float, for L2 regularization.
* @li l2_shrinkage: A required float, for L2 shrinkage regularization.
* @li lr_power: A required float, used as a scaling factor.
* @li use_locking: An optional bool. Defaults to "False".
* If "True", updating of the "var" and "accum" tensors will be
* protected by a lock; otherwise the behavior is undefined,
* but may exhibit less contention.

* @par Outputs:
* var: A Tensor. Has the same type and format as input "var".
*/
REG_OP(SparseApplyFtrlV2D)
.INPUT(var, TensorType({DT_FLOAT}))
.INPUT(accum, TensorType({DT_FLOAT}))
.INPUT(linear, TensorType({DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.REQUIRED_ATTR(lr, Float)
.REQUIRED_ATTR(l1, Float)
.REQUIRED_ATTR(l2, Float)
.REQUIRED_ATTR(l2_shrinkage, Float)
.REQUIRED_ATTR(lr_power, Float)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(SparseApplyFtrlV2D)

/**
*@brief Cleans the memory of the workspace list.

*@par Attributes:
* @li automic_add_mem_size: An optional ListInt, specifying the sizes of the workspaces. Defaults to {}.

*/
REG_OP(AtomicAddrClean)
.ATTR(automic_add_mem_size, ListInt, {})
.OP_END_FACTORY_REG(AtomicAddrClean)
} // namespace ge

#endif // GE_OP_TRAINING_OPS_H

+ 24
- 0
third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h View File

@@ -33,15 +33,39 @@ REG_OP(NPUGetFloatStatusOperator)
.OUTPUT(data, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(NPUGetFloatStatusOperator)

/**
*@brief Produces a variable with 0 in memory.

*@par Outputs:
*data: A Tensor of type float32, containing eight values initialized to zero.
*/
REG_OP(NPUAllocFloatStatus)
.OUTPUT(data, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(NPUAllocFloatStatus)

/**
*@brief Sets the value at address 0x40000 of each core to 0.

*@par Inputs:
*@li addr: A tensor of type float32.

*@par Outputs:
*data: A Tensor of type float32.
*/
REG_OP(NPUClearFloatStatus)
.INPUT(addr, TensorType{DT_FLOAT})
.OUTPUT(data, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(NPUClearFloatStatus)

/**
*@brief Gets the value at address 0x40000.

*@par Inputs:
*@li addr: A tensor of type float32.

*@par Outputs:
*data: A Tensor of type float32.
*/
REG_OP(NPUGetFloatStatus)
.INPUT(addr, TensorType{DT_FLOAT})
.OUTPUT(data, TensorType({DT_FLOAT}))


+ 31
- 0
third_party/fwkacllib/inc/ops/reduce_ops.h View File

@@ -153,6 +153,20 @@ REG_OP(ReduceAll)
.ATTR(keep_dims, Bool, false)
.OP_END_FACTORY_REG(ReduceAll)

/**
*@brief Reduces a tensor along a given axis by taking the product of its elements.

*@par Inputs:
*Two inputs, including:
*@li x: A mutable Tensor of type NumberType.
*@li axis: A mutable Tensor. The dimensions to reduce.

*@par Attributes:
*@li keep_dims: An optional bool. If "True", retains reduced dimensions with length 1. Defaults to "False".

*@par Outputs:
*y: A Tensor. Has the same type and format as input "x".
*/
REG_OP(ReduceProd)
.INPUT(x,TensorType::NumberType())
.INPUT(axis, TensorType::IndexNumberType())
@@ -160,6 +174,23 @@ REG_OP(ReduceProd)
.ATTR(keep_dims, Bool, false)
.OP_END_FACTORY_REG(ReduceProd)
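
A sketch of the reduction itself, for the simple case of collapsing the trailing axis of a [rows, cols] tensor (function name illustrative):

// Sketch of a product reduction over the last axis of a [rows, cols] buffer.
// With keep_dims=true the result would keep shape [rows, 1] instead of [rows].
#include <vector>

std::vector<float> ReduceProdLastAxis(const std::vector<float> &x, int rows, int cols) {
  std::vector<float> y(rows, 1.0f);
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      y[r] *= x[r * cols + c];
    }
  }
  return y;
}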

/**
*@brief Computes the product of elements across dimensions of a tensor.

*@par Inputs:
* One input: \n
*x: A Tensor. Must be one of the following types: float16, float, int8, uint8, int32.

*@par Attributes:
*@li axis: A required int8, int16, int32, or int64. Specifies the dimensions to reduce. No default value.
*@li keep_dims: An optional bool. If "True", retains reduced dimensions with length 1. Defaults to "False".

*@par Outputs:
*y: A Tensor. Has the same type and format as input "x".

*@attention Constraints:
* "keep_dims" is in the range [-rank(input_tensor), rank(input_tensor)].
*/
REG_OP(ReduceProdD)
.INPUT(x,TensorType({DT_FLOAT, DT_UINT8, DT_INT8, DT_INT32, DT_FLOAT16}))
.OUTPUT(y,TensorType({DT_FLOAT, DT_UINT8, DT_INT8, DT_INT32, DT_FLOAT16}))


third_party/fwkacllib/inc/ops/basic_lstm_cell.h → third_party/fwkacllib/inc/ops/rnn.h View File

@@ -14,8 +14,8 @@
* limitations under the License.
*/

#ifndef GE_OP_BASIC_LSTM_CELL_H
#define GE_OP_BASIC_LSTM_CELL_H
#ifndef GE_OP_RNN_H
#define GE_OP_RNN_H

#include "../graph/operator_reg.h"

@@ -151,4 +151,4 @@ REG_OP(BasicLSTMCellCStateGrad)
.OP_END_FACTORY_REG(BasicLSTMCellCStateGrad)
} // namespace ge

#endif // GE_OP_BASIC_LSTM_CELL_H
#endif // GE_OP_RNN_H

+ 26
- 0
third_party/fwkacllib/inc/ops/rpn_ops.h View File

@@ -19,6 +19,32 @@

#include "../graph/operator_reg.h"
namespace ge {
/**
*@brief Iteratively removes lower-scoring boxes that have an IoU greater than
* "iou_threshold" with a higher-scoring box, according to their
* intersection-over-union (IoU).

*@par Inputs:
* @li box_scores: 2-D tensor with shape of [N, 8], including proposal boxes and
* corresponding confidence scores.

* @par Attributes:
* @li iou_threshold: An optional float. The threshold for deciding whether boxes
* overlap too much with respect to IOU.

* @par Outputs:
* @li selected_boxes: 2-D tensor with shape of [N,5], representing filtered
* boxes including proposal boxes and corresponding confidence scores.
* @li selected_idx: 1-D tensor with shape of [N], representing the index of
* input proposal boxes.
* @li selected_mask: 1-D tensor with shape of [N], the symbol judging whether
* the output proposal boxes is valid.

* @attention Constraints:
* The 2nd dimension of the input "box_scores" must be equal to 8.\n
* A maximum of 2864 input boxes are supported at one time.\n

*/
REG_OP(NMSWithMask)
.INPUT(box_scores, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(selected_boxes, TensorType({DT_FLOAT, DT_FLOAT16}))
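
A minimal form of the greedy NMS described above, assuming standard (x0, y0, x1, y1) boxes and a plain O(N^2) loop (names illustrative; the real operator also emits selected_idx and selected_mask):

// Minimal greedy NMS sketch: drop any box whose IoU with an already-kept,
// higher-scoring box exceeds iou_threshold.
#include <algorithm>
#include <vector>

struct ScoredBox { float x0, y0, x1, y1, score; };

static float IouOf(const ScoredBox &a, const ScoredBox &b) {
  float ix = std::max(0.0f, std::min(a.x1, b.x1) - std::max(a.x0, b.x0));
  float iy = std::max(0.0f, std::min(a.y1, b.y1) - std::max(a.y0, b.y0));
  float inter = ix * iy;
  float uni = (a.x1 - a.x0) * (a.y1 - a.y0) + (b.x1 - b.x0) * (b.y1 - b.y0) - inter;
  return uni > 0.0f ? inter / uni : 0.0f;
}

std::vector<ScoredBox> NmsReference(std::vector<ScoredBox> boxes, float iou_threshold) {
  std::sort(boxes.begin(), boxes.end(),
            [](const ScoredBox &a, const ScoredBox &b) { return a.score > b.score; });
  std::vector<ScoredBox> kept;
  for (const ScoredBox &candidate : boxes) {
    bool suppressed = false;
    for (const ScoredBox &k : kept) {
      if (IouOf(candidate, k) > iou_threshold) { suppressed = true; break; }
    }
    if (!suppressed) kept.push_back(candidate);
  }
  return kept;
}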


+ 0
- 22
third_party/fwkacllib/inc/runtime/event.h View File

@@ -108,28 +108,6 @@ RTS_API rtError_t rtEventGetTimeStamp(uint64_t *time, rtEvent_t event);
*/
RTS_API rtError_t rtNameEvent(rtEvent_t event_, const char *name);

/**
* @ingroup dvrt_event
* @brief make an event shared interprocess and assign it a name
* @param [in] event event to be shared
* @param [in] name identification name
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
* @return RT_ERROR_INVALID_RESOURCE_HANDLE for invalid resource handle
*/
RTS_API rtError_t rtIpcSetEventName(rtEvent_t event, char *name, uint32_t len);

/**
* @ingroup dvrt_event
* @brief open an interprocess shared event
* @param [in|out] event event to be opened
* @param [in] name identification name
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input of ptr, name
* @return RT_ERROR_DRV_ERR for driver error
*/
RTS_API rtError_t rtIpcOpenEvent(rtEvent_t *event, const char *name);

/**
* @ingroup dvrt_event
* @brief Create a notify


+ 0
- 2
third_party/fwkacllib/inc/runtime/kernel.h View File

@@ -24,7 +24,6 @@
extern "C" {
#endif // __cplusplus

/*lint -e148*/
/**
* @ingroup rt_kernel
* @brief shared memory data control
@@ -41,7 +40,6 @@ typedef struct tagRtSmData {
uint8_t reserved[2]; // reserved
} rtSmData_t;

/*lint -e148*/
/**
* @ingroup rt_kernel
* @brief shared memory description


+ 0
- 2
third_party/fwkacllib/inc/runtime/mem.h View File

@@ -17,9 +17,7 @@
#ifndef __CCE_RUNTIME_MEM_H__
#define __CCE_RUNTIME_MEM_H__

/*lint -e7*/
#include <stddef.h>
/*lint +e7*/
#include "base.h"
#include "config.h"
#include "stream.h"


+ 1
- 2
third_party/fwkacllib/inc/runtime/rt_model.h View File

@@ -92,7 +92,6 @@ typedef struct tagAicpuModelInfo {
uint64_t aicpuTaskPtr;
} rtAicpuModelInfo_t;

/* lint -e148 */
typedef struct tagKernelTaskInfo {
uint16_t blockDim;
uint16_t argsCount;
@@ -102,7 +101,7 @@ typedef struct tagKernelTaskInfo {
uint8_t *smDesc;
uint8_t *args;
uint16_t *argsOffset;
} rtKernelTaskInfo_t; /* lint +e148 */
} rtKernelTaskInfo_t;

typedef struct tagKernelTaskInfoEx {
uint32_t flags;


+ 1
- 0
third_party/fwkacllib/inc/tdt/data_common.h View File

@@ -32,6 +32,7 @@ enum TdtDataType {
TDT_DATA_LABEL, /**< Data label*/
TDT_END_OF_SEQUENCE, /**< End of Sequence*/
TDT_TENSOR, /**< Tensor*/
TDT_ABNORMAL, /**< ABNORMAL*/
TDT_DATATYPE_MAX /**< Max*/
};
#endif

