@@ -174,11 +174,9 @@ echo "---------------- GraphEngine output generated ----------------" | |||
# generate output package in tar form, including ut/st libraries/executables | |||
cd ${BASEPATH} | |||
mkdir -p output/plugin/nnengine/ge_config/ | |||
mkdir -p output/plugin/opskernel/ | |||
find output/ -name graphengine_lib.tar -exec rm {} \; | |||
cp src/ge/engine_manager/engine_conf.json output/plugin/nnengine/ge_config/ | |||
find output/ -maxdepth 1 -name libengine.so -exec mv -f {} output/plugin/nnengine/ \; | |||
find output/ -maxdepth 1 -name libge_local_engine.so -exec mv -f {} output/plugin/opskernel/ \; | |||
tar -cf graphengine_lib.tar output/* | |||
mv -f graphengine_lib.tar output | |||
echo "---------------- GraphEngine package archive generated ----------------" |
@@ -52,16 +52,5 @@ struct GETaskInfo { | |||
std::vector<GETaskKernelHcclInfo> kernelHcclInfo; | |||
}; | |||
struct HcomOpertion { | |||
std::string hcclType; | |||
void *inputPtr; | |||
void *outputPtr; | |||
uint64_t count; | |||
int32_t dataType; | |||
int32_t opType; | |||
int32_t root; | |||
}; | |||
} // namespace ge | |||
#endif // INC_COMMON_OPSKERNEL_GE_TASK_INFO_H_ |
@@ -28,7 +28,6 @@ struct CompressConfig { | |||
size_t channel; // channels of L2 or DDR. For load balance | |||
size_t fractalSize; // size of compressing block | |||
bool isTight; // whether compose compressed data tightly | |||
size_t init_offset; | |||
}; | |||
CmpStatus CompressWeights(char* input, const CompressConfig& compressConfig, char* indexs, char* output, | |||
@@ -1,33 +0,0 @@ | |||
/** | |||
* Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#ifndef COMPRESS_WEIGHT_H | |||
#define COMPRESS_WEIGHT_H | |||
#include "compress.h" | |||
const int SHAPE_SIZE_WEIGHT = 4; | |||
struct CompressOpConfig { | |||
int64_t wShape[SHAPE_SIZE_WEIGHT]; | |||
size_t compressTilingK; | |||
size_t compressTilingN; | |||
struct CompressConfig compressConfig; | |||
}; | |||
extern "C" CmpStatus CompressWeightsConv2D(const char *const input, char *const zipBuffer, char *const infoBuffer, | |||
CompressOpConfig *const param); | |||
#endif // COMPRESS_WEIGHT_H |
@@ -27,6 +27,7 @@ using std::string; | |||
using std::vector; | |||
namespace fe { | |||
class PlatformInfoManager { | |||
public: | |||
PlatformInfoManager(const PlatformInfoManager &) = delete; | |||
@@ -38,8 +39,6 @@ class PlatformInfoManager { | |||
uint32_t GetPlatformInfo(const string SoCVersion, PlatformInfo &platformInfo, OptionalInfo &optiCompilationInfo); | |||
uint32_t GetPlatformInfoWithOutSocVersion(PlatformInfo &platformInfo, OptionalInfo &optiCompilationInfo); | |||
void SetOptionalCompilationInfo(OptionalInfo &optiCompilationInfo); | |||
private: | |||
@@ -95,5 +94,6 @@ class PlatformInfoManager { | |||
map<string, PlatformInfo> platformInfoMap_; | |||
OptionalInfo optiCompilationInfo_; | |||
}; | |||
} // namespace fe | |||
#endif |
@@ -44,12 +44,8 @@ const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; | |||
const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; | |||
const char *const OPTION_EXEC_DUMP_STEP = "ge.exec.dumpStep"; | |||
const char *const OPTION_EXEC_DUMP_MODE = "ge.exec.dumpMode"; | |||
const char *const OPTION_EXEC_ENABLE_DUMP_DEBUG = "ge.exec.enableDumpDebug"; | |||
const char *const OPTION_EXEC_DUMP_DEBUG_MODE = "ge.exec.dumpDebugMode"; | |||
const char *const OPTION_EXEC_OP_DEBUG_LEVEL = "ge.exec.opDebugLevel"; | |||
const char *const OPTION_EXEC_ENABLE_INCRE_BUILD = "ge.exec.enableIncreBuild"; | |||
const char *const OPTION_EXEC_INCRE_BUILD_CACHE_PATH = "ge.exec.increBuildCachePath"; | |||
const char *const OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES = "ge.exec.enableScopeFusionPasses"; | |||
// profiling flag | |||
const char *const OPTION_EXEC_PROFILING_MODE = "ge.exec.profilingMode"; | |||
const char *const OPTION_EXEC_PROFILING_OPTIONS = "ge.exec.profilingOptions"; | |||
@@ -223,10 +219,6 @@ const char *const ENABLE_SINGLE_STREAM = "ge.enableSingleStream"; | |||
// Configure input fp16 nodes | |||
const std::string INPUT_FP16_NODES = "ge.INPUT_NODES_SET_FP16"; | |||
// Configure debug level, its value should be 0(default), 1 or 2. | |||
// 0: close debug; 1: open TBE compiler; 2: open ccec compiler | |||
const std::string OP_DEBUG_LEVEL = "ge.opDebugLevel"; | |||
// Graph run mode | |||
enum GraphRunMode { PREDICTION = 0, TRAIN }; | |||
@@ -145,8 +145,7 @@ enum Format { | |||
FORMAT_FRACTAL_ZN_LSTM, | |||
FORMAT_FRACTAL_Z_G, | |||
FORMAT_RESERVED, | |||
FORMAT_ALL, | |||
FORMAT_NULL | |||
FORMAT_ALL | |||
}; | |||
// for unknown shape op type | |||
@@ -98,8 +98,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData { | |||
OpRegistrationData &DelInputWithOriginalType(int input_idx, const std::string &ori_type); | |||
OpRegistrationData &InputReorderVector(const vector<int> &input_order); | |||
domi::ImplyType GetImplyType() const; | |||
std::string GetOmOptype() const; | |||
std::set<std::string> GetOriginOpTypeSet() const; | |||
@@ -51,6 +51,30 @@ inline pid_t GetTid() { | |||
return tid; | |||
} | |||
#define GE_TIMESTAMP_START(stage) uint64_t startUsec_##stage = ge::GetCurrentTimestap() | |||
#define GE_TIMESTAMP_END(stage, stage_name) \ | |||
do { \ | |||
uint64_t endUsec_##stage = ge::GetCurrentTimestap(); \ | |||
GEEVENT("[GEPERFTRACE] The time cost of %s is [%lu] micro second.", (stage_name), \ | |||
(endUsec_##stage - startUsec_##stage)); \ | |||
} while (0); | |||
#define GE_TIMESTAMP_CALLNUM_START(stage) \ | |||
uint64_t startUsec_##stage = ge::GetCurrentTimestap(); \ | |||
uint64_t call_num_of##stage = 0; \ | |||
uint64_t time_of##stage = 0 | |||
#define GE_TIMESTAMP_RESTART(stage) (startUsec_##stage = ge::GetCurrentTimestap()) | |||
#define GE_TIMESTAMP_ADD(stage) \ | |||
time_of##stage += ge::GetCurrentTimestap() - startUsec_##stage; \ | |||
call_num_of##stage++ | |||
#define GE_TIMESTAMP_CALLNUM_END(stage, stage_name) \ | |||
GEEVENT("[GEPERFTRACE] The time cost of %s is [%lu] micro second, call num is %lu", (stage_name), time_of##stage, \ | |||
call_num_of##stage) | |||
#define GE_LOG_ERROR(MOD_NAME, ERROR_CODE, fmt, ...) \ | |||
dlog_error(MOD_NAME, "%lu %s: ErrorNo: %d(%s) " fmt, GetTid(), __FUNCTION__, ERROR_CODE, \ | |||
((GE_GET_ERRORNO_STR(ERROR_CODE)).c_str()), ##__VA_ARGS__) | |||
@@ -19,12 +19,15 @@ | |||
#include <string> | |||
#include "runtime/rt.h" | |||
#include "cce/cce_def.hpp" | |||
#include "common/string_util.h" | |||
#include "common/util.h" | |||
#include "framework/common/debug/ge_log.h" | |||
#include "ge/ge_api_error_codes.h" | |||
using cce::CC_STATUS_SUCCESS; | |||
using cce::ccStatus_t; | |||
#if !defined(__ANDROID__) && !defined(ANDROID) | |||
#define DOMI_LOGE(...) GE_LOG_ERROR(GE_MODULE_NAME, ge::FAILED, __VA_ARGS__) | |||
#else | |||
@@ -99,13 +102,17 @@ | |||
} while (0); | |||
// If expr is not true, print the log and return the specified status | |||
#define GE_CHK_BOOL_RET_STATUS(expr, _status, ...) \ | |||
do { \ | |||
bool b = (expr); \ | |||
if (!b) { \ | |||
GELOGE(_status, __VA_ARGS__); \ | |||
return _status; \ | |||
} \ | |||
#define GE_CHK_BOOL_RET_STATUS(expr, _status, ...) \ | |||
do { \ | |||
bool b = (expr); \ | |||
if (!b) { \ | |||
std::string msg; \ | |||
(void)msg.append(ge::StringUtils::FormatString(__VA_ARGS__)); \ | |||
(void)msg.append( \ | |||
ge::StringUtils::FormatString(" Error Code:0x%X(%s)", _status, GET_ERRORNO_STR(_status).c_str())); \ | |||
DOMI_LOGE("%s", msg.c_str()); \ | |||
return _status; \ | |||
} \ | |||
} while (0); | |||
// If expr is not true, print the log and return the specified status | |||
@@ -125,7 +132,7 @@ | |||
DOMI_LOGE(__VA_ARGS__); \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is not true, print the log and execute a custom statement | |||
#define GE_CHK_BOOL_EXEC_WARN(expr, exec_expr, ...) \ | |||
@@ -135,7 +142,7 @@ | |||
GELOGW(__VA_ARGS__); \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is not true, print the log and execute a custom statement | |||
#define GE_CHK_BOOL_EXEC_INFO(expr, exec_expr, ...) \ | |||
{ \ | |||
@@ -144,7 +151,7 @@ | |||
GELOGI(__VA_ARGS__); \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is not true, print the log and execute a custom statement | |||
#define GE_CHK_BOOL_TRUE_EXEC_INFO(expr, exec_expr, ...) \ | |||
@@ -154,7 +161,7 @@ | |||
GELOGI(__VA_ARGS__); \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is true, print logs and execute custom statements | |||
#define GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(expr, exec_expr, ...) \ | |||
@@ -164,7 +171,7 @@ | |||
DOMI_LOGE(__VA_ARGS__); \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is true, print the Information log and execute a custom statement | |||
#define GE_CHK_TRUE_EXEC_INFO(expr, exec_expr, ...) \ | |||
{ \ | |||
@@ -173,7 +180,7 @@ | |||
GELOGI(__VA_ARGS__); \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is not SUCCESS, print the log and execute the expression + return | |||
#define GE_CHK_BOOL_TRUE_RET_VOID(expr, exec_expr, ...) \ | |||
@@ -184,7 +191,7 @@ | |||
exec_expr; \ | |||
return; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is not SUCCESS, print the log and execute the expression + return _status | |||
#define GE_CHK_BOOL_TRUE_EXEC_RET_STATUS(expr, _status, exec_expr, ...) \ | |||
@@ -195,7 +202,7 @@ | |||
exec_expr; \ | |||
return _status; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is not true, execute a custom statement | |||
#define GE_CHK_BOOL_EXEC_NOLOG(expr, exec_expr) \ | |||
@@ -204,7 +211,7 @@ | |||
if (!b) { \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// -----------------runtime related macro definitions------------------------------- | |||
// If expr is not RT_ERROR_NONE, print the log | |||
@@ -224,7 +231,7 @@ | |||
DOMI_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If expr is not RT_ERROR_NONE, print the log and return | |||
#define GE_CHK_RT_RET(expr) \ | |||
@@ -236,13 +243,23 @@ | |||
} \ | |||
} while (0); | |||
// ------------------------cce related macro definitions---------------------------- | |||
// If expr is not CC_STATUS_SUCCESS, print the log | |||
#define GE_CHK_CCE(expr) \ | |||
do { \ | |||
ccStatus_t _cc_ret = (expr); \ | |||
if (_cc_ret != CC_STATUS_SUCCESS) { \ | |||
DOMI_LOGE("Call cce api failed, ret: 0x%X", _cc_ret); \ | |||
} \ | |||
} while (0); | |||
// If expr is true, execute exec_expr without printing logs | |||
#define GE_IF_BOOL_EXEC(expr, exec_expr) \ | |||
{ \ | |||
if (expr) { \ | |||
exec_expr; \ | |||
} \ | |||
} | |||
}; | |||
// If make_shared is abnormal, print the log and execute the statement | |||
#define GE_MAKE_SHARED(exec_expr0, exec_expr1) \ | |||
@@ -54,9 +54,9 @@ const char *const GE_ENGINE_ATTR_MEM_TYPE_HBM = "HBM"; | |||
struct DataBuffer { | |||
public: | |||
void *data; // Data address | |||
uint64_t length; // Data length | |||
uint32_t length; // Data length | |||
bool isDataSupportMemShare = false; | |||
DataBuffer(void *dataIn, uint64_t len, bool isSupportMemShare) | |||
DataBuffer(void *dataIn, uint32_t len, bool isSupportMemShare) | |||
: data(dataIn), length(len), isDataSupportMemShare(isSupportMemShare) {} | |||
DataBuffer() : data(nullptr), length(0), isDataSupportMemShare(false) {} | |||
@@ -106,7 +106,7 @@ struct ShapeDescription { | |||
// Definition of input and output description information | |||
struct InputOutputDescInfo { | |||
std::string name; | |||
uint64_t size; | |||
uint32_t size; | |||
uint32_t data_type; | |||
ShapeDescription shape_info; | |||
}; | |||
@@ -231,7 +231,6 @@ struct Options { | |||
// Profiling info of task | |||
struct TaskDescInfo { | |||
std::string model_name; | |||
std::string op_name; | |||
uint32_t block_dim; | |||
uint32_t task_id; | |||
@@ -240,7 +239,6 @@ struct TaskDescInfo { | |||
// Profiling info of graph | |||
struct ComputeGraphDescInfo { | |||
std::string model_name; | |||
std::string op_name; | |||
std::string op_type; | |||
std::vector<Format> input_format; | |||
@@ -44,6 +44,8 @@ class ModelHelper { | |||
void SetSaveMode(bool val) { is_offline_ = val; } | |||
bool GetSaveMode(void) const { return is_offline_; } | |||
static Status TransModelToGeModel(const ModelPtr& model, GeModelPtr& ge_model); | |||
static Status TransGeModelToModel(const GeModelPtr& geModelPtr, ModelPtr& modelPtr); | |||
Status GetBaseNameFromFileName(const std::string& file_name, std::string& base_name); | |||
Status GetModelNameFromMergedGraphName(const std::string& graph_name, std::string& model_name); | |||
@@ -48,9 +48,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string DUMP_S | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string DUMP_LAYER; | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string DUMP_FILE_PATH; | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string DUMP_MODE; | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string OP_DEBUG_AICORE; | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string OP_DEBUG_ATOMIC; | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string OP_DEBUG_ALL; | |||
// Supported public properties name | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string PROP_OME_START_TIME; // Start time | |||
@@ -338,7 +335,6 @@ REGISTER_OPTYPE_DECLARE(BASICLSTMCELL, "BasicLSTMCell"); | |||
REGISTER_OPTYPE_DECLARE(GETNEXT, "GetNext"); | |||
REGISTER_OPTYPE_DECLARE(INITDATA, "InitData"); | |||
REGISTER_OPTYPE_DECLARE(TRANSSHAPE, "TransShape") | |||
REGISTER_OPTYPE_DECLARE(REFIDENTITY, "RefIdentity"); | |||
// ANN dedicated operator | |||
REGISTER_OPTYPE_DECLARE(ANN_MEAN, "AnnMean"); | |||
@@ -635,9 +631,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string NODE_N | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string NODE_NAME_END_GRAPH; | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string NODE_NAME_OP_DEBUG; | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string OP_TYPE_OP_DEBUG; | |||
// convolution node type | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string OP_TYPE_CONVOLUTION; | |||
// adds a convolutional node name for the hard AIPP | |||
@@ -21,12 +21,12 @@ | |||
#include <string> | |||
#include <vector> | |||
#include "common/dynamic_aipp.h" | |||
#include "common/ge_inner_error_codes.h" | |||
#include "common/ge_types.h" | |||
#include "common/types.h" | |||
#include "graph/tensor.h" | |||
#include "runtime/base.h" | |||
#include "common/dynamic_aipp.h" | |||
namespace ge { | |||
class ModelListenerAdapter; | |||
@@ -27,7 +27,6 @@ | |||
#include "graph/ge_tensor.h" | |||
#include "graph/graph.h" | |||
#include "graph/op_desc.h" | |||
#include "graph/detail/attributes_holder.h" | |||
namespace ge { | |||
class GeGenerator { | |||
@@ -106,6 +106,7 @@ void GetOutputNodesNameAndIndex(std::vector<std::pair<ge::NodePtr, int32_t>> &ou | |||
void UpdateOmgCtxWithParserCtx(); | |||
void UpdateParserCtxWithOmgCtx(); | |||
} // namespace ge | |||
namespace domi { | |||
@@ -74,9 +74,6 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A | |||
size_t GetAllNodesSize() const; | |||
Vistor<NodePtr> GetAllNodes() const; | |||
// is_unknown_shape: false, same with GetAllNodes func | |||
// is_unknown_shape: true, same with GetDirectNodes func | |||
Vistor<NodePtr> GetNodes(bool is_unknown_shape) const; | |||
size_t GetDirectNodesSize() const; | |||
Vistor<NodePtr> GetDirectNode() const; | |||
Vistor<NodePtr> GetInputNodes() const; | |||
@@ -177,10 +174,6 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A | |||
void SetInputSize(uint32_t size) { input_size_ = size; } | |||
uint32_t GetInputSize() const { return input_size_; } | |||
// false: known shape true: unknow shape | |||
bool GetGraphUnknownFlag() const { return is_unknown_shape_graph_; } | |||
void SetGraphUnknownFlag(bool flag) { is_unknown_shape_graph_ = flag; } | |||
/// | |||
/// Set is need train iteration. | |||
/// If set true, it means this graph need to be run iteration some | |||
@@ -289,8 +282,7 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A | |||
std::map<uint32_t, std::string> op_name_map_; | |||
uint64_t session_id_ = 0; | |||
ge::Format data_format_ = ge::FORMAT_ND; | |||
// unknown graph indicator, default is false, mean known shape | |||
bool is_unknown_shape_graph_ = false; | |||
}; | |||
} // namespace ge | |||
#endif // INC_GRAPH_COMPUTE_GRAPH_H_ |
@@ -778,10 +778,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MOD | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_CORE_TYPE; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_ATC_VERSION; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_OPP_VERSION; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_MODE; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_VALUE; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_OFFSET; | |||
@@ -1000,7 +996,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DATA_DUMP_ORIGIN_FORMAT; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DATA_DUMP_ORIGIN_DATA_TYPE; | |||
// used for lX fusion | |||
// used for l1 fusion and other fusion in future | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_L1_FUSION_GROUP_ID; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_L1_FUSION_GROUP_KEY; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FUSION_GROUP_KEY; | |||
@@ -1014,17 +1010,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCH_FOR_L1_FUSION; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_N_BATCH_SPILT; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NO_TASK_AND_DUMP_NEEDED; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_DATA_DUMP_REF; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_L2_FUSION_GROUP_ID; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCH_FOR_L2_FUSION; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_FLAG; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_ADDR; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_VALID_SIZE; | |||
// op overflow dump | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_OP_DEBUG_FLAG; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_OP_DEBUG_MODE; | |||
// functional ops attr | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IF_THEN_BRANCH; | |||
@@ -1070,13 +1058,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_HOR | |||
// for gradient group | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_HCCL_FUSED_GROUP; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_HCCL_FUSED_FLAG; | |||
// dynamic shape attrs | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_DYNAMIC_SHAPE_FIXED_ADDR; | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_DYNAMIC_SHAPE_FIXED_ADDR_INDEX; | |||
// for fusion op plugin | |||
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FUSIONOP_ORIGINAL_TYPE; | |||
} // namespace ge | |||
#endif // INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_ |
@@ -149,4 +149,5 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY AttrHolder { | |||
AnyMap extAttrs_; | |||
}; | |||
} // namespace ge | |||
#endif // INC_GRAPH_DETAIL_ATTRIBUTES_HOLDER_H_ |
@@ -28,7 +28,6 @@ class GEContext { | |||
uint32_t DeviceId(); | |||
uint64_t TraceId(); | |||
void Init(); | |||
void SetSessionId(uint64_t session_id); | |||
void SetCtxDeviceId(uint32_t device_id); | |||
private: | |||
@@ -25,7 +25,6 @@ | |||
#include "graph/buffer.h" | |||
#include "graph/ge_error_codes.h" | |||
#include "graph/types.h" | |||
namespace ge { | |||
class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeShape { | |||
public: | |||
@@ -109,11 +108,8 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeTensorDesc : public AttrH | |||
DataType GetDataType() const; | |||
void SetDataType(DataType dt); | |||
DataType GetOriginDataType() const; | |||
void SetOriginDataType(DataType originDataType); | |||
std::vector<uint32_t> GetRefPortIndex() const; | |||
void SetRefPortByIndex(const std::vector<uint32_t> &index); | |||
DataType GetOriginDataType() const; | |||
GeTensorDesc Clone() const; | |||
GeTensorDesc &operator=(const GeTensorDesc &desc); | |||
@@ -190,4 +186,5 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeTensor { | |||
GeTensorDesc &DescReference() const; | |||
}; | |||
} // namespace ge | |||
#endif // INC_GRAPH_GE_TENSOR_H_ |
@@ -49,4 +49,5 @@ class ModelSerialize { | |||
friend class GraphDebugImp; | |||
}; | |||
} // namespace ge | |||
#endif // INC_GRAPH_MODEL_SERIALIZE_H_ |
@@ -105,8 +105,6 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder { | |||
GeTensorDescPtr MutableInputDesc(uint32_t index) const; | |||
GeTensorDescPtr MutableInputDesc(const string &name) const; | |||
Vistor<GeTensorDesc> GetAllInputsDesc() const; | |||
Vistor<GeTensorDescPtr> GetAllInputsDescPtr() const; | |||
@@ -129,8 +127,6 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder { | |||
GeTensorDescPtr MutableOutputDesc(uint32_t index) const; | |||
GeTensorDescPtr MutableOutputDesc(const string &name) const; | |||
uint32_t GetAllOutputsDescSize() const; | |||
Vistor<GeTensorDesc> GetAllOutputsDesc() const; | |||
@@ -60,7 +60,6 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
"common/formats/formats.cc" | |||
"common/formats/utils/formats_trans_utils.cc" | |||
"common/fp16_t.cc" | |||
"common/ge/op_tiling_manager.cc" | |||
"common/ge/plugin_manager.cc" | |||
"common/helper/model_cache_helper.cc" | |||
"common/profiling/profiling_manager.cc" | |||
@@ -95,6 +94,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
"graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc" | |||
"graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc" | |||
"graph/load/new_model_manager/task_info/task_info.cc" | |||
"graph/load/output/output.cc" | |||
"graph/manager/*.cc" | |||
"graph/manager/model_manager/event_manager.cc" | |||
"graph/manager/util/debug.cc" | |||
@@ -159,11 +159,8 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
"hybrid/node_executor/aicpu/aicpu_ext_info.cc" | |||
"hybrid/node_executor/aicpu/aicpu_node_executor.cc" | |||
"hybrid/node_executor/compiledsubgraph/known_node_executor.cc" | |||
"hybrid/node_executor/controlop/control_op_executor.cc" | |||
"hybrid/node_executor/hccl/hccl_node_executor.cc" | |||
"hybrid/node_executor/hostcpu/ge_local_node_executor.cc" | |||
"hybrid/node_executor/node_executor.cc" | |||
"hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc" | |||
"hybrid/node_executor/task_context.cc" | |||
"init/gelib.cc" | |||
"model/ge_model.cc" | |||
@@ -207,7 +204,6 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
"common/formats/formats.cc" | |||
"common/formats/utils/formats_trans_utils.cc" | |||
"common/fp16_t.cc" | |||
"common/ge/op_tiling_manager.cc" | |||
"common/ge/plugin_manager.cc" | |||
"common/helper/model_cache_helper.cc" | |||
"common/profiling/profiling_manager.cc" | |||
@@ -240,6 +236,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
"graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc" | |||
"graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc" | |||
"graph/load/new_model_manager/task_info/task_info.cc" | |||
"graph/load/output/output.cc" | |||
"graph/manager/*.cc" | |||
"graph/manager/model_manager/event_manager.cc" | |||
"graph/manager/util/debug.cc" | |||
@@ -28,7 +28,6 @@ | |||
#include "graph/opsproto_manager.h" | |||
#include "graph/utils/type_utils.h" | |||
#include "graph/manager/util/rt_context_util.h" | |||
#include "graph/common/ge_call_wrapper.h" | |||
#include "register/op_registry.h" | |||
#include "common/ge/tbe_plugin_manager.h" | |||
@@ -42,8 +41,8 @@ namespace { | |||
const int32_t kMaxStrLen = 128; | |||
} | |||
static bool g_ge_initialized = false; | |||
static std::mutex g_ge_release_mutex; // GEFinalize and ~Session use | |||
static bool kGeInitialized = false; | |||
static std::mutex kGeReleaseMutex; // GEFinalize and ~Session use | |||
namespace ge { | |||
void GetOpsProtoPath(std::string &opsproto_path) { | |||
@@ -62,6 +61,31 @@ void GetOpsProtoPath(std::string &opsproto_path) { | |||
opsproto_path = (path_base + "ops/op_proto/custom/" + ":") + (path_base + "ops/op_proto/built-in/"); | |||
} | |||
Status CheckDumpAndReuseMemory(const std::map<string, string> &options) { | |||
const int kDecimal = 10; | |||
auto dump_op_env = std::getenv("DUMP_OP"); | |||
int dump_op_flag = (dump_op_env != nullptr) ? std::strtol(dump_op_env, nullptr, kDecimal) : 0; | |||
auto disableReuseMemoryIter = options.find("ge.exec.disableReuseMemory"); | |||
if (disableReuseMemoryIter != options.end()) { | |||
if (disableReuseMemoryIter->second == "0") { | |||
GELOGD("ge.exec.disableReuseMemory=0, reuse memory is open"); | |||
if (dump_op_flag) { | |||
GELOGW("Will dump incorrect op data with GE Option ge.exec.disableReuseMemory=0"); | |||
} | |||
} else if (disableReuseMemoryIter->second == "1") { | |||
GELOGD("ge.exec.disableReuseMemory=1, reuse memory is close"); | |||
} else { | |||
GELOGE(PARAM_INVALID, "CheckDumpAndReuseMemory ge.exec.disableReuseMemory is valid"); | |||
return FAILED; | |||
} | |||
} else { | |||
if (dump_op_flag) { | |||
GELOGW("Will dump incorrect op data with default reuse memory"); | |||
} | |||
} | |||
return SUCCESS; | |||
} | |||
Status CheckOptionsValid(const std::map<string, string> &options) { | |||
// check job_id is valid | |||
auto job_id_iter = options.find(OPTION_EXEC_JOB_ID); | |||
@@ -72,6 +96,11 @@ Status CheckOptionsValid(const std::map<string, string> &options) { | |||
} | |||
} | |||
// Check ge.exec.disableReuseMemory and env DUMP_OP | |||
if (CheckDumpAndReuseMemory(options) != SUCCESS) { | |||
return FAILED; | |||
} | |||
return SUCCESS; | |||
} | |||
@@ -79,7 +108,7 @@ Status CheckOptionsValid(const std::map<string, string> &options) { | |||
Status GEInitialize(const std::map<string, string> &options) { | |||
GELOGT(TRACE_INIT, "GEInitialize start"); | |||
// 0.check init status | |||
if (g_ge_initialized) { | |||
if (kGeInitialized) { | |||
GELOGW("GEInitialize is called more than once"); | |||
return SUCCESS; | |||
} | |||
@@ -118,9 +147,9 @@ Status GEInitialize(const std::map<string, string> &options) { | |||
} | |||
// 7.check return status, return | |||
if (!g_ge_initialized) { | |||
if (!kGeInitialized) { | |||
// Initialize success, first time calling initialize | |||
g_ge_initialized = true; | |||
kGeInitialized = true; | |||
} | |||
GELOGT(TRACE_STOP, "GEInitialize finished"); | |||
@@ -131,12 +160,12 @@ Status GEInitialize(const std::map<string, string> &options) { | |||
Status GEFinalize() { | |||
GELOGT(TRACE_INIT, "GEFinalize start"); | |||
// check init status | |||
if (!g_ge_initialized) { | |||
if (!kGeInitialized) { | |||
GELOGW("GEFinalize is called before GEInitialize"); | |||
return SUCCESS; | |||
} | |||
std::lock_guard<std::mutex> lock(g_ge_release_mutex); | |||
std::lock_guard<std::mutex> lock(kGeReleaseMutex); | |||
// call Finalize | |||
Status ret = SUCCESS; | |||
Status middle_ret; | |||
@@ -158,10 +187,10 @@ Status GEFinalize() { | |||
ret = middle_ret; | |||
} | |||
if (g_ge_initialized && ret == SUCCESS) { | |||
if (kGeInitialized && ret == SUCCESS) { | |||
// Unified destruct rt_context | |||
RtContextUtil::GetInstance().DestroyAllRtContexts(); | |||
g_ge_initialized = false; | |||
RtContextUtil::GetInstance().DestroyrtContexts(); | |||
kGeInitialized = false; | |||
} | |||
GELOGT(TRACE_STOP, "GEFinalize finished"); | |||
@@ -173,7 +202,7 @@ Session::Session(const std::map<string, string> &options) { | |||
GELOGT(TRACE_INIT, "Session Constructor start"); | |||
// check init status | |||
sessionId_ = 0; | |||
if (!g_ge_initialized) { | |||
if (!kGeInitialized) { | |||
GELOGE(GE_CLI_GE_NOT_INITIALIZED); | |||
return; | |||
} | |||
@@ -203,13 +232,13 @@ Session::Session(const std::map<string, string> &options) { | |||
Session::~Session() { | |||
GELOGT(TRACE_INIT, "Session Destructor start"); | |||
// 0.check init status | |||
if (!g_ge_initialized) { | |||
if (!kGeInitialized) { | |||
GELOGW("GE is not yet initialized or is finalized."); | |||
return; | |||
} | |||
Status ret = FAILED; | |||
std::lock_guard<std::mutex> lock(g_ge_release_mutex); | |||
std::lock_guard<std::mutex> lock(kGeReleaseMutex); | |||
try { | |||
uint64_t session_id = sessionId_; | |||
// call DestroySession | |||
@@ -24,7 +24,6 @@ | |||
#include "common/debug/log.h" | |||
#include "common/ge/ge_util.h" | |||
#include "common/util/error_manager/error_manager.h" | |||
#include "framework/common/debug/ge_log.h" | |||
#include "graph/ge_context.h" | |||
#include "init/gelib.h" | |||
@@ -162,10 +161,6 @@ bool DNNEngineManager::IsEngineRegistered(const std::string &name) { | |||
return false; | |||
} | |||
void DNNEngineManager::InitPerformanceStaistic() { checksupport_cost_.clear(); } | |||
const map<string, uint64_t> &DNNEngineManager::GetCheckSupportCost() const { return checksupport_cost_; } | |||
std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { | |||
GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(GE_CLI_GE_NOT_INITIALIZED, "DNNEngineManager: op_desc is nullptr"); | |||
return ""); | |||
@@ -199,20 +194,15 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { | |||
if (kernel_info_store != kernel_map.end()) { | |||
std::string unsupported_reason; | |||
// It will be replaced by engine' checksupport | |||
uint64_t start_time = GetCurrentTimestap(); | |||
if (kernel_info_store->second->CheckSupported(op_desc, unsupported_reason)) { | |||
checksupport_cost_[kernel_name] += GetCurrentTimestap() - start_time; | |||
op_desc->SetOpEngineName(it.engine); | |||
op_desc->SetOpKernelLibName(kernel_name); | |||
GELOGD("DNNEngineManager:Set OpKernelLibName %s and engine name %s into op_desc %s", kernel_name.c_str(), | |||
it.engine.c_str(), op_desc->GetName().c_str()); | |||
return it.engine; | |||
} else { | |||
checksupport_cost_[kernel_name] += GetCurrentTimestap() - start_time; | |||
bool is_custom_op = false; | |||
if ((ge::AttrUtils::GetBool(op_desc, kCustomOpFlag, is_custom_op)) && is_custom_op) { | |||
ErrorManager::GetInstance().ATCReportErrMessage("E13001", {"kernelname", "optype", "opname"}, | |||
{kernel_name, op_desc->GetType(), op_desc->GetName()}); | |||
GELOGE(FAILED, | |||
"The custom operator registered by the user does not support the logic function delivered by this " | |||
"network. Check support failed, kernel_name is %s, op type is %s, op name is %s", | |||
@@ -231,13 +221,9 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { | |||
} | |||
} | |||
for (const auto &it : unsupported_reasons) { | |||
ErrorManager::GetInstance().ATCReportErrMessage("E13002", {"optype", "opskernel", "reason"}, | |||
{op_desc->GetType(), it.first, it.second}); | |||
GELOGE(GE_GRAPH_ASSIGN_ENGINE_FAILED, "GetDNNEngineName:Op type %s of ops kernel %s is unsupported, reason:%s", | |||
op_desc->GetType().c_str(), it.first.c_str(), it.second.c_str()); | |||
} | |||
ErrorManager::GetInstance().ATCReportErrMessage("E13003", {"opname", "optype"}, | |||
{op_desc->GetName(), op_desc->GetType()}); | |||
GELOGE(GE_GRAPH_ASSIGN_ENGINE_FAILED, "Can't find any supported ops kernel and engine of %s, type is %s", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return ""; | |||
@@ -398,13 +384,7 @@ Status DNNEngineManager::ReadJsonFile(const std::string &file_path, JsonHandle h | |||
return FAILED; | |||
} | |||
try { | |||
ifs >> *json_file; | |||
} catch (const json::exception &e) { | |||
GELOGE(FAILED, "Read json file failed"); | |||
ifs.close(); | |||
return FAILED; | |||
} | |||
ifs >> *json_file; | |||
ifs.close(); | |||
GELOGI("Read json file success"); | |||
return SUCCESS; | |||
@@ -63,8 +63,6 @@ class DNNEngineManager { | |||
// If can't find appropriate engine name, return "", report error | |||
string GetDNNEngineName(const OpDescPtr &op_desc); | |||
const map<string, SchedulerConf> &GetSchedulers() const; | |||
const map<string, uint64_t> &GetCheckSupportCost() const; | |||
void InitPerformanceStaistic(); | |||
private: | |||
DNNEngineManager(); | |||
@@ -80,7 +78,6 @@ class DNNEngineManager { | |||
std::map<std::string, DNNEnginePtr> engines_map_; | |||
std::map<std::string, ge::DNNEngineAttribute> engines_attrs_map_; | |||
std::map<string, SchedulerConf> schedulers_; | |||
std::map<string, uint64_t> checksupport_cost_; | |||
bool init_flag_; | |||
}; | |||
} // namespace ge | |||
@@ -26,7 +26,6 @@ file(GLOB PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
"ge_executor.cc" | |||
"../common/ge/op_tiling_manager.cc" | |||
"../common/ge/plugin_manager.cc" | |||
"../common/profiling/profiling_manager.cc" | |||
"../graph/execute/graph_execute.cc" | |||
@@ -60,6 +59,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||
"../graph/load/new_model_manager/task_info/task_info.cc" | |||
"../graph/load/new_model_manager/tbe_handle_store.cc" | |||
"../graph/load/new_model_manager/zero_copy_task.cc" | |||
"../graph/load/output/output.cc" | |||
"../graph/manager/graph_caching_allocator.cc" | |||
"../graph/manager/graph_manager_utils.cc" | |||
"../graph/manager/graph_mem_allocator.cc" | |||
@@ -854,4 +854,5 @@ Status GeExecutor::GetAllAippInputOutputDims(uint32_t model_id, uint32_t index, | |||
GELOGI("GetAllAippInputOutputDims succ."); | |||
return SUCCESS; | |||
} | |||
} // namespace ge |
@@ -4,7 +4,6 @@ local_ge_executor_src_files := \ | |||
ge_executor.cc \ | |||
../common/profiling/profiling_manager.cc \ | |||
../common/ge/plugin_manager.cc \ | |||
../common/ge/op_tiling_manager.cc \ | |||
../graph/load/graph_loader.cc \ | |||
../graph/execute/graph_execute.cc \ | |||
../omm/csa_interact.cc \ | |||
@@ -45,6 +44,7 @@ local_ge_executor_src_files := \ | |||
../graph/load/new_model_manager/task_info/end_graph_task_info.cc \ | |||
../graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc \ | |||
../graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc \ | |||
../graph/load/output/output.cc \ | |||
../single_op/single_op_manager.cc \ | |||
../single_op/single_op_model.cc \ | |||
../single_op/single_op.cc \ | |||
@@ -53,7 +53,6 @@ local_ge_executor_src_files := \ | |||
../single_op/task/build_task_utils.cc \ | |||
../single_op/task/tbe_task_builder.cc \ | |||
../single_op/task/aicpu_task_builder.cc \ | |||
../single_op/task/aicpu_kernel_task_builder.cc \ | |||
../hybrid/hybrid_davinci_model_stub.cc\ | |||
local_ge_executor_c_include := \ | |||
@@ -32,7 +32,6 @@ COMMON_LOCAL_SRC_FILES := \ | |||
GRAPH_MANAGER_LOCAL_SRC_FILES := \ | |||
common/ge/plugin_manager.cc\ | |||
common/ge/op_tiling_manager.cc\ | |||
init/gelib.cc \ | |||
session/inner_session.cc \ | |||
session/session_manager.cc \ | |||
@@ -92,7 +91,6 @@ OMG_HOST_SRC_FILES := \ | |||
graph/passes/no_use_reshape_remove_pass.cc \ | |||
graph/passes/iterator_op_pass.cc \ | |||
graph/passes/atomic_addr_clean_pass.cc \ | |||
graph/passes/mark_same_addr_pass.cc \ | |||
graph/common/omg_util.cc \ | |||
graph/common/bcast.cc \ | |||
graph/passes/dimension_compute_pass.cc \ | |||
@@ -147,7 +145,6 @@ OMG_HOST_SRC_FILES := \ | |||
graph/passes/stop_gradient_pass.cc \ | |||
graph/passes/prevent_gradient_pass.cc \ | |||
graph/passes/identity_pass.cc \ | |||
graph/passes/ref_identity_delete_op_pass.cc \ | |||
graph/passes/placeholder_with_default_pass.cc \ | |||
graph/passes/snapshot_pass.cc \ | |||
graph/passes/guarantee_const_pass.cc \ | |||
@@ -156,9 +153,7 @@ OMG_HOST_SRC_FILES := \ | |||
graph/passes/folding_pass.cc \ | |||
graph/passes/cast_translate_pass.cc \ | |||
graph/passes/prune_pass.cc \ | |||
graph/passes/merge_to_stream_merge_pass.cc \ | |||
graph/passes/switch_to_stream_switch_pass.cc \ | |||
graph/passes/attach_stream_label_pass.cc \ | |||
graph/passes/switch_op_pass.cc \ | |||
graph/passes/multi_batch_pass.cc \ | |||
graph/passes/next_iteration_pass.cc \ | |||
graph/passes/control_trigger_pass.cc \ | |||
@@ -178,6 +173,7 @@ OMG_HOST_SRC_FILES := \ | |||
graph/passes/variable_op_pass.cc \ | |||
graph/passes/cast_remove_pass.cc \ | |||
graph/passes/transpose_transdata_pass.cc \ | |||
graph/passes/identify_reference_pass.cc \ | |||
graph/passes/hccl_memcpy_pass.cc \ | |||
graph/passes/flow_ctrl_pass.cc \ | |||
graph/passes/link_gen_mask_nodes_pass.cc \ | |||
@@ -203,6 +199,7 @@ OME_HOST_SRC_FILES := \ | |||
graph/load/new_model_manager/tbe_handle_store.cc \ | |||
graph/load/new_model_manager/cpu_queue_schedule.cc \ | |||
graph/load/new_model_manager/zero_copy_task.cc \ | |||
graph/load/output/output.cc \ | |||
graph/load/new_model_manager/data_dumper.cc \ | |||
graph/load/new_model_manager/task_info/task_info.cc \ | |||
graph/load/new_model_manager/task_info/event_record_task_info.cc \ | |||
@@ -227,7 +224,6 @@ OME_HOST_SRC_FILES := \ | |||
single_op/task/build_task_utils.cc \ | |||
single_op/task/tbe_task_builder.cc \ | |||
single_op/task/aicpu_task_builder.cc \ | |||
single_op/task/aicpu_kernel_task_builder.cc \ | |||
single_op/single_op.cc \ | |||
single_op/single_op_model.cc \ | |||
single_op/stream_resource.cc \ | |||
@@ -372,7 +368,7 @@ endif | |||
LOCAL_C_INCLUDES := $(COMMON_LOCAL_C_INCLUDES) | |||
LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_ir_build.cc | |||
LOCAL_SRC_FILES := ../../out/atc/lib64/stub/ge_ir_build.cc | |||
LOCAL_SHARED_LIBRARIES := | |||
@@ -23,7 +23,6 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
common/formats/utils/formats_trans_utils.cc \ | |||
common/fp16_t.cc \ | |||
common/ge/plugin_manager.cc\ | |||
common/ge/op_tiling_manager.cc\ | |||
common/helper/model_cache_helper.cc \ | |||
common/profiling/profiling_manager.cc \ | |||
engine_manager/dnnengine_manager.cc \ | |||
@@ -78,6 +77,7 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
graph/load/new_model_manager/task_info/task_info.cc \ | |||
graph/load/new_model_manager/tbe_handle_store.cc \ | |||
graph/load/new_model_manager/zero_copy_task.cc \ | |||
graph/load/output/output.cc \ | |||
graph/manager/graph_context.cc \ | |||
graph/manager/graph_manager.cc \ | |||
graph/manager/graph_manager_utils.cc \ | |||
@@ -99,7 +99,6 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
graph/passes/aicpu_constant_folding_pass.cc \ | |||
graph/passes/assert_pass.cc \ | |||
graph/passes/atomic_addr_clean_pass.cc \ | |||
graph/passes/mark_same_addr_pass.cc \ | |||
graph/partition/dynamic_shape_partition.cc \ | |||
graph/passes/base_pass.cc \ | |||
graph/passes/cast_remove_pass.cc \ | |||
@@ -159,8 +158,8 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
graph/passes/get_original_format_pass.cc \ | |||
graph/passes/guarantee_const_pass.cc \ | |||
graph/passes/hccl_memcpy_pass.cc \ | |||
graph/passes/identify_reference_pass.cc \ | |||
graph/passes/identity_pass.cc \ | |||
graph/passes/ref_identity_delete_op_pass.cc \ | |||
graph/passes/infershape_pass.cc \ | |||
graph/passes/isolated_op_remove_pass.cc \ | |||
graph/passes/iterator_op_pass.cc \ | |||
@@ -192,9 +191,7 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
graph/passes/data_pass.cc \ | |||
graph/passes/switch_data_edges_bypass.cc \ | |||
graph/passes/switch_logic_remove_pass.cc \ | |||
graph/passes/merge_to_stream_merge_pass.cc \ | |||
graph/passes/switch_to_stream_switch_pass.cc \ | |||
graph/passes/attach_stream_label_pass.cc \ | |||
graph/passes/switch_op_pass.cc \ | |||
graph/passes/switch_dead_branch_elimination.cc \ | |||
graph/passes/replace_transshape_pass.cc \ | |||
graph/passes/transop_breadth_fusion_pass.cc \ | |||
@@ -233,7 +230,6 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
single_op/task/op_task.cc \ | |||
single_op/task/tbe_task_builder.cc \ | |||
single_op/task/aicpu_task_builder.cc \ | |||
single_op/task/aicpu_kernel_task_builder.cc \ | |||
hybrid/common/tensor_value.cc \ | |||
hybrid/common/npu_memory_allocator.cc \ | |||
hybrid/executor/rt_callback_manager.cc \ | |||
@@ -243,15 +239,12 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
hybrid/executor/hybrid_model_executor.cc \ | |||
hybrid/executor/hybrid_model_async_executor.cc \ | |||
hybrid/executor/hybrid_execution_context.cc \ | |||
hybrid/executor/subgraph_context.cc \ | |||
hybrid/executor/subgraph_executor.cc \ | |||
hybrid/executor/worker/task_compile_engine.cc \ | |||
hybrid/executor/worker/shape_inference_engine.cc \ | |||
hybrid/executor/worker/execution_engine.cc \ | |||
hybrid/model/hybrid_model.cc \ | |||
hybrid/model/hybrid_model_builder.cc \ | |||
hybrid/model/node_item.cc \ | |||
hybrid/model/graph_item.cc \ | |||
hybrid/node_executor/aicore/aicore_node_executor.cc \ | |||
hybrid/node_executor/aicore/aicore_op_task.cc \ | |||
hybrid/node_executor/aicore/aicore_task_builder.cc \ | |||
@@ -260,9 +253,6 @@ LIBGE_LOCAL_SRC_FILES := \ | |||
hybrid/node_executor/aicpu/aicpu_node_executor.cc \ | |||
hybrid/node_executor/compiledsubgraph/known_node_executor.cc \ | |||
hybrid/node_executor/hostcpu/ge_local_node_executor.cc \ | |||
hybrid/node_executor/controlop/control_op_executor.cc \ | |||
hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc \ | |||
hybrid/node_executor/hccl/hccl_node_executor.cc \ | |||
hybrid/node_executor/node_executor.cc \ | |||
hybrid/node_executor/task_context.cc \ | |||
hybrid/hybrid_davinci_model.cc \ | |||
@@ -348,28 +338,6 @@ LOCAL_SHARED_LIBRARIES += \ | |||
include $(BUILD_HOST_SHARED_LIBRARY) | |||
#compiler for GeRunner | |||
include $(CLEAR_VARS) | |||
LOCAL_MODULE := stub/libge_runner | |||
LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DREUSE_MEMORY=1 -O2 | |||
LOCAL_CFLAGS += -DFMK_SUPPORT_DUMP -DDAVINCI_SUPPORT_PROFILING -DDAVINCI_CLOUD | |||
ifeq ($(DEBUG), 1) | |||
LOCAL_CFLAGS += -g -O0 | |||
endif | |||
LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES) | |||
LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc | |||
LOCAL_SHARED_LIBRARIES := | |||
LOCAL_LDFLAGS := -lrt -ldl | |||
include $(BUILD_HOST_SHARED_LIBRARY) | |||
# add engine_conf.json to host | |||
include $(CLEAR_VARS) | |||
@@ -439,7 +407,6 @@ LOCAL_CFLAGS += -DFMK_SUPPORT_DUMP -DDAVINCI_SUPPORT_PROFILING -DDAVINCI_CLOUD | |||
LOCAL_CFLAGS += -g -O0 | |||
LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES) | |||
LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES) | |||
LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) | |||
@@ -0,0 +1,333 @@ | |||
LOCAL_PATH := $(call my-dir) | |||
COMMON_LOCAL_SRC_FILES := \ | |||
proto/fusion_model.proto \ | |||
proto/optimizer_priority.proto \ | |||
session/inner_session.cc \ | |||
session/session_manager.cc \ | |||
common/ge/plugin_manager.cc\ | |||
common/fp16_t.cc \ | |||
common/formats/utils/formats_trans_utils.cc \ | |||
common/formats/format_transfers/datatype_transfer.cc \ | |||
common/formats/format_transfers/format_transfer_transpose.cc \ | |||
common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc \ | |||
common/formats/format_transfers/format_transfer_fractal_z.cc \ | |||
common/formats/format_transfers/format_transfer_fractal_nz.cc \ | |||
common/formats/format_transfers/format_transfer_fractal_zz.cc \ | |||
common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc \ | |||
common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc \ | |||
common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc \ | |||
common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc \ | |||
common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc \ | |||
common/formats/format_transfers/format_transfer_fracz_nchw.cc \ | |||
common/formats/format_transfers/format_transfer_fracz_nhwc.cc \ | |||
common/formats/format_transfers/format_transfer_fracz_hwcn.cc \ | |||
common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc \ | |||
common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc \ | |||
common/formats/formats.cc \ | |||
init/gelib.cc \ | |||
engine_manager/dnnengine_manager.cc \ | |||
opskernel_manager/ops_kernel_manager.cc \ | |||
graph/manager/graph_manager.cc \ | |||
graph/manager/graph_manager_utils.cc \ | |||
graph/manager/graph_context.cc \ | |||
graph/preprocess/graph_preprocess.cc \ | |||
graph/preprocess/multi_batch_copy_graph.cc \ | |||
graph/execute/graph_execute.cc \ | |||
graph/load/graph_loader.cc \ | |||
graph/optimize/graph_optimize.cc \ | |||
graph/passes/folding_pass.cc \ | |||
graph/optimize/summary_optimize.cc \ | |||
graph/build/graph_builder.cc \ | |||
graph/partition/engine_place.cc \ | |||
graph/partition/graph_partition.cc \ | |||
graph/partition/dynamic_shape_partition.cc \ | |||
generator/ge_generator.cc \ | |||
generator/generator_api.cc \ | |||
common/profiling/profiling_manager.cc \ | |||
ge_local_engine/engine/host_cpu_engine.cc \ | |||
common/helper/model_cache_helper.cc \ | |||
OMG_HOST_SRC_FILES := \ | |||
model/ge_model.cc \ | |||
model/ge_root_model.cc \ | |||
graph/common/transop_util.cc \ | |||
graph/manager/graph_var_manager.cc \ | |||
graph/manager/trans_var_data_utils.cc \ | |||
omm/csa_interact.cc \ | |||
graph/passes/pass_manager.cc \ | |||
graph/passes/pass_utils.cc \ | |||
graph/passes/base_pass.cc \ | |||
graph/passes/resource_pair_add_control_pass.cc \ | |||
graph/passes/resource_pair_remove_control_pass.cc \ | |||
graph/passes/constant_folding_pass.cc \ | |||
graph/passes/aicpu_constant_folding_pass.cc \ | |||
graph/passes/reshape_remove_pass.cc \ | |||
graph/passes/reshape_recovery_pass.cc \ | |||
graph/passes/transop_breadth_fusion_pass.cc \ | |||
graph/passes/transop_depth_fusion_pass.cc \ | |||
graph/passes/same_transdata_breadth_fusion_pass.cc \ | |||
graph/passes/transop_without_reshape_fusion_pass.cc \ | |||
graph/passes/compile_nodes_pass.cc \ | |||
graph/passes/transop_nearby_allreduce_fusion_pass.cc \ | |||
graph/passes/variable_prepare_op_pass.cc \ | |||
graph/passes/variable_ref_delete_op_pass.cc \ | |||
graph/passes/variable_ref_useless_control_out_delete_pass.cc \ | |||
graph/passes/variable_op_pass.cc \ | |||
graph/passes/cast_remove_pass.cc \ | |||
graph/passes/replace_transshape_pass.cc \ | |||
graph/passes/transpose_transdata_pass.cc \ | |||
graph/passes/identify_reference_pass.cc \ | |||
graph/passes/variable_format_pass.cc \ | |||
graph/passes/subgraph_pass.cc \ | |||
graph/passes/data_pass.cc \ | |||
graph/passes/net_output_pass.cc \ | |||
graph/passes/constant_fuse_same_pass.cc \ | |||
graph/passes/print_op_pass.cc \ | |||
graph/passes/no_use_reshape_remove_pass.cc \ | |||
graph/passes/iterator_op_pass.cc \ | |||
graph/passes/atomic_addr_clean_pass.cc \ | |||
graph/optimize/optimizer/allreduce_fusion_pass.cc \ | |||
graph/common/omg_util.cc \ | |||
graph/common/bcast.cc \ | |||
graph/passes/dimension_compute_pass.cc \ | |||
graph/passes/dimension_adjust_pass.cc \ | |||
graph/passes/get_original_format_pass.cc \ | |||
graph/passes/shape_operate_op_remove_pass.cc \ | |||
graph/passes/unused_op_remove_pass.cc \ | |||
graph/passes/assert_pass.cc \ | |||
graph/passes/dropout_pass.cc \ | |||
graph/passes/infershape_pass.cc \ | |||
graph/passes/unused_const_pass.cc \ | |||
graph/passes/isolated_op_remove_pass.cc \ | |||
graph/passes/permute_pass.cc \ | |||
graph/passes/ctrl_edge_transfer_pass.cc \ | |||
host_kernels/broadcast_gradient_args_kernel.cc \ | |||
host_kernels/greater_kernel.cc \ | |||
host_kernels/gather_v2_kernel.cc \ | |||
host_kernels/maximum_kernel.cc \ | |||
host_kernels/floormod_kernel.cc \ | |||
host_kernels/floordiv_kernel.cc \ | |||
host_kernels/range_kernel.cc \ | |||
host_kernels/shape_kernel.cc \ | |||
host_kernels/size_kernel.cc \ | |||
host_kernels/shape_n_kernel.cc \ | |||
host_kernels/rank_kernel.cc \ | |||
host_kernels/broadcast_args_kernel.cc \ | |||
host_kernels/fill_kernel.cc \ | |||
host_kernels/empty_kernel.cc \ | |||
host_kernels/expanddims_kernel.cc \ | |||
host_kernels/reshape_kernel.cc \ | |||
host_kernels/squeeze_kernel.cc \ | |||
host_kernels/kernel_utils.cc \ | |||
host_kernels/cast_kernel.cc \ | |||
host_kernels/transdata_kernel.cc \ | |||
host_kernels/transpose_kernel.cc \ | |||
host_kernels/permute_kernel.cc \ | |||
host_kernels/pack_kernel.cc \ | |||
host_kernels/concat_v2_kernel.cc \ | |||
host_kernels/concat_offset_kernel.cc \ | |||
host_kernels/strided_slice_kernel.cc \ | |||
host_kernels/ssd_prior_box_kernel.cc \ | |||
host_kernels/add_kernel.cc \ | |||
host_kernels/unpack_kernel.cc \ | |||
host_kernels/sub_kernel.cc \ | |||
host_kernels/mul_kernel.cc \ | |||
host_kernels/reduce_prod_kernel.cc \ | |||
host_kernels/rsqrt_kernel.cc \ | |||
host_kernels/slice_kernel.cc \ | |||
host_kernels/slice_d_kernel.cc \ | |||
host_kernels/dynamic_stitch_kernel.cc \ | |||
graph/passes/stop_gradient_pass.cc \ | |||
graph/passes/prevent_gradient_pass.cc \ | |||
graph/passes/identity_pass.cc \ | |||
graph/passes/placeholder_with_default_pass.cc \ | |||
graph/passes/snapshot_pass.cc \ | |||
graph/passes/guarantee_const_pass.cc \ | |||
graph/passes/var_is_initialized_op_pass.cc \ | |||
graph/passes/parallel_concat_start_op_pass.cc \ | |||
graph/passes/cast_translate_pass.cc \ | |||
graph/passes/addn_pass.cc \ | |||
graph/passes/common_subexpression_elimination_pass.cc \ | |||
graph/passes/transop_symmetry_elimination_pass.cc \ | |||
graph/passes/save_pass.cc \ | |||
graph/passes/switch_dead_branch_elimination.cc \ | |||
graph/passes/merge_pass.cc \ | |||
graph/passes/prune_pass.cc \ | |||
graph/passes/flow_ctrl_pass.cc \ | |||
graph/passes/control_trigger_pass.cc \ | |||
graph/passes/switch_data_edges_bypass.cc \ | |||
graph/passes/switch_op_pass.cc \ | |||
graph/passes/multi_batch_pass.cc \ | |||
graph/passes/switch_logic_remove_pass.cc \ | |||
graph/passes/next_iteration_pass.cc \ | |||
graph/passes/cond_pass.cc \ | |||
graph/passes/cond_remove_pass.cc \ | |||
graph/passes/for_pass.cc \ | |||
graph/passes/enter_pass.cc \ | |||
graph/passes/hccl_memcpy_pass.cc \ | |||
graph/passes/link_gen_mask_nodes_pass.cc \ | |||
graph/passes/replace_with_empty_const_pass.cc \ | |||
graph/passes/hccl_group_pass.cc \ | |||
OME_SRC_FILES := \ | |||
graph/manager/graph_mem_allocator.cc \ | |||
graph/manager/graph_caching_allocator.cc \ | |||
graph/manager/model_manager/event_manager.cc \ | |||
graph/manager/util/debug.cc \ | |||
graph/manager/util/rt_context_util.cc \ | |||
graph/manager/util/variable_accelerate_ctrl.cc \ | |||
graph/manager/util/hcom_util.cc \ | |||
graph/load/new_model_manager/model_manager.cc \ | |||
graph/load/new_model_manager/data_inputer.cc \ | |||
graph/load/new_model_manager/davinci_model.cc \ | |||
graph/load/new_model_manager/davinci_model_parser.cc \ | |||
graph/load/new_model_manager/model_utils.cc \ | |||
graph/load/new_model_manager/tbe_handle_store.cc \ | |||
graph/load/new_model_manager/cpu_queue_schedule.cc \ | |||
graph/load/new_model_manager/zero_copy_task.cc \ | |||
graph/load/output/output.cc \ | |||
graph/load/new_model_manager/data_dumper.cc \ | |||
graph/load/new_model_manager/task_info/task_info.cc \ | |||
graph/load/new_model_manager/task_info/event_record_task_info.cc \ | |||
graph/load/new_model_manager/task_info/event_wait_task_info.cc \ | |||
graph/load/new_model_manager/task_info/fusion_start_task_info.cc \ | |||
graph/load/new_model_manager/task_info/fusion_stop_task_info.cc \ | |||
graph/load/new_model_manager/task_info/hccl_task_info.cc \ | |||
graph/load/new_model_manager/task_info/kernel_ex_task_info.cc \ | |||
graph/load/new_model_manager/task_info/kernel_task_info.cc \ | |||
graph/load/new_model_manager/task_info/label_set_task_info.cc \ | |||
graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc \ | |||
graph/load/new_model_manager/task_info/label_goto_ex_task_info.cc \ | |||
graph/load/new_model_manager/task_info/memcpy_async_task_info.cc \ | |||
graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc \ | |||
graph/load/new_model_manager/task_info/profiler_trace_task_info.cc \ | |||
graph/load/new_model_manager/task_info/stream_active_task_info.cc \ | |||
graph/load/new_model_manager/task_info/stream_switch_task_info.cc \ | |||
graph/load/new_model_manager/task_info/stream_switchn_task_info.cc \ | |||
graph/load/new_model_manager/task_info/end_graph_task_info.cc \ | |||
graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc \ | |||
graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc \ | |||
single_op/task/op_task.cc \ | |||
single_op/task/build_task_utils.cc \ | |||
single_op/task/tbe_task_builder.cc \ | |||
single_op/task/aicpu_task_builder.cc \ | |||
single_op/single_op.cc \ | |||
single_op/single_op_model.cc \ | |||
single_op/stream_resource.cc \ | |||
single_op/single_op_manager.cc \ | |||
hybrid/hybrid_davinci_model_stub.cc \ | |||
COMMON_LOCAL_C_INCLUDES := \ | |||
proto/om.proto \ | |||
proto/task.proto \ | |||
proto/insert_op.proto \ | |||
proto/ge_ir.proto \ | |||
proto/fwk_adapter.proto \ | |||
proto/op_mapping_info.proto \ | |||
proto/tensorflow/attr_value.proto \ | |||
proto/tensorflow/function.proto \ | |||
proto/tensorflow/graph.proto \ | |||
proto/tensorflow/node_def.proto \ | |||
proto/tensorflow/op_def.proto \ | |||
proto/tensorflow/resource_handle.proto \ | |||
proto/tensorflow/tensor.proto \ | |||
proto/tensorflow/tensor_shape.proto \ | |||
proto/tensorflow/types.proto \ | |||
proto/tensorflow/versions.proto \ | |||
$(LOCAL_PATH) ./ \ | |||
$(TOPDIR)inc \ | |||
$(TOPDIR)inc/external \ | |||
$(TOPDIR)inc/external/graph \ | |||
$(TOPDIR)inc/framework \ | |||
$(TOPDIR)inc/framework/common \ | |||
$(TOPDIR)inc/runtime \ | |||
$(TOPDIR)libc_sec/include \ | |||
$(TOPDIR)ops/built-in/op_proto/inc \ | |||
third_party/json/include \ | |||
third_party/protobuf/include \ | |||
third_party/opencv/include \ | |||
NEW_OMG_HOST_SRC_FILES := \ | |||
graph/preprocess/insert_op/util_insert_aipp_op.cc \ | |||
graph/preprocess/insert_op/ge_aipp_op.cc \ | |||
graph/build/model_builder.cc \ | |||
graph/build/task_generator.cc \ | |||
graph/build/stream_allocator.cc \ | |||
graph/build/logical_stream_allocator.cc \ | |||
graph/build/stream_graph_optimizer.cc \ | |||
graph/build/run_context.cc \ | |||
graph/build/label_allocator.cc \ | |||
graph/label/label_maker.cc \ | |||
graph/label/if_label_maker.cc \ | |||
graph/label/case_label_maker.cc \ | |||
graph/label/while_label_maker.cc \ | |||
graph/label/partitioned_call_label_maker.cc \ | |||
#compiler for host train | |||
include $(CLEAR_VARS) | |||
LOCAL_MODULE := libge_train | |||
LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DREUSE_MEMORY=1 -O2 | |||
LOCAL_CFLAGS += -DDAVINCI_CLOUD -DDAVINCI_TRAIN -DFMK_SUPPORT_DUMP -DDAVINCI_SUPPORT_PROFILING | |||
LOCAL_CFLAGS += -DFMK_SUPPORT_DEBUG | |||
ifeq ($(DEBUG), 1) | |||
LOCAL_CFLAGS += -g -O0 | |||
endif | |||
LOCAL_C_INCLUDES := $(COMMON_LOCAL_C_INCLUDES) | |||
LOCAL_SRC_FILES := $(COMMON_LOCAL_SRC_FILES) | |||
LOCAL_SRC_FILES += $(OMG_HOST_SRC_FILES) | |||
LOCAL_SRC_FILES += $(OME_SRC_FILES) | |||
LOCAL_SRC_FILES += $(NEW_OMG_HOST_SRC_FILES) | |||
LOCAL_STATIC_LIBRARIES := libge_memory \ | |||
LOCAL_SHARED_LIBRARIES := \ | |||
libc_sec \ | |||
libprotobuf \ | |||
libslog \ | |||
libmmpa \ | |||
libgraph \ | |||
libregister \ | |||
libge_common \ | |||
libhccl \ | |||
libmsprof \ | |||
LOCAL_LDFLAGS := -lrt -ldl | |||
LOCAL_SHARED_LIBRARIES += \ | |||
libruntime \ | |||
libresource \ | |||
include $(BUILD_HOST_SHARED_LIBRARY) | |||
# add engine_conf.json to host | |||
include $(CLEAR_VARS) | |||
LOCAL_MODULE := engine_conf.json | |||
LOCAL_SRC_FILES := engine_manager/engine_conf.json | |||
LOCAL_MODULE_CLASS := ETC | |||
LOCAL_INSTALLED_PATH := $(HOST_OUT_ROOT)/engine_conf.json | |||
include $(BUILD_HOST_PREBUILT) | |||
# add optimizer_priority.pbtxt to host | |||
include $(CLEAR_VARS) | |||
LOCAL_MODULE := optimizer_priority.pbtxt | |||
LOCAL_SRC_FILES := opskernel_manager/optimizer_priority.pbtxt | |||
LOCAL_MODULE_CLASS := ETC | |||
LOCAL_INSTALLED_PATH := $(HOST_OUT_ROOT)/optimizer_priority.pbtxt | |||
include $(BUILD_HOST_PREBUILT) |
@@ -207,13 +207,6 @@ class GeGenerator::Impl { | |||
GraphManager graph_manager_; | |||
SaveParam save_param_; | |||
bool is_offline_ = true; | |||
private: | |||
static std::string Trim(const std::string &str); | |||
bool ParseVersion(const std::string &line, std::string &version); | |||
bool GetVersionFromPath(const std::string &file_path, std::string &version); | |||
bool SetAtcVersionInfo(AttrHolder &obj); | |||
bool SetOppVersionInfo(AttrHolder &obj); | |||
}; | |||
Status GeGenerator::Initialize(const map<string, string> &options) { | |||
@@ -295,124 +288,6 @@ Status GeGenerator::GenerateInfershapeGraph(const Graph &graph) { | |||
return SUCCESS; | |||
} | |||
// Remove the space and tab before and after the string | |||
std::string GeGenerator::Impl::Trim(const std::string &str) { | |||
if (str.empty()) { | |||
return str; | |||
} | |||
std::string::size_type start = str.find_first_not_of(" \t\r\n"); | |||
if (start == std::string::npos) { | |||
return str; | |||
} | |||
std::string::size_type end = str.find_last_not_of(" \t\r\n") + 1; | |||
return str.substr(start, end); | |||
} | |||
// Parsing the command line | |||
bool GeGenerator::Impl::ParseVersion(const std::string &line, std::string &version) { | |||
std::string flag = "Version="; | |||
std::string temp = Trim(line); | |||
if (temp.empty()) { | |||
GELOGW("line is empty."); | |||
return false; | |||
} | |||
std::string::size_type pos = temp.find(flag); | |||
if (pos == std::string::npos) { | |||
GELOGW("Incorrect line [%s], it must include [%s].", line.c_str(), flag.c_str()); | |||
return false; | |||
} | |||
if (temp.size() == flag.size()) { | |||
GELOGW("version information is empty. %s", line.c_str()); | |||
return false; | |||
} | |||
version = temp.substr(pos + flag.size()); | |||
GELOGI("Version=%s", version.c_str()); | |||
return true; | |||
} | |||
bool GeGenerator::Impl::GetVersionFromPath(const std::string &file_path, std::string &version) { | |||
// Normalize the path | |||
string resolved_file_path = RealPath(file_path.c_str()); | |||
if (resolved_file_path.empty()) { | |||
GELOGW("Invalid input file path [%s], make sure that the file path is correct.", file_path.c_str()); | |||
return false; | |||
} | |||
std::ifstream fs(resolved_file_path, std::ifstream::in); | |||
if (!fs.is_open()) { | |||
GELOGW("Open %s failed.", file_path.c_str()); | |||
return false; | |||
} | |||
std::string line; | |||
if (getline(fs, line)) { | |||
if (!ParseVersion(line, version)) { | |||
GELOGW("Parse version failed. content is [%s].", line.c_str()); | |||
fs.close(); | |||
return false; | |||
} | |||
} else { | |||
GELOGW("No version information found in the file path:%s", file_path.c_str()); | |||
fs.close(); | |||
return false; | |||
} | |||
fs.close(); // close the file | |||
return true; | |||
} | |||
// Set package version information in the model | |||
bool GeGenerator::Impl::SetAtcVersionInfo(AttrHolder &obj) { | |||
std::string path_base = ge::GELib::GetPath(); | |||
path_base = path_base.substr(0, path_base.rfind('/')); | |||
path_base = path_base.substr(0, path_base.rfind('/') + 1); | |||
std::string version_path = path_base + "version.info"; | |||
GELOGI("version_path is %s", version_path.c_str()); | |||
std::string version; | |||
if (!GetVersionFromPath(version_path, version)) { | |||
GELOGW("Get atc version information failed!"); | |||
return false; | |||
} | |||
// set version info | |||
if (!ge::AttrUtils::SetStr(obj, ATTR_MODEL_ATC_VERSION, version)) { | |||
GELOGW("Ge model set atc version failed!"); | |||
return false; | |||
} | |||
GELOGI("Ge model set atc version information success."); | |||
return true; | |||
} | |||
// Set package version information in the model | |||
bool GeGenerator::Impl::SetOppVersionInfo(AttrHolder &obj) { | |||
const char *path_env = std::getenv("ASCEND_OPP_PATH"); | |||
if (path_env == nullptr) { | |||
GELOGW("Get environment variable ASCEND_OPP_PATH failed!"); | |||
return false; | |||
} | |||
std::string version_path = path_env; | |||
version_path += "/version.info"; | |||
GELOGI("version_path is %s", version_path.c_str()); | |||
std::string version; | |||
if (!GetVersionFromPath(version_path, version)) { | |||
GELOGW("Get opp version information failed!"); | |||
return false; | |||
} | |||
// set version info | |||
if (!ge::AttrUtils::SetStr(obj, ATTR_MODEL_OPP_VERSION, version)) { | |||
GELOGW("Ge model set opp version failed!"); | |||
return false; | |||
} | |||
GELOGI("Ge Model set opp version information success."); | |||
return true; | |||
} | |||
Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_prefix, const vector<GeTensor> &inputs, | |||
ModelBufferData &model, bool is_offline) { | |||
rtContext_t ctx = nullptr; | |||
@@ -440,7 +315,6 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr | |||
string model_name = ""; | |||
Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(), model_name); | |||
if (name_ret != SUCCESS) { | |||
ErrorManager::GetInstance().ATCReportErrMessage("E10000", {"parameter"}, {"output"}); | |||
GELOGE(FAILED, "Get model_name failed. Param --output is invalid"); | |||
return PARAM_INVALID; | |||
} | |||
@@ -590,14 +464,6 @@ Status GeGenerator::Impl::SaveParams(GeModelPtr &ge_model, const string &type, c | |||
} | |||
Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, GeModelPtr &model, ModelBufferData &model_buff) { | |||
// set atc version | |||
if (!SetAtcVersionInfo(*(model.get()))) { | |||
GELOGW("SetPackageVersionInfo of atc failed!"); | |||
} | |||
// set opp version | |||
if (!SetOppVersionInfo(*(model.get()))) { | |||
GELOGW("SetPackageVersionInfo of ops failed!"); | |||
} | |||
ModelHelper model_helper; | |||
model_helper.SetSaveMode(is_offline_); | |||
Status ret = model_helper.SaveToOmModel(model, save_param_, file_name_prefix, model_buff); | |||
@@ -660,4 +526,5 @@ Status GeGenerator::Impl::GenerateInfershapeGraph(const Graph &graph, GraphId &g | |||
return SUCCESS; | |||
} | |||
} // namespace ge |
@@ -18,41 +18,6 @@ | |||
#define GE_GE_CALL_WRAPPER_H_ | |||
#include "framework/common/debug/ge_log.h" | |||
#define GE_TIMESTAMP_START(stage) uint64_t startUsec_##stage = ge::GetCurrentTimestap() | |||
#define GE_TIMESTAMP_END(stage, stage_name) \ | |||
do { \ | |||
uint64_t endUsec_##stage = ge::GetCurrentTimestap(); \ | |||
GELOGI("[GEPERFTRACE] The time cost of %s is [%lu] micro second.", (stage_name), \ | |||
(endUsec_##stage - startUsec_##stage)); \ | |||
} while (0); | |||
#define GE_TIMESTAMP_EVENT_END(stage, stage_name) \ | |||
do { \ | |||
uint64_t endUsec_##stage = ge::GetCurrentTimestap(); \ | |||
GEEVENT("[GEPERFTRACE] The time cost of %s is [%lu] micro second.", (stage_name), \ | |||
(endUsec_##stage - startUsec_##stage)); \ | |||
} while (0); | |||
#define GE_TIMESTAMP_CALLNUM_START(stage) \ | |||
uint64_t startUsec_##stage = ge::GetCurrentTimestap(); \ | |||
uint64_t call_num_of##stage = 0; \ | |||
uint64_t time_of##stage = 0 | |||
#define GE_TIMESTAMP_RESTART(stage) (startUsec_##stage = ge::GetCurrentTimestap()) | |||
#define GE_TIMESTAMP_ADD(stage) \ | |||
time_of##stage += ge::GetCurrentTimestap() - startUsec_##stage; \ | |||
call_num_of##stage++ | |||
#define GE_TIMESTAMP_CALLNUM_END(stage, stage_name) \ | |||
GELOGI("[GEPERFTRACE] The time cost of %s is [%lu] micro second, call num is %lu", (stage_name), time_of##stage, \ | |||
call_num_of##stage) | |||
#define GE_TIMESTAMP_CALLNUM_EVENT_END(stage, stage_name) \ | |||
GEEVENT("[GEPERFTRACE] The time cost of %s is [%lu] micro second, call num is %lu", (stage_name), time_of##stage, \ | |||
call_num_of##stage) | |||
#define RUN_WITH_TIMESTAMP_NAME(var_name, prefix, func, ...) \ | |||
do { \ | |||
GE_TIMESTAMP_START(var_name); \ | |||
@@ -64,23 +29,10 @@ | |||
} \ | |||
} while (0) | |||
#define RUN_WITH_PERF_TIMESTAMP_NAME(var_name, prefix, func, ...) \ | |||
do { \ | |||
GE_TIMESTAMP_START(var_name); \ | |||
auto ret_inner_macro = func(__VA_ARGS__); \ | |||
GE_TIMESTAMP_EVENT_END(var_name, #prefix "::" #func) \ | |||
if (ret_inner_macro != ge::SUCCESS) { \ | |||
GELOGE(ret_inner_macro, "Failed to process " #prefix "_" #func); \ | |||
return ret_inner_macro; \ | |||
} \ | |||
} while (0) | |||
#define JOIN_NAME_INNER(a, b) a##b | |||
#define JOIN_NAME(a, b) JOIN_NAME_INNER(a, b) | |||
#define COUNTER_NAME(a) JOIN_NAME(a, __COUNTER__) | |||
#define GE_RUN(prefix, func, ...) \ | |||
RUN_WITH_TIMESTAMP_NAME(COUNTER_NAME(ge_timestamp_##prefix), prefix, func, __VA_ARGS__) | |||
#define GE_RUN_PERF(prefix, func, ...) \ | |||
RUN_WITH_PERF_TIMESTAMP_NAME(COUNTER_NAME(ge_timestamp_##prefix), prefix, func, __VA_ARGS__) | |||
#endif // GE_GE_CALL_WRAPPER_H_ |
@@ -120,7 +120,7 @@ Status GraphExecutor::FreeInOutBuffer() { | |||
} | |||
} | |||
Status GraphExecutor::MallocInOutBuffer(const std::vector<uint64_t> &buffer_size, std::vector<void *> &data_addr) { | |||
Status GraphExecutor::MallocInOutBuffer(const std::vector<uint32_t> &buffer_size, std::vector<void *> &data_addr) { | |||
if (malloc_flag_) { | |||
auto all_size_same = true; | |||
if (buffer_size.size() == buffer_size_.size()) { | |||
@@ -169,7 +169,7 @@ Status GraphExecutor::PrepareInputData(const std::vector<GeTensor> &input_tensor | |||
graph_input_data.timestamp = 0; | |||
std::size_t inputSize = input_tensor.size(); | |||
std::size_t output_size = output_desc.size(); | |||
std::vector<uint64_t> bufferSizeVec; | |||
std::vector<uint32_t> bufferSizeVec; | |||
std::vector<void *> addrVec; | |||
for (std::size_t i = 0; i < inputSize; ++i) { | |||
@@ -211,7 +211,7 @@ Status GraphExecutor::PrepareInputData(const std::vector<GeTensor> &input_tensor | |||
for (std::size_t j = 0; j < output_size; j++) { | |||
auto desc = output_desc[j]; | |||
uint64_t buffer_size = desc.size; | |||
uint32_t buffer_size = desc.size; | |||
DataBuffer out_data_buf; | |||
out_data_buf.data = reinterpret_cast<uint8_t *>(addrVec[inputSize + j]); | |||
@@ -225,13 +225,6 @@ Status GraphExecutor::PrepareInputData(const std::vector<GeTensor> &input_tensor | |||
Status GraphExecutor::SyncExecuteModel(uint32_t model_id, const std::vector<GeTensor> &input_tensor, | |||
std::vector<GeTensor> &output_tensor) { | |||
auto model_manager = ge::ModelManager::GetInstance(); | |||
GE_CHECK_NOTNULL(model_manager); | |||
if (model_manager->IsDynamicShape(model_id)) { | |||
GELOGI("[ExecuteGraph] GetInputOutputDescInfo via dynamic shape model executor, modelId=%u", model_id); | |||
return model_manager->SyncExecuteModel(model_id, input_tensor, output_tensor); | |||
} | |||
// Prepare input and output | |||
std::vector<InputOutputDescInfo> inputs_desc; | |||
std::vector<InputOutputDescInfo> output_desc; | |||
@@ -582,4 +575,5 @@ Status GraphExecutor::GetAllAippInputOutputDims(uint32_t model_id, uint32_t inde | |||
return SUCCESS; | |||
} | |||
} // namespace ge |
@@ -110,7 +110,7 @@ class GraphExecutor { | |||
Status FreeInOutBuffer(); | |||
Status MallocInOutBuffer(const std::vector<uint64_t> &buffer_size, std::vector<void *> &data_addr); | |||
Status MallocInOutBuffer(const std::vector<uint32_t> &buffer_size, std::vector<void *> &data_addr); | |||
bool init_flag_; | |||
@@ -129,7 +129,7 @@ class GraphExecutor { | |||
bool malloc_flag_; | |||
std::vector<void *> buffer_addr_; | |||
std::vector<uint64_t> buffer_size_; | |||
std::vector<uint32_t> buffer_size_; | |||
}; | |||
} // namespace ge | |||
@@ -350,8 +350,7 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) { | |||
return RT_FAILED; | |||
} | |||
// Add small page memory size | |||
free = | |||
static_cast<int64_t>(free_mem + VarManager::Instance(GetContext().SessionId())->GetUseMaxMemorySize() - total_mem); | |||
free = static_cast<int64_t>(free_mem + VarManager::Instance(0)->GetUseMaxMemorySize() - total_mem); | |||
GELOGI("GetMemoryInfo free[%zu], total[%zu], return free[%ld]", free_mem, total_mem, free); | |||
return SUCCESS; | |||
} | |||
@@ -339,7 +339,7 @@ Status CpuTaskActiveEntry::Distribute() { | |||
return RT_FAILED; | |||
} | |||
GELOGI("Cpu kernel launch active entry task success."); | |||
GELOGI("Cpu kernel launch wait end task success."); | |||
return SUCCESS; | |||
} | |||
@@ -21,6 +21,7 @@ | |||
#include <utility> | |||
#include <vector> | |||
#include "common/debug/log.h" | |||
#include "common/properties_manager.h" | |||
#include "framework/common/debug/ge_log.h" | |||
#include "framework/common/util.h" | |||
@@ -36,36 +37,9 @@ | |||
namespace { | |||
const uint32_t kAicpuLoadFlag = 1; | |||
const uint32_t kAicpuUnloadFlag = 0; | |||
const int64_t kOpDebugSize = 2048; | |||
const int64_t kOpDebugShape = 2048; | |||
const int8_t kDecimal = 10; | |||
const uint32_t kAddrLen = sizeof(void *); | |||
const char *const kDumpOutput = "output"; | |||
const char *const kDumpInput = "input"; | |||
const char *const kDumpAll = "all"; | |||
// parse for format like nodename:input:index | |||
static bool ParseNameIndex(const std::string &node_name_index, std::string &node_name, std::string &input_or_output, | |||
size_t &index) { | |||
auto sep = node_name_index.rfind(':'); | |||
if (sep == std::string::npos) { | |||
return false; | |||
} | |||
auto index_str = node_name_index.substr(sep + 1); | |||
index = static_cast<size_t>(std::strtol(index_str.c_str(), nullptr, kDecimal)); | |||
auto node_name_without_index = node_name_index.substr(0, sep); | |||
sep = node_name_without_index.rfind(':'); | |||
if (sep == std::string::npos) { | |||
return false; | |||
} | |||
node_name = node_name_without_index.substr(0, sep); | |||
input_or_output = node_name_without_index.substr(sep + 1); | |||
return !(input_or_output != kDumpInput && input_or_output != kDumpOutput); | |||
} | |||
static bool IsTensorDescWithSkipDumpAddrType(bool has_mem_type_attr, vector<int64_t> v_memory_type, size_t i) { | |||
return has_mem_type_attr && (v_memory_type[i] == RT_MEMORY_L1); | |||
} | |||
} // namespace | |||
static int32_t GetIrDataType(ge::DataType data_type) { | |||
@@ -164,13 +138,6 @@ void DataDumper::SaveEndGraphId(uint32_t task_id, uint32_t stream_id) { | |||
end_graph_stream_id_ = stream_id; | |||
} | |||
void DataDumper::SaveOpDebugId(uint32_t task_id, uint32_t stream_id, void *op_debug_addr, bool is_op_debug) { | |||
op_debug_task_id_ = task_id; | |||
op_debug_stream_id_ = stream_id; | |||
op_debug_addr_ = op_debug_addr; | |||
is_op_debug_ = is_op_debug; | |||
} | |||
void DataDumper::SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr<OpDesc> &op_desc, | |||
uintptr_t args) { | |||
if (op_desc == nullptr) { | |||
@@ -235,121 +202,56 @@ static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uin | |||
} | |||
} | |||
Status DataDumper::GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vistor<GeTensorDesc> &tensor_descs, | |||
const uintptr_t &addr, size_t index) { | |||
output.set_data_type(static_cast<int32_t>(GetIrDataType(tensor_descs.at(index).GetDataType()))); | |||
output.set_format(static_cast<int32_t>(tensor_descs.at(index).GetFormat())); | |||
Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { | |||
GELOGI("Start dump output"); | |||
if (inner_dump_info.is_task) { | |||
// tbe or aicpu op | |||
const auto &output_descs = inner_dump_info.op->GetAllOutputsDesc(); | |||
const auto input_size = inner_dump_info.op->GetAllInputsDesc().size(); | |||
const std::vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, inner_dump_info.op, false); | |||
if (output_descs.size() != output_addrs.size()) { | |||
GELOGE(PARAM_INVALID, "Invalid output desc addrs size %zu, op %s has %zu output desc.", output_addrs.size(), | |||
inner_dump_info.op->GetName().c_str(), output_descs.size()); | |||
return PARAM_INVALID; | |||
} | |||
for (auto dim : tensor_descs.at(index).GetShape().GetDims()) { | |||
output.mutable_shape()->add_dim(dim); | |||
} | |||
int64_t output_size = 0; | |||
if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(index), output_size) != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "Get output size filed"); | |||
return PARAM_INVALID; | |||
} | |||
GELOGD("Get output size in dump is %ld", output_size); | |||
std::string origin_name; | |||
int32_t origin_output_index = -1; | |||
(void)AttrUtils::GetStr(&tensor_descs.at(index), ATTR_NAME_DATA_DUMP_ORIGIN_NAME, origin_name); | |||
(void)AttrUtils::GetInt(&tensor_descs.at(index), ATTR_NAME_DATA_DUMP_ORIGIN_OUTPUT_INDEX, origin_output_index); | |||
output.set_size(output_size); | |||
output.set_original_name(origin_name); | |||
output.set_original_output_index(origin_output_index); | |||
output.set_original_output_format(static_cast<int32_t>(tensor_descs.at(index).GetOriginFormat())); | |||
output.set_original_output_data_type(static_cast<int32_t>(tensor_descs.at(index).GetOriginDataType())); | |||
output.set_address(static_cast<uint64_t>(addr)); | |||
return SUCCESS; | |||
} | |||
for (size_t i = 0; i < output_descs.size(); ++i) { | |||
aicpu::dump::Output output; | |||
output.set_data_type(static_cast<int32_t>(GetIrDataType(output_descs.at(i).GetDataType()))); | |||
output.set_format(static_cast<int32_t>(output_descs.at(i).GetFormat())); | |||
Status DataDumper::DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Output &output, | |||
size_t i, const std::string &node_name_index) { | |||
std::string dump_op_name; | |||
std::string input_or_output; | |||
size_t index; | |||
// parser and find which node's input or output tensor desc is chosen for dump info | |||
if (!ParseNameIndex(node_name_index, dump_op_name, input_or_output, index)) { | |||
GELOGE(PARAM_INVALID, "Op [%s] output desc[%zu] with invalid ATTR_DATA_DUMP_REF attr[%s].", | |||
inner_dump_info.op->GetName().c_str(), i, node_name_index.c_str()); | |||
return PARAM_INVALID; | |||
} | |||
GE_CHECK_NOTNULL(compute_graph_); | |||
auto replace_node = compute_graph_->FindNode(dump_op_name); | |||
GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(replace_node == nullptr, | |||
"Op [%s] output desc[%zu] with invalid ATTR_DATA_DUMP_REF attr[%s]," | |||
" cannot find redirect node[%s].", | |||
inner_dump_info.op->GetName().c_str(), i, node_name_index.c_str(), | |||
dump_op_name.c_str()); | |||
auto replace_opdesc = replace_node->GetOpDesc(); | |||
GE_CHECK_NOTNULL(replace_opdesc); | |||
auto iter = ref_info_.find(replace_opdesc); | |||
GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(iter == ref_info_.end(), | |||
"Op [%s] output desc[%zu] cannot find any saved redirect node[%s]'s info.", | |||
inner_dump_info.op->GetName().c_str(), i, replace_opdesc->GetName().c_str()); | |||
GE_CHECK_NOTNULL(iter->second); | |||
auto addr = reinterpret_cast<uintptr_t>(iter->second); | |||
if (input_or_output == kDumpInput) { | |||
const auto &replace_input_descs = replace_opdesc->GetAllInputsDesc(); | |||
addr += kAddrLen * index; | |||
GE_CHK_STATUS_RET(GenerateOutput(output, replace_input_descs, addr, index), "Generate output failed"); | |||
} else if (input_or_output == kDumpOutput) { | |||
const auto &replace_output_descs = replace_opdesc->GetAllOutputsDesc(); | |||
const auto replace_input_size = replace_opdesc->GetAllInputsDesc().size(); | |||
addr += (index + replace_input_size) * kAddrLen; | |||
GE_CHK_STATUS_RET(GenerateOutput(output, replace_output_descs, addr, index), "Generate output failed"); | |||
} | |||
GELOGD("Op [%s] output desc[%zu] dump info is replaced by node[%s] [%s] tensor_desc [%zu]", | |||
inner_dump_info.op->GetName().c_str(), i, dump_op_name.c_str(), input_or_output.c_str(), index); | |||
return SUCCESS; | |||
} | |||
for (auto dim : output_descs.at(i).GetShape().GetDims()) { | |||
output.mutable_shape()->add_dim(dim); | |||
} | |||
Status DataDumper::DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { | |||
const auto &output_descs = inner_dump_info.op->GetAllOutputsDesc(); | |||
const std::vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, inner_dump_info.op); | |||
if (output_descs.size() != output_addrs.size()) { | |||
GELOGE(PARAM_INVALID, "Invalid output desc addrs size %zu, op %s has %zu output desc.", output_addrs.size(), | |||
inner_dump_info.op->GetName().c_str(), output_descs.size()); | |||
return PARAM_INVALID; | |||
} | |||
std::vector<int64_t> v_memory_type; | |||
bool has_mem_type_attr = ge::AttrUtils::GetListInt(inner_dump_info.op, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); | |||
GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(has_mem_type_attr && (v_memory_type.size() != output_descs.size()), | |||
"DumpOutputWithTask[%s], output size[%zu], output memory type size[%zu]", | |||
inner_dump_info.op->GetName().c_str(), output_descs.size(), | |||
v_memory_type.size()); | |||
for (size_t i = 0; i < output_descs.size(); ++i) { | |||
aicpu::dump::Output output; | |||
std::string node_name_index; | |||
const auto &output_desc = output_descs.at(i); | |||
// check dump output tensor desc is redirected by attr ATTR_DATA_DUMP_REF | |||
if (AttrUtils::GetStr(&output_desc, ATTR_DATA_DUMP_REF, node_name_index)) { | |||
GE_CHK_STATUS_RET(DumpRefOutput(inner_dump_info, output, i, node_name_index), "DumpRefOutput failed"); | |||
} else { | |||
GE_IF_BOOL_EXEC( | |||
IsTensorDescWithSkipDumpAddrType(has_mem_type_attr, v_memory_type, i), | |||
GELOGD("DumpOutputWithTask[%s] output[%zu] is l1 addr, skip it", inner_dump_info.op->GetName().c_str(), i); | |||
continue;); | |||
const auto input_size = inner_dump_info.op->GetInputsSize(); | |||
auto addr = inner_dump_info.args + (i + input_size) * kAddrLen; | |||
GE_CHK_STATUS_RET(GenerateOutput(output, output_descs, addr, i), "Generate output failed"); | |||
int64_t output_size = 0; | |||
if (TensorUtils::GetTensorSizeInBytes(output_descs.at(i), output_size) != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "Get output size filed"); | |||
return PARAM_INVALID; | |||
} | |||
GELOGI("Get output size in dump is %ld", output_size); | |||
std::string origin_name; | |||
int32_t origin_output_index = -1; | |||
(void)AttrUtils::GetStr(&output_descs.at(i), ATTR_NAME_DATA_DUMP_ORIGIN_NAME, origin_name); | |||
(void)AttrUtils::GetInt(&output_descs.at(i), ATTR_NAME_DATA_DUMP_ORIGIN_OUTPUT_INDEX, origin_output_index); | |||
GE_IF_BOOL_EXEC(output_size <= 0, GELOGE(PARAM_INVALID, "Output size %ld is less than zero", output_size); | |||
return PARAM_INVALID) | |||
output.set_size(output_size); | |||
output.set_original_name(origin_name); | |||
output.set_original_output_index(origin_output_index); | |||
output.set_original_output_format(static_cast<int32_t>(output_descs.at(i).GetOriginFormat())); | |||
output.set_original_output_data_type(static_cast<int32_t>(output_descs.at(i).GetOriginDataType())); | |||
output.set_address(static_cast<uint64_t>(inner_dump_info.args + (i + input_size) * sizeof(void *))); | |||
task.mutable_output()->Add(std::move(output)); | |||
} | |||
task.mutable_output()->Add(std::move(output)); | |||
return SUCCESS; | |||
} | |||
return SUCCESS; | |||
} | |||
Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { | |||
GELOGI("Start dump output"); | |||
if (inner_dump_info.is_task) { | |||
// tbe or aicpu op, these ops are with task | |||
return DumpOutputWithTask(inner_dump_info, task); | |||
} | |||
// else data, const or variable op | |||
aicpu::dump::Output output; | |||
auto output_tensor = inner_dump_info.op->GetOutputDescPtr(inner_dump_info.output_anchor_index); | |||
const std::vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, inner_dump_info.op); | |||
const std::vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, inner_dump_info.op, false); | |||
if (output_tensor == nullptr) { | |||
GELOGE(PARAM_INVALID, "output_tensor is null, index: %d, size: %zu.", inner_dump_info.output_anchor_index, | |||
inner_dump_info.op->GetOutputsSize()); | |||
@@ -367,6 +269,9 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump: | |||
int32_t origin_output_index = -1; | |||
(void)AttrUtils::GetStr(output_tensor, ATTR_NAME_DATA_DUMP_ORIGIN_NAME, origin_name); | |||
(void)AttrUtils::GetInt(output_tensor, ATTR_NAME_DATA_DUMP_ORIGIN_OUTPUT_INDEX, origin_output_index); | |||
GE_IF_BOOL_EXEC(inner_dump_info.data_size <= 0, | |||
GELOGE(PARAM_INVALID, "The size of data %ld is less than zero", inner_dump_info.data_size); | |||
return PARAM_INVALID) | |||
output.set_size(inner_dump_info.data_size); | |||
output.set_original_name(origin_name); | |||
output.set_original_output_index(origin_output_index); | |||
@@ -377,7 +282,7 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump: | |||
GELOGE(FAILED, "Index is out of range."); | |||
return FAILED; | |||
} | |||
auto data_addr = inner_dump_info.args + kAddrLen * static_cast<uint32_t>(inner_dump_info.input_anchor_index); | |||
auto data_addr = inner_dump_info.args + sizeof(void *) * static_cast<uint32_t>(inner_dump_info.input_anchor_index); | |||
output.set_address(static_cast<uint64_t>(data_addr)); | |||
task.mutable_output()->Add(std::move(output)); | |||
@@ -385,98 +290,37 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump: | |||
return SUCCESS; | |||
} | |||
Status DataDumper::GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor<GeTensorDesc> &tensor_descs, | |||
const uintptr_t &addr, size_t index) { | |||
input.set_data_type(static_cast<int32_t>(GetIrDataType(tensor_descs.at(index).GetDataType()))); | |||
input.set_format(static_cast<int32_t>(tensor_descs.at(index).GetFormat())); | |||
for (auto dim : tensor_descs.at(index).GetShape().GetDims()) { | |||
input.mutable_shape()->add_dim(dim); | |||
} | |||
int64_t input_size = 0; | |||
if (AttrUtils::GetInt(tensor_descs.at(index), ATTR_NAME_INPUT_ORIGIN_SIZE, input_size)) { | |||
GELOGI("Get aipp input size according to attr is %ld", input_size); | |||
} else if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(index), input_size) != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "Get input size filed"); | |||
return PARAM_INVALID; | |||
} | |||
GELOGD("Get input size in dump is %ld", input_size); | |||
input.set_size(input_size); | |||
input.set_address(static_cast<uint64_t>(addr)); | |||
return SUCCESS; | |||
} | |||
Status DataDumper::DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Input &input, size_t i, | |||
const std::string &node_name_index) { | |||
std::string dump_op_name; | |||
std::string input_or_output; | |||
size_t index; | |||
// parser and find which node's input or output tensor desc is chosen for dump info | |||
if (!ParseNameIndex(node_name_index, dump_op_name, input_or_output, index)) { | |||
GELOGE(PARAM_INVALID, "Op [%s] input desc[%zu] with invalid ATTR_DATA_DUMP_REF attr[%s].", | |||
inner_dump_info.op->GetName().c_str(), i, node_name_index.c_str()); | |||
return PARAM_INVALID; | |||
} | |||
GE_CHECK_NOTNULL(compute_graph_); | |||
auto replace_node = compute_graph_->FindNode(dump_op_name); | |||
GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(replace_node == nullptr, | |||
"Op [%s] input desc[%zu] with invalid ATTR_DATA_DUMP_REF attr[%s]," | |||
" cannot find redirect node[%s].", | |||
inner_dump_info.op->GetName().c_str(), i, node_name_index.c_str(), | |||
dump_op_name.c_str()); | |||
auto replace_opdesc = replace_node->GetOpDesc(); | |||
GE_CHECK_NOTNULL(replace_opdesc); | |||
auto iter = ref_info_.find(replace_opdesc); | |||
GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(iter == ref_info_.end(), | |||
"Op [%s] input desc[%zu] cannot find any saved redirect node[%s]'s info.", | |||
inner_dump_info.op->GetName().c_str(), i, replace_opdesc->GetName().c_str()); | |||
GE_CHECK_NOTNULL(iter->second); | |||
auto addr = reinterpret_cast<uintptr_t>(iter->second); | |||
if (input_or_output == kDumpInput) { | |||
const auto &replace_input_descs = replace_opdesc->GetAllInputsDesc(); | |||
addr += kAddrLen * index; | |||
GE_CHK_STATUS_RET(GenerateInput(input, replace_input_descs, addr, index), "Generate input failed"); | |||
} else if (input_or_output == kDumpOutput) { | |||
const auto &replace_output_descs = replace_opdesc->GetAllOutputsDesc(); | |||
const auto replace_input_size = replace_opdesc->GetAllInputsDesc().size(); | |||
addr += (index + replace_input_size) * kAddrLen; | |||
GE_CHK_STATUS_RET(GenerateInput(input, replace_output_descs, addr, index), "Generate input failed"); | |||
} | |||
GELOGD("Op [%s] input desc[%zu] dump info is replaced by node[%s] [%s] tensor_desc [%zu]", | |||
inner_dump_info.op->GetName().c_str(), i, dump_op_name.c_str(), input_or_output.c_str(), index); | |||
return SUCCESS; | |||
} | |||
Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { | |||
GELOGI("Start dump input"); | |||
const auto &input_descs = inner_dump_info.op->GetAllInputsDesc(); | |||
const std::vector<void *> input_addrs = ModelUtils::GetInputDataAddrs(runtime_param_, inner_dump_info.op); | |||
const std::vector<void *> input_addrs = ModelUtils::GetInputDataAddrs(runtime_param_, inner_dump_info.op, false); | |||
if (input_descs.size() != input_addrs.size()) { | |||
GELOGE(PARAM_INVALID, "Invalid input desc addrs size %zu, op %s has %zu input desc.", input_addrs.size(), | |||
inner_dump_info.op->GetName().c_str(), input_descs.size()); | |||
return PARAM_INVALID; | |||
} | |||
std::vector<int64_t> v_memory_type; | |||
bool has_mem_type_attr = ge::AttrUtils::GetListInt(inner_dump_info.op, ATTR_NAME_INPUT_MEM_TYPE_LIST, v_memory_type); | |||
GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(has_mem_type_attr && (v_memory_type.size() != input_descs.size()), | |||
"DumpInput[%s], input size[%zu], input memory type size[%zu]", | |||
inner_dump_info.op->GetName().c_str(), input_descs.size(), v_memory_type.size()); | |||
for (size_t i = 0; i < input_descs.size(); ++i) { | |||
aicpu::dump::Input input; | |||
std::string node_name_index; | |||
// check dump input tensor desc is redirected by attr ATTR_DATA_DUMP_REF | |||
if (AttrUtils::GetStr(&input_descs.at(i), ATTR_DATA_DUMP_REF, node_name_index)) { | |||
GE_CHK_STATUS_RET(DumpRefInput(inner_dump_info, input, i, node_name_index), "DumpRefInput failed"); | |||
// normal dump without attr | |||
} else { | |||
GE_IF_BOOL_EXEC(IsTensorDescWithSkipDumpAddrType(has_mem_type_attr, v_memory_type, i), | |||
GELOGD("DumpInput[%s] input[%zu] is l1 addr, skip it", inner_dump_info.op->GetName().c_str(), i); | |||
continue;); | |||
auto addr = inner_dump_info.args + kAddrLen * i; | |||
GE_CHK_STATUS_RET(GenerateInput(input, input_descs, addr, i), "Generate input failed"); | |||
input.set_data_type(static_cast<int32_t>(GetIrDataType(input_descs.at(i).GetDataType()))); | |||
input.set_format(static_cast<int32_t>(input_descs.at(i).GetFormat())); | |||
for (auto dim : input_descs.at(i).GetShape().GetDims()) { | |||
input.mutable_shape()->add_dim(dim); | |||
} | |||
int64_t input_size = 0; | |||
if (AttrUtils::GetInt(&input_descs.at(i), ATTR_NAME_INPUT_ORIGIN_SIZE, input_size)) { | |||
GELOGI("Get aipp input size according to attr is %ld", input_size); | |||
} else if (TensorUtils::GetTensorSizeInBytes(input_descs.at(i), input_size) != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "Get input size filed"); | |||
return PARAM_INVALID; | |||
} | |||
GELOGI("Get input size in dump is %ld", input_size); | |||
GE_IF_BOOL_EXEC(input_size <= 0, GELOGE(PARAM_INVALID, "Input size %ld is less than zero", input_size); | |||
return PARAM_INVALID;) | |||
input.set_size(input_size); | |||
input.set_address(static_cast<uint64_t>(inner_dump_info.args + sizeof(void *) * i)); | |||
task.mutable_input()->Add(std::move(input)); | |||
} | |||
return SUCCESS; | |||
@@ -556,38 +400,36 @@ Status DataDumper::ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_ | |||
GELOGI("UnloadDumpInfo success, proto size is: %zu.", proto_size); | |||
return SUCCESS; | |||
} | |||
Status DataDumper::LoadDumpInfo() { | |||
std::string dump_list_key; | |||
PrintCheckLog(dump_list_key); | |||
if (op_list_.empty()) { | |||
GELOGW("op_list_ is empty"); | |||
return SUCCESS; | |||
} | |||
aicpu::dump::OpMappingInfo op_mapping_info; | |||
auto dump_path = dump_properties_.GetDumpPath() + std::to_string(device_id_) + "/"; | |||
op_mapping_info.set_dump_path(dump_path); | |||
auto dump_path = PropertiesManager::Instance().GetDumpOutputPath(); | |||
op_mapping_info.set_dump_path(PropertiesManager::Instance().GetDumpOutputPath() + std::to_string(device_id_) + "/"); | |||
op_mapping_info.set_model_name(dump_list_key); | |||
op_mapping_info.set_model_id(model_id_); | |||
op_mapping_info.set_flag(kAicpuLoadFlag); | |||
op_mapping_info.set_dump_step(dump_properties_.GetDumpStep()); | |||
op_mapping_info.set_dump_step(PropertiesManager::Instance().GetDumpStep()); | |||
SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info); | |||
GELOGI("Dump step is %s and dump path is %s in load dump info", dump_properties_.GetDumpStep().c_str(), | |||
GELOGI("Dump step is %s and dump path is %s in load dump info", PropertiesManager::Instance().GetDumpStep().c_str(), | |||
dump_path.c_str()); | |||
for (const auto &op_iter : op_list_) { | |||
auto op_desc = op_iter.op; | |||
GELOGD("Op %s in model %s begin to add task in op_mapping_info", op_desc->GetName().c_str(), dump_list_key.c_str()); | |||
aicpu::dump::Task task; | |||
auto op_desc = op_iter.op; | |||
task.set_end_graph(false); | |||
task.set_task_id(op_iter.task_id); | |||
task.set_stream_id(op_iter.stream_id); | |||
task.mutable_op()->set_op_name(op_desc->GetName()); | |||
task.mutable_op()->set_op_type(op_desc->GetType()); | |||
if (dump_properties_.GetDumpMode() == kDumpOutput) { | |||
if (PropertiesManager::Instance().GetDumpMode() == kDumpOutput) { | |||
if (DumpOutput(op_iter, task) != SUCCESS) { | |||
GELOGE(FAILED, "Dump output failed"); | |||
return FAILED; | |||
@@ -595,7 +437,7 @@ Status DataDumper::LoadDumpInfo() { | |||
op_mapping_info.mutable_task()->Add(std::move(task)); | |||
continue; | |||
} | |||
if (dump_properties_.GetDumpMode() == kDumpInput) { | |||
if (PropertiesManager::Instance().GetDumpMode() == kDumpInput) { | |||
if (op_iter.is_task) { | |||
if (DumpInput(op_iter, task) != SUCCESS) { | |||
GELOGE(FAILED, "Dump input failed"); | |||
@@ -605,7 +447,7 @@ Status DataDumper::LoadDumpInfo() { | |||
op_mapping_info.mutable_task()->Add(std::move(task)); | |||
continue; | |||
} | |||
if (dump_properties_.GetDumpMode() == kDumpAll) { | |||
if (PropertiesManager::Instance().GetDumpMode() == kDumpAll) { | |||
auto ret = DumpOutput(op_iter, task); | |||
if (ret != SUCCESS) { | |||
GELOGE(FAILED, "Dump output failed when in dumping all"); | |||
@@ -625,22 +467,19 @@ Status DataDumper::LoadDumpInfo() { | |||
SetEndGraphIdToAicpu(end_graph_task_id_, end_graph_stream_id_, op_mapping_info); | |||
SetOpDebugIdToAicpu(op_debug_task_id_, op_debug_stream_id_, op_debug_addr_, op_mapping_info); | |||
if (!op_list_.empty() || is_op_debug_) { | |||
auto ret = ExecuteLoadDumpInfo(op_mapping_info); | |||
if (ret != SUCCESS) { | |||
GELOGE(FAILED, "Execute load dump info failed"); | |||
return FAILED; | |||
} | |||
auto ret = ExecuteLoadDumpInfo(op_mapping_info); | |||
if (ret != SUCCESS) { | |||
GELOGE(FAILED, "Execute load dump info failed"); | |||
return FAILED; | |||
} | |||
return SUCCESS; | |||
} | |||
void DataDumper::SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, | |||
aicpu::dump::OpMappingInfo &op_mapping_info) { | |||
if (dump_properties_.GetDumpMode() == kDumpOutput || dump_properties_.GetDumpMode() == kDumpInput || | |||
dump_properties_.GetDumpMode() == kDumpAll) { | |||
if (PropertiesManager::Instance().GetDumpMode() == kDumpOutput || | |||
PropertiesManager::Instance().GetDumpMode() == kDumpInput || | |||
PropertiesManager::Instance().GetDumpMode() == kDumpAll) { | |||
GELOGI("Add end_graph_info to aicpu, task_id is %u, stream_id is %u", end_graph_task_id_, end_graph_stream_id_); | |||
aicpu::dump::Task task; | |||
task.set_end_graph(true); | |||
@@ -652,37 +491,6 @@ void DataDumper::SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, | |||
} | |||
} | |||
void DataDumper::SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void *op_debug_addr, | |||
aicpu::dump::OpMappingInfo &op_mapping_info) { | |||
if (is_op_debug_) { | |||
GELOGI("add op_debug_info to aicpu, task_id is %u, stream_id is %u", task_id, stream_id); | |||
aicpu::dump::Task task; | |||
task.set_end_graph(false); | |||
task.set_task_id(task_id); | |||
task.set_stream_id(stream_id); | |||
task.mutable_op()->set_op_name(NODE_NAME_OP_DEBUG); | |||
task.mutable_op()->set_op_type(OP_TYPE_OP_DEBUG); | |||
// set output | |||
aicpu::dump::Output output; | |||
output.set_data_type(DT_UINT8); | |||
output.set_format(FORMAT_ND); | |||
output.mutable_shape()->add_dim(kOpDebugShape); | |||
output.set_original_name(NODE_NAME_OP_DEBUG); | |||
output.set_original_output_index(0); | |||
output.set_original_output_format(FORMAT_ND); | |||
output.set_original_output_data_type(DT_UINT8); | |||
// due to lhisi virtual addr bug, cannot use args now | |||
output.set_address(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(op_debug_addr))); | |||
output.set_size(kOpDebugSize); | |||
task.mutable_output()->Add(std::move(output)); | |||
op_mapping_info.mutable_task()->Add(std::move(task)); | |||
} | |||
} | |||
Status DataDumper::UnloadDumpInfo() { | |||
if (!load_flag_) { | |||
GELOGI("No need to UnloadDumpInfo."); | |||
@@ -709,17 +517,15 @@ Status DataDumper::UnloadDumpInfo() { | |||
} | |||
void DataDumper::PrintCheckLog(string &dump_list_key) { | |||
std::set<std::string> model_list = dump_properties_.GetAllDumpModel(); | |||
std::set<std::string> model_list = PropertiesManager::Instance().GetAllDumpModel(); | |||
if (model_list.empty()) { | |||
GELOGI("No model need dump."); | |||
return; | |||
} | |||
GELOGI("%zu op need dump in %s.", op_list_.size(), model_name_.c_str()); | |||
bool not_find_by_omname = model_list.find(om_name_) == model_list.end(); | |||
bool not_find_by_modelname = model_list.find(model_name_) == model_list.end(); | |||
dump_list_key = not_find_by_omname ? model_name_ : om_name_; | |||
GELOGI("%zu op need dump in %s.", op_list_.size(), dump_list_key.c_str()); | |||
if (model_list.find(DUMP_ALL_MODEL) == model_list.end()) { | |||
if (not_find_by_omname && not_find_by_modelname) { | |||
std::string model_list_str; | |||
@@ -727,12 +533,12 @@ void DataDumper::PrintCheckLog(string &dump_list_key) { | |||
model_list_str += "[" + model + "]."; | |||
} | |||
GELOGW("Model %s will not be set to dump, dump list: %s", dump_list_key.c_str(), model_list_str.c_str()); | |||
GELOGW("Model %s will not be set to dump, dump list: %s", model_name_.c_str(), model_list_str.c_str()); | |||
return; | |||
} | |||
} | |||
std::set<std::string> config_dump_op_list = dump_properties_.GetPropertyValue(dump_list_key); | |||
dump_list_key = not_find_by_omname ? model_name_ : om_name_; | |||
std::set<std::string> config_dump_op_list = PropertiesManager::Instance().GetDumpPropertyValue(dump_list_key); | |||
std::set<std::string> dump_op_list; | |||
for (auto &inner_dump_info : op_list_) { | |||
// oplist value OpDescPtr is not nullptr | |||
@@ -23,9 +23,7 @@ | |||
#include <vector> | |||
#include "framework/common/ge_inner_error_codes.h" | |||
#include "common/properties_manager.h" | |||
#include "graph/node.h" | |||
#include "graph/compute_graph.h" | |||
#include "proto/ge_ir.pb.h" | |||
#include "proto/op_mapping_info.pb.h" | |||
#include "runtime/mem.h" | |||
@@ -46,9 +44,7 @@ class DataDumper { | |||
device_id_(0), | |||
global_step_(0), | |||
loop_per_iter_(0), | |||
loop_cond_(0), | |||
compute_graph_(nullptr), | |||
ref_info_() {} | |||
loop_cond_(0) {} | |||
~DataDumper(); | |||
@@ -60,10 +56,6 @@ class DataDumper { | |||
void SetDeviceId(uint32_t device_id) { device_id_ = device_id; } | |||
void SetComputeGraph(const ComputeGraphPtr &compute_graph) { compute_graph_ = compute_graph; }; | |||
void SetRefInfo(const std::map<OpDescPtr, void *> &ref_info) { ref_info_ = ref_info; }; | |||
void SetLoopAddr(void *global_step, void *loop_per_iter, void *loop_cond); | |||
void SaveDumpInput(const std::shared_ptr<Node> &node); | |||
@@ -73,15 +65,11 @@ class DataDumper { | |||
void SaveEndGraphId(uint32_t task_id, uint32_t stream_id); | |||
void SetOmName(const std::string &om_name) { om_name_ = om_name; } | |||
void SaveOpDebugId(uint32_t task_id, uint32_t stream_id, void *op_debug_addr, bool is_op_debug); | |||
Status LoadDumpInfo(); | |||
Status UnloadDumpInfo(); | |||
void SetDumpProperties(const DumpProperties &dump_properties) { dump_properties_ = dump_properties; } | |||
const DumpProperties &GetDumpProperties() const { return dump_properties_; } | |||
private: | |||
void ReleaseDevMem(void **ptr) noexcept; | |||
@@ -109,32 +97,12 @@ class DataDumper { | |||
uintptr_t global_step_; | |||
uintptr_t loop_per_iter_; | |||
uintptr_t loop_cond_; | |||
ComputeGraphPtr compute_graph_; | |||
std::map<OpDescPtr, void *> ref_info_; | |||
uint32_t op_debug_task_id_ = 0; | |||
uint32_t op_debug_stream_id_ = 0; | |||
void *op_debug_addr_ = nullptr; | |||
bool is_op_debug_ = false; | |||
DumpProperties dump_properties_; | |||
Status DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); | |||
Status DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Output &output, size_t i, | |||
const std::string &node_name_index); | |||
Status DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); | |||
Status DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); | |||
Status DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Input &input, size_t i, | |||
const std::string &node_name_index); | |||
Status ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info); | |||
void SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, aicpu::dump::OpMappingInfo &op_mapping_info); | |||
void SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void *op_debug_addr, | |||
aicpu::dump::OpMappingInfo &op_mapping_info); | |||
Status ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info); | |||
Status GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor<GeTensorDesc> &tensor_descs, | |||
const uintptr_t &addr, size_t index); | |||
Status GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vistor<GeTensorDesc> &tensor_descs, | |||
const uintptr_t &addr, size_t index); | |||
}; | |||
struct DataDumper::InnerDumpInfo { | |||
uint32_t task_id; | |||
@@ -29,7 +29,6 @@ | |||
#include "common/helper/om_file_helper.h" | |||
#include "common/opskernel/ge_task_info.h" | |||
#include "common/types.h" | |||
#include "common/properties_manager.h" | |||
#include "framework/common/util.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
#include "graph/load/new_model_manager/data_dumper.h" | |||
@@ -48,10 +47,6 @@ | |||
#include "task_info/task_info.h" | |||
namespace ge { | |||
// op debug need 2048 bits buffer | |||
const size_t kOpDebugMemorySize = 2048UL; | |||
const size_t kDebugP2pSize = 8UL; | |||
typedef enum tagModelProcStage { | |||
MODEL_LOAD_START = 1, | |||
MODEL_LOAD_END, | |||
@@ -176,6 +171,13 @@ class DavinciModel { | |||
// get session id | |||
uint64_t SessionId() const { return runtime_param_.session_id; } | |||
vector<OpDescPtr> GetOpDesc() { | |||
vector<OpDescPtr> opDescVector; | |||
GE_IF_BOOL_EXEC(AttrUtils::GetListOpDesc(GetGeModel(), MODEL_ATTR_FUSION_MODEL_DEF, opDescVector), | |||
GELOGI("get opDesc of opDescVector")); | |||
return opDescVector; | |||
} | |||
// get model priority | |||
int32_t Priority() const { return priority_; } | |||
@@ -246,9 +248,15 @@ class DavinciModel { | |||
/// | |||
Format GetFormat(); | |||
rtModel_t GetRtModelHandle() const { return rt_model_handle_; } | |||
rtModel_t GetRtModelHandle() { | |||
rtModel_t res = rt_model_handle_; | |||
return res; | |||
} | |||
rtStream_t GetRtModelStream() const { return rt_model_stream_; } | |||
rtStream_t GetRtModelStream() { | |||
rtModel_t res = rt_model_stream_; | |||
return res; | |||
} | |||
uint64_t GetRtBaseAddr() const { return runtime_param_.logic_mem_base; } | |||
@@ -287,7 +295,7 @@ class DavinciModel { | |||
/// @param [out] batch_info | |||
/// @return execute result | |||
/// | |||
Status GetDynamicBatchInfo(std::vector<std::vector<int64_t>> &batch_info) const; | |||
Status GetDynamicBatchInfo(std::vector<std::vector<int64_t>> &batch_info); | |||
void GetCurShape(std::vector<int64_t> &batch_info); | |||
@@ -336,9 +344,10 @@ class DavinciModel { | |||
/// | |||
/// @ingroup ge | |||
/// @brief dump all op input and output information | |||
/// @return void | |||
/// @param [in] op_list model_id | |||
/// @return Status | |||
/// | |||
void DumpOpInputOutput(); | |||
Status DumpOpInputOutput(); | |||
/// | |||
/// @ingroup ge | |||
@@ -394,9 +403,7 @@ class DavinciModel { | |||
/// | |||
uint32_t GetDeviceId() const { return device_id_; } | |||
bool NeedDestroyAicpuKernel() const { return need_destroy_aicpu_kernel_; } | |||
Status UpdateSessionId(uint64_t session_id); | |||
GeModelPtr GetGeModel() { return ge_model_; } | |||
const RuntimeParam &GetRuntimeParam() { return runtime_param_; } | |||
@@ -456,19 +463,6 @@ class DavinciModel { | |||
void *cur_args = static_cast<char *>(args_) + offset; | |||
return cur_args; | |||
} | |||
void SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_size); | |||
int64_t GetFixedAddrsSize(string tensor_name); | |||
void *GetCurrentFixedAddr(int64_t offset) const { | |||
void *cur_addr = static_cast<char *>(fixed_addrs_) + offset; | |||
return cur_addr; | |||
} | |||
uint32_t GetFixedAddrOutputIndex(string tensor_name) { | |||
if (tensor_name_to_peer_output_index_.find(tensor_name) != tensor_name_to_peer_output_index_.end()) { | |||
return tensor_name_to_peer_output_index_[tensor_name]; | |||
} | |||
return UINT32_MAX; | |||
} | |||
void SetKnownNode(bool known_node) { known_node_ = known_node; } | |||
bool IsKnownNode() { return known_node_; } | |||
Status MallocKnownArgs(); | |||
@@ -483,9 +477,6 @@ class DavinciModel { | |||
// om file name | |||
void SetOmName(string om_name) { om_name_ = om_name; } | |||
void SetDumpProperties(const DumpProperties &dump_properties) { data_dumper_.SetDumpProperties(dump_properties); } | |||
const DumpProperties &GetDumpProperties() const { return data_dumper_.GetDumpProperties(); } | |||
private: | |||
// memory address of weights | |||
uint8_t *weights_mem_base_; | |||
@@ -502,6 +493,8 @@ class DavinciModel { | |||
struct timeInfo time_info_; | |||
int32_t dataInputTid; | |||
void InitZeroCopyUtil(bool is_dynamic_batch, bool &input_zero_copy, bool &output_zero_copy); | |||
/// | |||
/// @ingroup ge | |||
/// @brief Save Batch label Info. | |||
@@ -537,13 +530,6 @@ class DavinciModel { | |||
/// | |||
bool CheckInputAndModelSize(const int64_t &input_size, const int64_t &op_size, bool is_dynamic); | |||
/// | |||
/// @ingroup ge | |||
/// @brief Set copy only for No task feed NetOutput address. | |||
/// @return None. | |||
/// | |||
void SetCopyOnlyOutput(); | |||
/// | |||
/// @ingroup ge | |||
/// @brief Copy Input/Output to model for direct use. | |||
@@ -569,10 +555,14 @@ class DavinciModel { | |||
Status CopyInputData(const InputData &input_data, bool device_data = false); | |||
Status CopyOutputData(uint32_t data_id, OutputData &output_data, rtMemcpyKind_t kind); | |||
Status CopyOutputData(uint32_t data_id, OutputData &output_data); | |||
Status CopyOutputDataToUser(OpDescPtr &op_desc, std::vector<DataBuffer> &blobs, uint32_t &data_index); | |||
Status SyncVarData(); | |||
Status SyncDataAndDump(); | |||
Status InitModelMem(void *dev_ptr, size_t memsize, void *weight_ptr, size_t weightsize); | |||
void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input); | |||
@@ -599,12 +589,7 @@ class DavinciModel { | |||
bool IsAicpuKernelConnectSpecifiedLayer(); | |||
/// | |||
/// @ingroup ge | |||
/// @brief Reduce memory usage after task sink. | |||
/// @return: void | |||
/// | |||
void Shrink(); | |||
Status MarkSpecifiedAicpuKernel(); | |||
/// | |||
/// @ingroup ge | |||
@@ -740,9 +725,10 @@ class DavinciModel { | |||
/// | |||
/// @ingroup ge | |||
/// @brief definiteness queue schedule, active original model stream. | |||
/// @param [in] streams: streams will active by S0. | |||
/// @return: 0 for success / others for fail | |||
/// | |||
Status CpuActiveStream(); | |||
Status CpuActiveStream(const std::vector<rtStream_t> &stream_list); | |||
/// | |||
/// @ingroup ge | |||
@@ -760,9 +746,6 @@ class DavinciModel { | |||
/// | |||
Status CpuModelRepeat(); | |||
Status InitEntryTask(); | |||
Status AddHeadStream(); | |||
/// | |||
/// @ingroup ge | |||
/// @brief set ts device. | |||
@@ -770,10 +753,6 @@ class DavinciModel { | |||
/// | |||
Status SetTSDevice(); | |||
Status OpDebugRegister(); | |||
void OpDebugUnRegister(); | |||
void CheckHasHcomOp(); | |||
Status DoTaskSink(); | |||
@@ -781,17 +760,17 @@ class DavinciModel { | |||
void CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result); | |||
Status TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id); | |||
Status CopyVarData(ComputeGraphPtr &graph); | |||
// get desc info of graph for profiling | |||
Status GetComputeGraphInfo(const ComputeGraphPtr &graph, vector<ComputeGraphDescInfo> &graph_desc_info); | |||
Status GetComputeGraphInfo(vector<ComputeGraphDescInfo> &compute_graph_desc_info); | |||
void SetDataDumperArgs(const ComputeGraphPtr &compute_graph); | |||
void SetDataDumperArgs(); | |||
Status GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data, | |||
std::vector<ge::OutputTensorInfo> &outputs); | |||
void ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_info); | |||
void GetFixedAddrAttr(const OpDescPtr &op_desc); | |||
bool is_model_has_inited_; | |||
uint32_t model_id_; | |||
@@ -804,9 +783,6 @@ class DavinciModel { | |||
uint32_t version_; | |||
GeModelPtr ge_model_; | |||
bool need_destroy_aicpu_kernel_{false}; | |||
vector<std::string> out_node_name_; | |||
map<uint32_t, OpDescPtr> op_list_; | |||
// data op_desc | |||
@@ -867,11 +843,6 @@ class DavinciModel { | |||
bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_. | |||
bool is_pure_head_stream_{false}; | |||
rtStream_t rt_head_stream_{nullptr}; | |||
rtStream_t rt_entry_stream_{nullptr}; | |||
rtAicpuDeployType_t deploy_type_{AICPU_DEPLOY_RESERVED}; | |||
// ACL queue schedule, save queue ids for Init. | |||
std::vector<TaskInfoPtr> cpu_task_list_; | |||
std::vector<uint32_t> input_queue_ids_; // input queue ids created by caller. | |||
@@ -893,6 +864,8 @@ class DavinciModel { | |||
std::vector<rtStream_t> active_stream_list_; | |||
std::set<uint32_t> active_stream_indication_; | |||
std::shared_ptr<domi::ModelTaskDef> model_task_def_; | |||
std::set<uint32_t> aicpu_streams_; | |||
std::set<uint32_t> hcom_streams_; | |||
RuntimeParam runtime_param_; | |||
@@ -904,39 +877,22 @@ class DavinciModel { | |||
// for profiling task and graph info | |||
std::map<uint32_t, std::string> op_name_map_; | |||
std::vector<TaskDescInfo> task_desc_info_; | |||
ComputeGraphPtr compute_graph_; | |||
int64_t maxDumpOpNum_; | |||
// for data dump | |||
DataDumper data_dumper_; | |||
uint64_t iterator_count_; | |||
bool is_l1_fusion_enable_; | |||
std::map<OpDescPtr, void *> saved_task_addrs_; | |||
bool known_node_ = false; | |||
uint32_t total_args_size_ = 0; | |||
void *args_ = nullptr; | |||
void *args_host_ = nullptr; | |||
void *fixed_addrs_ = nullptr; | |||
int64_t total_fixed_addr_size_ = 0; | |||
std::map<const void *, void *> knonw_input_data_info_; | |||
std::map<const void *, void *> knonw_output_data_info_; | |||
vector<vector<int64_t>> batch_info_; | |||
vector<uint64_t> batch_size_; | |||
// key: input tensor name, generally rts op; | |||
// value: the fixed addr of input anchor, same as the peer output anchor addr of the peer op | |||
std::map<string, int64_t> tensor_name_to_fixed_addr_size_; | |||
// key: input tensor name, generally rts op; value: the peer output anchor of the peer op | |||
std::map<string, int64_t> tensor_name_to_peer_output_index_; | |||
// if model is first execute | |||
bool is_first_execute_; | |||
// for op debug | |||
std::mutex debug_reg_mutex_; | |||
bool is_op_debug_reg_ = false; | |||
void *op_debug_addr_ = nullptr; | |||
void *p2p_debug_addr_ = nullptr; | |||
bool is_new_model_desc_{false}; | |||
}; | |||
} // namespace ge | |||
@@ -22,9 +22,8 @@ | |||
#include "common/profiling/profiling_manager.h" | |||
#include "common/properties_manager.h" | |||
#include "framework/common/debug/ge_log.h" | |||
#include "framework/common/util.h" | |||
#include "graph/common/ge_call_wrapper.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
#include "framework/common/util.h" | |||
#include "graph/load/new_model_manager/davinci_model.h" | |||
#include "graph/load/new_model_manager/davinci_model_parser.h" | |||
#include "model/ge_root_model.h" | |||
@@ -34,10 +33,9 @@ thread_local uint32_t device_count = 0; | |||
namespace { | |||
const int kCmdParSize = 2; | |||
const int kDumpCmdPairSize = 2; | |||
const char *const kNeedDestroySpecifiedAicpuKernel = "need_destroy_specified_aicpu_kernel"; | |||
} // namespace | |||
DumpProperties ModelManager::dump_properties_; | |||
std::shared_ptr<ModelManager> ModelManager::GetInstance() { | |||
static const std::shared_ptr<ModelManager> instance_ptr = | |||
shared_ptr<ModelManager>(new (std::nothrow) ModelManager(), ModelManager::FinalizeForPtr); | |||
@@ -274,10 +272,6 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::Ge | |||
davinci_model->SetId(model_id); | |||
davinci_model->SetDeviceId(GetContext().DeviceId()); | |||
const DumpProperties &dump_properties = PropertiesManager::Instance().GetDumpProperties(GetContext().SessionId()); | |||
davinci_model->SetDumpProperties(dump_properties); | |||
dump_properties_ = dump_properties; | |||
auto root_graph = ge_root_model->GetRootGraph(); | |||
GE_CHECK_NOTNULL(root_graph); | |||
string root_model_name = root_graph->GetName(); | |||
@@ -302,6 +296,9 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::Ge | |||
davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 1000 * 1000 * 1000 + | |||
timespec.tv_nsec)); // 1000 ^ 3 converts second to nanosecond | |||
davinci_model->SetProfileTime(MODEL_LOAD_END); | |||
if (davinci_model->SinkModelProfile() != SUCCESS) { | |||
GELOGW("Sink model profile failed."); | |||
} | |||
} | |||
} while (0); | |||
@@ -614,10 +611,10 @@ Status ModelManager::HandleDumpCommand(const Command &command) { | |||
GELOGE(PARAM_INVALID, "parser dump model failed"); | |||
return FAILED; | |||
} | |||
GELOGI("dump model = %s.", dump_model.c_str()); | |||
GELOGI("dump status = %s.", dump_model.c_str()); | |||
if (dump_status == "off" || dump_status == "OFF") { | |||
dump_properties_.DeletePropertyValue(dump_model); | |||
PropertiesManager::Instance().DeleteDumpPropertyValue(dump_model); | |||
return SUCCESS; | |||
} | |||
@@ -634,10 +631,9 @@ Status ModelManager::HandleDumpCommand(const Command &command) { | |||
return FAILED; | |||
} | |||
if (!dump_path.empty() && dump_path[dump_path.size() - 1] != '/') { | |||
dump_path = dump_path + "/"; | |||
dump_path = dump_path + "/" + CurrentTimeInStr() + "/"; | |||
} | |||
dump_path = dump_path + CurrentTimeInStr() + "/"; | |||
GELOGI("dump path = %s.", dump_path.c_str()); | |||
GELOGI("dump status = %s.", dump_path.c_str()); | |||
ret = ParserPara(command, DUMP_MODE, dump_mode); | |||
if (ret != SUCCESS) { | |||
@@ -646,10 +642,20 @@ Status ModelManager::HandleDumpCommand(const Command &command) { | |||
} | |||
GELOGI("dump mode = %s", dump_mode.c_str()); | |||
dump_properties_.AddPropertyValue(dump_model, dump_layers); | |||
dump_properties_.SetDumpPath(dump_path); | |||
dump_properties_.SetDumpMode(dump_mode); | |||
auto iter_dump_mode = std::find(command.cmd_params.begin(), command.cmd_params.end(), DUMP_MODE); | |||
if (iter_dump_mode != command.cmd_params.end()) { | |||
++iter_dump_mode; | |||
if (iter_dump_mode == command.cmd_params.end()) { | |||
GELOGE(PARAM_INVALID, "Invalid access."); | |||
return PARAM_INVALID; | |||
} | |||
dump_mode = *iter_dump_mode; | |||
GELOGI("dump mode = %s", dump_mode.c_str()); | |||
} | |||
PropertiesManager::Instance().AddDumpPropertyValue(dump_model, dump_layers); | |||
PropertiesManager::Instance().SetDumpOutputPath(dump_path); | |||
PropertiesManager::Instance().SetDumpMode(dump_mode); | |||
return SUCCESS; | |||
} | |||
@@ -765,6 +771,17 @@ Status ModelManager::GenSessionId(uint64_t &session_id) { | |||
return SUCCESS; | |||
} | |||
Status ModelManager::UpdateSessionId(std::shared_ptr<DavinciModel> &davinci_model, uint64_t session_id) { | |||
GeModelPtr ge_model_current = davinci_model->GetGeModel(); | |||
GE_CHECK_NOTNULL(ge_model_current); | |||
if (!ge::AttrUtils::SetInt(ge_model_current, ge::MODEL_ATTR_SESSION_ID, static_cast<int64_t>(session_id))) { | |||
GELOGW("Set attr[%s] failed in updating session_id.", MODEL_ATTR_SESSION_ID.c_str()); | |||
} | |||
GELOGD("Update session id: %lu.", session_id); | |||
return SUCCESS; | |||
} | |||
Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model, shared_ptr<ModelListener> listener, | |||
void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { | |||
GE_CHK_BOOL_RET_STATUS(model.key.empty() || access(model.key.c_str(), F_OK) == 0, PARAM_INVALID, | |||
@@ -807,7 +824,6 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model | |||
} | |||
davinci_model->SetDeviceId(device_id); | |||
davinci_model->SetOmName(model.om_name); | |||
davinci_model->SetDumpProperties(dump_properties_); | |||
/// In multi-threaded inference, using the same session_id among multiple threads may cause some threads to fail. | |||
/// These session_ids come from the same model, so the values of session_id are the same. | |||
@@ -815,7 +831,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model | |||
uint64_t new_session_id; | |||
ret = GenSessionId(new_session_id); | |||
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "Generate session_id for infer failed."); | |||
ret = davinci_model->UpdateSessionId(new_session_id); | |||
ret = UpdateSessionId(davinci_model, new_session_id); | |||
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "Update session_id for infer failed."); | |||
ret = davinci_model->Init(dev_ptr, mem_size, weight_ptr, weight_size); | |||
@@ -830,6 +846,9 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model | |||
davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 1000 * 1000 * 1000 + | |||
timespec.tv_nsec)); // 1000 ^ 3 converts second to nanosecond | |||
davinci_model->SetProfileTime(MODEL_LOAD_END); | |||
if (davinci_model->SinkModelProfile() != SUCCESS) { | |||
GELOGW("Sink model profile failed."); | |||
} | |||
} | |||
GE_IF_BOOL_EXEC(ret == SUCCESS, device_count++); | |||
@@ -879,7 +898,7 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d | |||
uint64_t new_session_id; | |||
ret = GenSessionId(new_session_id); | |||
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Generate session_id for infer failed."); | |||
ret = davinci_model->UpdateSessionId(new_session_id); | |||
ret = UpdateSessionId(davinci_model, new_session_id); | |||
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Update session_id for infer failed."); | |||
GenModelId(&model_id); | |||
@@ -890,8 +909,6 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d | |||
return ret; | |||
} | |||
davinci_model->SetDumpProperties(dump_properties_); | |||
ret = davinci_model->Init(); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "init model failed."); | |||
@@ -918,8 +935,12 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy | |||
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id); | |||
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid Model ID %u to start! ", model_id); | |||
if (davinci_model->NeedDestroyAicpuKernel()) { | |||
GELOGI("Start to destroy specified aicpu kernel."); | |||
GeModelPtr ge_model_current = davinci_model->GetGeModel(); | |||
bool need_destroy_aicpu_kernel = false; | |||
bool result = ge::AttrUtils::GetBool(ge_model_current, kNeedDestroySpecifiedAicpuKernel, need_destroy_aicpu_kernel); | |||
if (result && need_destroy_aicpu_kernel) { | |||
GELOGI("Get attr %s successfully, start to destroy specified aicpu kernel.", kNeedDestroySpecifiedAicpuKernel); | |||
// Zero copy is enabled by default, no need to judge. | |||
uint64_t session_id_davinci = davinci_model->GetSessionId(); | |||
uint32_t model_id_davinci = davinci_model->GetModelId(); | |||
@@ -1029,19 +1050,4 @@ Status ModelManager::GetAllAippInputOutputDims(uint32_t model_id, uint32_t index | |||
return davinci_model->GetAllAippInputOutputDims(index, input_dims, output_dims); | |||
} | |||
bool ModelManager::IsDynamicShape(uint32_t model_id) { | |||
auto model = GetHybridModel(model_id); | |||
return model != nullptr; | |||
} | |||
ge::Status ModelManager::SyncExecuteModel(uint32_t model_id, const vector<GeTensor> &inputs, | |||
vector<GeTensor> &outputs) { | |||
auto model = GetHybridModel(model_id); | |||
if (model == nullptr) { | |||
GELOGE(FAILED, "Hybrid model not found. model id = %u.", model_id); | |||
return FAILED; | |||
} | |||
return model->Execute(inputs, outputs); | |||
} | |||
} // namespace ge |
@@ -31,7 +31,6 @@ | |||
#include "common/ge_types.h" | |||
#include "common/helper/model_helper.h" | |||
#include "common/helper/om_file_helper.h" | |||
#include "common/properties_manager.h" | |||
#include "common/types.h" | |||
#include "ge/ge_api_types.h" | |||
#include "graph/ge_context.h" | |||
@@ -142,8 +141,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||
ge::Status ExecuteModel(uint32_t model_id, rtStream_t stream, bool async_mode, const InputData &input_data, | |||
OutputData &output_data); | |||
ge::Status SyncExecuteModel(uint32_t model_id, const std::vector<GeTensor> &inputs, std::vector<GeTensor> &outputs); | |||
/// | |||
/// @ingroup domi_ome | |||
/// @brief model stop | |||
@@ -252,8 +249,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||
ge::Status GetAllAippInputOutputDims(uint32_t model_id, uint32_t index, std::vector<InputOutputDims> &input_dims, | |||
std::vector<InputOutputDims> &output_dims); | |||
bool IsDynamicShape(uint32_t model_id); | |||
private: | |||
/// | |||
/// @ingroup domi_ome | |||
@@ -281,6 +276,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||
ge::Status DeleteModel(uint32_t id); | |||
void GenModelId(uint32_t *id); | |||
ge::Status UpdateSessionId(std::shared_ptr<DavinciModel> &davinci_model, uint64_t session_id); | |||
std::map<uint32_t, std::shared_ptr<DavinciModel>> model_map_; | |||
std::map<uint32_t, std::shared_ptr<hybrid::HybridDavinciModel>> hybrid_model_map_; | |||
@@ -291,8 +287,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||
std::mutex session_id_create_mutex_; | |||
uint64_t session_id_bias_; | |||
std::set<uint64_t> sess_ids_; | |||
static DumpProperties dump_properties_; | |||
}; | |||
} // namespace ge | |||
@@ -31,7 +31,7 @@ | |||
namespace ge { | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get input size. | |||
/// @return vector<uint32_t> | |||
/// | |||
@@ -43,26 +43,22 @@ vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { | |||
const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | |||
for (size_t i = 0; i < inputs_size; ++i) { | |||
const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(i); | |||
if (tensor_desc == nullptr) { | |||
GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); | |||
continue; | |||
} | |||
int64_t tensor_size = 0; | |||
if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) { | |||
// TBE: add weights size to input | |||
GE_CHK_STATUS(TensorUtils::GetSize(*tensor_desc, tensor_size)); | |||
GeTensorDesc tensor_desc = op_desc->GetInputDesc(i); | |||
int64_t tensor_size = 0; | |||
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); | |||
if (tensor_size) { | |||
v_input_size.push_back(tensor_size); | |||
} | |||
continue; | |||
} | |||
int64_t tensor_size = 0; | |||
GE_IF_BOOL_EXEC( | |||
TensorUtils::GetSize(*tensor_desc, tensor_size) != GRAPH_SUCCESS, | |||
TensorUtils::GetSize(op_desc->GetInputDesc(i), tensor_size) != GRAPH_SUCCESS, | |||
GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||
continue); | |||
continue;); | |||
v_input_size.push_back(tensor_size); | |||
} | |||
@@ -71,7 +67,7 @@ vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get output size. | |||
/// @return vector<uint32_t> | |||
/// | |||
@@ -86,17 +82,11 @@ vector<int64_t> ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { | |||
return v_output_size;); | |||
for (size_t i = 0; i < outputs_size; ++i) { | |||
const GeTensorDescPtr tensor_desc = op_desc->MutableOutputDesc(i); | |||
if (tensor_desc == nullptr) { | |||
GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); | |||
continue; | |||
} | |||
int64_t tensor_size = 0; | |||
GE_IF_BOOL_EXEC( | |||
TensorUtils::GetSize(*tensor_desc, tensor_size) != GRAPH_SUCCESS, | |||
TensorUtils::GetSize(op_desc->GetOutputDesc(i), tensor_size) != GRAPH_SUCCESS, | |||
GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i); | |||
continue); | |||
continue;); | |||
v_output_size.push_back(tensor_size); | |||
} | |||
@@ -105,7 +95,7 @@ vector<int64_t> ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get workspace size. | |||
/// @return vector<uint32_t> | |||
/// | |||
@@ -128,7 +118,7 @@ vector<int64_t> ModelUtils::GetWorkspaceSize(ConstOpDescPtr op_desc) { | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get weight size. | |||
/// @return vector<uint32_t> | |||
/// | |||
@@ -152,14 +142,8 @@ vector<int64_t> ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) { | |||
const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | |||
for (size_t i = 0; i < inputs_size; ++i) { | |||
if ((i < v_is_input_const.size()) && v_is_input_const[i]) { | |||
const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(i); | |||
if (tensor_desc == nullptr) { | |||
GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); | |||
continue; | |||
} | |||
int64_t tensor_size = 0; | |||
(void)TensorUtils::GetSize(*tensor_desc, tensor_size); | |||
(void)TensorUtils::GetSize(op_desc->GetInputDesc(i), tensor_size); | |||
v_weight_size.push_back(tensor_size); | |||
} | |||
} | |||
@@ -168,7 +152,7 @@ vector<int64_t> ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) { | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get weights. | |||
/// @return vector<ConstGeTensorPtr> | |||
/// | |||
@@ -192,14 +176,9 @@ vector<ConstGeTensorPtr> ModelUtils::GetWeights(ConstOpDescPtr op_desc) { | |||
const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | |||
for (size_t i = 0; i < inputs_size; ++i) { | |||
if ((i < v_is_input_const.size()) && v_is_input_const[i]) { | |||
const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(i); | |||
if (tensor_desc == nullptr) { | |||
GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); | |||
continue; | |||
} | |||
ConstGeTensorPtr weight = nullptr; | |||
if (AttrUtils::GetTensor(*tensor_desc, ATTR_NAME_WEIGHTS, weight)) { | |||
GeTensorDesc tensor_desc = op_desc->GetInputDesc(i); | |||
if (AttrUtils::GetTensor(tensor_desc, ATTR_NAME_WEIGHTS, weight)) { | |||
v_weights.push_back(weight); | |||
} | |||
} | |||
@@ -209,7 +188,7 @@ vector<ConstGeTensorPtr> ModelUtils::GetWeights(ConstOpDescPtr op_desc) { | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get AiCpuOp Input descriptor. | |||
/// @return vector<::tagCcAICPUTensor> | |||
/// | |||
@@ -226,25 +205,20 @@ vector<::tagCcAICPUTensor> ModelUtils::GetInputDescs(ConstOpDescPtr op_desc) { | |||
continue; | |||
} | |||
const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(i); | |||
if (tensor_desc == nullptr) { | |||
GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); | |||
continue; | |||
} | |||
uint32_t dim_cnt = 0; | |||
GE_CHK_BOOL_EXEC_WARN(TensorUtils::GetRealDimCnt(*tensor_desc, dim_cnt) == GRAPH_SUCCESS, continue, | |||
const auto &descriptor = op_desc->GetInputDesc(i); | |||
GE_CHK_BOOL_EXEC_WARN(TensorUtils::GetRealDimCnt(descriptor, dim_cnt) == GRAPH_SUCCESS, continue, | |||
"Get dim_cnt failed"); | |||
opTensor_t tmp; | |||
uint32_t tmp_fmt = tensor_desc->GetFormat(); | |||
uint32_t tmp_fmt = descriptor.GetFormat(); | |||
tmp.format = tagOpTensorFormat(tmp_fmt); | |||
tmp.dim_cnt = static_cast<int32_t>(dim_cnt); | |||
uint32_t tmp_type = tensor_desc->GetDataType(); | |||
uint32_t tmp_type = descriptor.GetDataType(); | |||
tmp.data_type = tagOpDataType(tmp_type); | |||
for (int32_t j = 0; j < 4; j++) { // 4 dims | |||
tmp.dim[j] = (j < tmp.dim_cnt ? tensor_desc->GetShape().GetDim(j) : 1); | |||
tmp.dim[j] = (j < tmp.dim_cnt ? descriptor.GetShape().GetDim(j) : 1); | |||
} | |||
v_input_descs.push_back(tmp); | |||
@@ -254,7 +228,7 @@ vector<::tagCcAICPUTensor> ModelUtils::GetInputDescs(ConstOpDescPtr op_desc) { | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get AiCpuOp Output descriptor. | |||
/// @return vector<::tagCcAICPUTensor> | |||
/// | |||
@@ -266,25 +240,20 @@ vector<::tagCcAICPUTensor> ModelUtils::GetOutputDescs(ConstOpDescPtr op_desc) { | |||
// init op output opTensor_t struct | |||
const size_t output_num = op_desc->GetOutputsSize(); | |||
for (size_t i = 0; i < output_num; ++i) { | |||
const GeTensorDescPtr tensor_desc = op_desc->MutableOutputDesc(i); | |||
if (tensor_desc == nullptr) { | |||
GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); | |||
continue; | |||
} | |||
uint32_t dim_cnt = 0; | |||
GE_CHK_BOOL_EXEC_WARN(TensorUtils::GetRealDimCnt(*tensor_desc, dim_cnt) == GRAPH_SUCCESS, continue, | |||
const auto &descriptor = op_desc->GetOutputDesc(i); | |||
GE_CHK_BOOL_EXEC_WARN(TensorUtils::GetRealDimCnt(descriptor, dim_cnt) == GRAPH_SUCCESS, continue, | |||
"Get dim_cnt failed"); | |||
opTensor_t tmp; | |||
uint32_t tmp_fmt = tensor_desc->GetFormat(); | |||
uint32_t tmp_fmt = descriptor.GetFormat(); | |||
tmp.format = tagOpTensorFormat(tmp_fmt); | |||
tmp.dim_cnt = static_cast<int32_t>(dim_cnt); | |||
uint32_t tmp_type = tensor_desc->GetDataType(); | |||
uint32_t tmp_type = descriptor.GetDataType(); | |||
tmp.data_type = tagOpDataType(tmp_type); | |||
for (int32_t j = 0; j < 4; j++) { // 4 dims | |||
tmp.dim[j] = (j < tmp.dim_cnt ? tensor_desc->GetShape().GetDim(j) : 1); | |||
tmp.dim[j] = (j < tmp.dim_cnt ? descriptor.GetShape().GetDim(j) : 1); | |||
} | |||
v_output_descs.push_back(tmp); | |||
@@ -294,14 +263,44 @@ vector<::tagCcAICPUTensor> ModelUtils::GetOutputDescs(ConstOpDescPtr op_desc) { | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get input data address. | |||
/// @return vector<void*> | |||
/// | |||
vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc) { | |||
vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, | |||
bool need_convert) { | |||
vector<void *> v_input_data_addr; // init as:buf_base + op_def_->input(i)); | |||
GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_data_addr); | |||
uint64_t session_id = model_param.session_id; | |||
uint8_t *mem_base = model_param.mem_base; | |||
uint8_t *var_base = model_param.var_base; | |||
uint8_t *weight_base = model_param.weight_base; | |||
const uint64_t logic_mem_base = 0; | |||
uint64_t logic_weight_base = 0; | |||
uint64_t logic_var_base = model_param.logic_var_base; | |||
uint64_t mem_size = model_param.mem_size; | |||
uint64_t weight_size = model_param.weight_size; | |||
uint64_t var_size = model_param.var_size; | |||
if (need_convert) { | |||
Status status = ConvertVirtualAddressToPhysical(mem_base, mem_size, mem_base); | |||
if (status != SUCCESS) { | |||
GELOGE(RT_FAILED, "Convert virtual address to physical for mem_base failed."); | |||
return v_input_data_addr; | |||
} | |||
status = ConvertVirtualAddressToPhysical(weight_base, weight_size, weight_base); | |||
if (status != SUCCESS) { | |||
GELOGE(RT_FAILED, "Convert virtual address to physical for weight_base failed."); | |||
return v_input_data_addr; | |||
} | |||
status = ConvertVirtualAddressToPhysical(var_base, var_size, var_base); | |||
if (status != SUCCESS) { | |||
GELOGE(RT_FAILED, "Convert virtual address to physical for var_base failed."); | |||
return v_input_data_addr; | |||
} | |||
} | |||
const size_t inputs_size = op_desc->GetInputsSize(); | |||
const vector<int64_t> v_input_offset = op_desc->GetInputOffset(); | |||
@@ -320,18 +319,13 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||
for (size_t i = 0; i < inputs_size; ++i) { | |||
if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) { | |||
// TBE: add weights address to input | |||
const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(i); | |||
if (tensor_desc == nullptr) { | |||
GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); | |||
continue; | |||
} | |||
GeTensorDesc tensor_desc = op_desc->GetInputDesc(i); | |||
int64_t tensor_size = 0; | |||
GE_CHK_STATUS(TensorUtils::GetSize(*tensor_desc, tensor_size)); | |||
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); | |||
if (tensor_size) { | |||
int64_t data_offset = 0; | |||
GE_CHK_STATUS(TensorUtils::GetDataOffset(*tensor_desc, data_offset)); | |||
uint8_t *weight_addr = model_param.weight_base + data_offset; | |||
GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, data_offset)); | |||
uint8_t *weight_addr = static_cast<uint8_t *>(weight_base + data_offset - logic_weight_base); | |||
v_input_data_addr.push_back(weight_addr); | |||
GELOGI("[IMAS]GetInputDataAddrs graph_%u type[C] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, | |||
op_desc->GetName().c_str(), i, weight_addr); | |||
@@ -346,13 +340,17 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||
int64_t input_offset = v_input_offset[non_const_index]; | |||
non_const_index++; | |||
GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset), | |||
uint8_t *variable_addr = model_param.var_base + input_offset - model_param.logic_var_base; | |||
GE_IF_BOOL_EXEC(var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset), | |||
uint8_t *variable_addr = var_base + input_offset - logic_var_base; | |||
v_input_data_addr.push_back(variable_addr); | |||
GELOGI("[IMAS]GetInputDataAddrs graph_%u type[V] name[%s] input[%lu] memaddr[%p]", | |||
model_param.graph_id, op_desc->GetName().c_str(), i, variable_addr); | |||
continue); | |||
continue;); | |||
bool input_tensor = false; | |||
GE_IF_BOOL_EXEC(TensorUtils::GetInputTensor(op_desc->GetOutputDesc(i), input_tensor) != GRAPH_SUCCESS, | |||
GELOGW("get size from TensorDesc failed, op: %s, input index: %zu", op_desc->GetName().c_str(), i); | |||
continue;); | |||
// feature maps | |||
uint8_t *mem_addr = nullptr; | |||
// fusion | |||
@@ -360,7 +358,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||
mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(input_offset)); | |||
v_input_data_addr.push_back(mem_addr); | |||
} else { | |||
mem_addr = model_param.mem_base + input_offset; | |||
mem_addr = static_cast<uint8_t *>(mem_base + input_offset - logic_mem_base); | |||
v_input_data_addr.push_back(mem_addr); | |||
} | |||
GELOGI("[IMAS]GetInputDataAddrs graph_%u type[F] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, | |||
@@ -371,20 +369,41 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get output data address. | |||
/// @return vector<void*> | |||
/// | |||
vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc) { | |||
vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, | |||
bool need_convert) { | |||
vector<void *> v_output_data_addr; // init as:buf_base + op_def_->output(i) | |||
GE_CHECK_NOTNULL_EXEC(op_desc, return v_output_data_addr); | |||
uint64_t session_id = model_param.session_id; | |||
uint8_t *mem_base = model_param.mem_base; | |||
uint8_t *var_base = model_param.var_base; | |||
const uint64_t logic_mem_base = 0; | |||
uint64_t logic_var_base = model_param.logic_var_base; | |||
uint64_t mem_size = model_param.mem_size; | |||
uint64_t var_size = model_param.var_size; | |||
if (need_convert) { | |||
Status status = ConvertVirtualAddressToPhysical(mem_base, mem_size, mem_base); | |||
if (status != SUCCESS) { | |||
GELOGE(RT_FAILED, "Convert virtual address to physical for mem_base failed."); | |||
return v_output_data_addr; | |||
} | |||
status = ConvertVirtualAddressToPhysical(var_base, var_size, var_base); | |||
if (status != SUCCESS) { | |||
GELOGE(RT_FAILED, "Convert virtual address to physical for var_base failed."); | |||
return v_output_data_addr; | |||
} | |||
} | |||
const size_t outputs_size = op_desc->GetOutputsSize(); | |||
const vector<int64_t> v_output_offset = op_desc->GetOutputOffset(); | |||
GE_IF_BOOL_EXEC(v_output_offset.size() != outputs_size, | |||
GELOGW("Output param invalid: output_offset=%zu, outputs=%zu.", v_output_offset.size(), outputs_size); | |||
return v_output_data_addr); | |||
return v_output_data_addr;); | |||
vector<int64_t> v_memory_type; | |||
bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); | |||
if (has_mem_type_attr && (v_memory_type.size() != outputs_size)) { | |||
@@ -394,12 +413,12 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C | |||
return v_output_data_addr; | |||
} | |||
for (size_t i = 0; i < outputs_size; ++i) { | |||
GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(v_output_offset[i]), | |||
uint8_t *variable_addr = model_param.var_base + v_output_offset[i] - model_param.logic_var_base; | |||
GE_IF_BOOL_EXEC(var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(v_output_offset[i]), | |||
uint8_t *variable_addr = static_cast<uint8_t *>(var_base + v_output_offset[i] - logic_var_base); | |||
v_output_data_addr.push_back(variable_addr); | |||
GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[V] name[%s] output[%zu] memaddr[%p]", | |||
model_param.graph_id, op_desc->GetName().c_str(), i, variable_addr); | |||
continue); | |||
continue;); | |||
// feature maps | |||
uint8_t *mem_addr = nullptr; | |||
// fusion | |||
@@ -407,7 +426,7 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C | |||
mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_output_offset[i])); | |||
v_output_data_addr.push_back(mem_addr); | |||
} else { | |||
mem_addr = static_cast<uint8_t *>(model_param.mem_base + v_output_offset[i]); | |||
mem_addr = static_cast<uint8_t *>(mem_base + v_output_offset[i] - logic_mem_base); | |||
v_output_data_addr.push_back(mem_addr); | |||
} | |||
GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[F] name[%s] output[%zu] memaddr[%p]", model_param.graph_id, | |||
@@ -417,13 +436,24 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get workspace data address. | |||
/// @return vector<void*> | |||
/// | |||
vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc) { | |||
vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, | |||
bool need_convert) { | |||
vector<void *> v_workspace_data_addr; | |||
GE_CHECK_NOTNULL_EXEC(op_desc, return v_workspace_data_addr); | |||
uint8_t *mem_base = model_param.mem_base; | |||
uint64_t mem_size = model_param.mem_size; | |||
if (need_convert) { | |||
Status status = ConvertVirtualAddressToPhysical(mem_base, mem_size, mem_base); | |||
if (status != SUCCESS) { | |||
GELOGE(RT_FAILED, "Convert virtual address to physical for mem_base failed."); | |||
return v_workspace_data_addr; | |||
} | |||
} | |||
const vector<int64_t> v_workspace_offset = op_desc->GetWorkspace(); | |||
const vector<int64_t> v_workspace_bytes = op_desc->GetWorkspaceBytes(); | |||
@@ -436,13 +466,13 @@ vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param | |||
bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, v_memory_type); | |||
for (size_t i = 0; i < v_workspace_bytes.size(); ++i) { | |||
if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { | |||
v_workspace_data_addr.push_back(reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i]))); | |||
v_workspace_data_addr.push_back(reinterpret_cast<uint8_t *>(v_workspace_offset[i])); | |||
GELOGI("Fusion: op: %s, GetWorkspaceDataAddrs mem_addr[workspace index %zu]:%p", op_desc->GetName().c_str(), i, | |||
reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i]))); | |||
} else { | |||
int64_t workspace_offset = v_workspace_offset[i]; | |||
int64_t workspace_bytes = v_workspace_bytes[i]; | |||
uint8_t *mem_addr = workspace_bytes == 0 ? nullptr : model_param.mem_base + workspace_offset; | |||
uint8_t *mem_addr = workspace_bytes == 0 ? nullptr : mem_base + workspace_offset; | |||
v_workspace_data_addr.push_back(mem_addr); | |||
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] memaddr[%p]", | |||
model_param.graph_id, op_desc->GetName().c_str(), i, workspace_offset, workspace_bytes, mem_addr); | |||
@@ -452,32 +482,21 @@ vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param | |||
return v_workspace_data_addr; | |||
} | |||
/// | |||
/// @ingroup ge | |||
/// @brief Get runtime memory address. | |||
/// @return Status | |||
/// | |||
Status ModelUtils::GetRtAddress(const RuntimeParam ¶m, uintptr_t logic_addr, uint8_t *&mem_addr) { | |||
uint8_t *runtime_base_addr = nullptr; | |||
if ((param.logic_mem_base <= logic_addr) && (logic_addr < param.logic_mem_base + param.mem_size)) { | |||
runtime_base_addr = param.mem_base - param.logic_mem_base; | |||
GELOGI("The logic addr:0x%lx is data address, base:0x%lx, size:%lu", logic_addr, param.logic_mem_base, | |||
param.mem_size); | |||
} else if ((param.logic_weight_base <= logic_addr) && (logic_addr < param.logic_weight_base + param.weight_size)) { | |||
runtime_base_addr = param.weight_base - param.logic_weight_base; | |||
GELOGI("The logic addr:0x%lx is weight address, base:0x%lx, size:%lu", logic_addr, param.logic_weight_base, | |||
param.weight_size); | |||
} else if ((param.logic_var_base <= logic_addr) && (logic_addr < param.logic_var_base + param.var_size)) { | |||
runtime_base_addr = param.var_base - param.logic_var_base; | |||
GELOGI("The logic addr:0x%lx is variable address, base:0x%lx, size:%lu", logic_addr, param.logic_var_base, | |||
param.var_size); | |||
} else if (logic_addr != 0) { | |||
mem_addr = nullptr; | |||
GELOGE(PARAM_INVALID, "The logic addr:0x%lx is abnormal", logic_addr); | |||
return PARAM_INVALID; | |||
Status ModelUtils::ConvertVirtualAddressToPhysical(uint8_t *virtual_address, uint64_t size, | |||
uint8_t *&physical_address) { | |||
// Indicates whether use physical address. | |||
const char *use_physical_address = std::getenv("GE_USE_PHYSICAL_ADDRESS"); | |||
if (use_physical_address == nullptr || virtual_address == 0 || size == 0) { | |||
return SUCCESS; | |||
} | |||
rtError_t ret = rtKernelConfigTransArg(virtual_address, size, 0, reinterpret_cast<void **>(&physical_address)); | |||
if (ret != RT_ERROR_NONE) { | |||
GELOGE(RT_FAILED, "Call rtKernelConfigTransArg failed, ret: 0x%X", ret); | |||
return RT_FAILED; | |||
} | |||
mem_addr = runtime_base_addr + logic_addr; | |||
GELOGD("virtual_address=%p, physical_address=%p", virtual_address, physical_address); | |||
return SUCCESS; | |||
} | |||
} // namespace ge |
@@ -34,79 +34,78 @@ class ModelUtils { | |||
~ModelUtils() = default; | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get input size. | |||
/// @return vector<uint32_t> | |||
/// | |||
static vector<int64_t> GetInputSize(ConstOpDescPtr op_desc); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get output size. | |||
/// @return vector<uint32_t> | |||
/// | |||
static vector<int64_t> GetOutputSize(ConstOpDescPtr op_desc); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get workspace size. | |||
/// @return vector<uint32_t> | |||
/// | |||
static vector<int64_t> GetWorkspaceSize(ConstOpDescPtr op_desc); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get weight size. | |||
/// @return vector<uint32_t> | |||
/// | |||
static vector<int64_t> GetWeightSize(ConstOpDescPtr op_desc); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get weights. | |||
/// @return vector<ConstGeTensorPtr> | |||
/// | |||
static vector<ConstGeTensorPtr> GetWeights(ConstOpDescPtr op_desc); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get AiCpuOp Input descriptor. | |||
/// @return vector<::tagCcAICPUTensor> | |||
/// | |||
static vector<::tagCcAICPUTensor> GetInputDescs(ConstOpDescPtr op_desc); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get AiCpuOp Output descriptor. | |||
/// @return vector<::tagCcAICPUTensor> | |||
/// | |||
static vector<::tagCcAICPUTensor> GetOutputDescs(ConstOpDescPtr op_desc); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get input data address. | |||
/// @return vector<void*> | |||
/// | |||
static vector<void *> GetInputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc); | |||
static vector<void *> GetInputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, | |||
bool need_convert = true); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get output data address. | |||
/// @return vector<void*> | |||
/// | |||
static vector<void *> GetOutputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc); | |||
static vector<void *> GetOutputDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, | |||
bool need_convert = true); | |||
/// | |||
/// @ingroup ge | |||
/// @ingroup domi_ome | |||
/// @brief Get workspace data address. | |||
/// @return vector<void*> | |||
/// | |||
static vector<void *> GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc); | |||
static vector<void *> GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, | |||
bool need_convert = true); | |||
/// | |||
/// @ingroup ge | |||
/// @brief Get memory runtime base. | |||
/// @return Status | |||
/// | |||
static Status GetRtAddress(const RuntimeParam &model_param, uintptr_t logic_addr, uint8_t *&mem_addr); | |||
static ge::Status ConvertVirtualAddressToPhysical(uint8_t *virtual_address, uint64_t size, | |||
uint8_t *&physical_address); | |||
}; | |||
} // namespace ge | |||
@@ -45,7 +45,7 @@ Status EndGraphTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin | |||
Status EndGraphTaskInfo::Distribute() { | |||
GELOGI("EndGraphTaskInfo Distribute Start."); | |||
GE_CHECK_NOTNULL(davinci_model_); | |||
auto all_dump_model = davinci_model_->GetDumpProperties().GetAllDumpModel(); | |||
auto all_dump_model = PropertiesManager::Instance().GetAllDumpModel(); | |||
if (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || | |||
all_dump_model.find(davinci_model_->Name()) != all_dump_model.end() || | |||
all_dump_model.find(davinci_model_->OmName()) != all_dump_model.end()) { | |||
@@ -80,4 +80,5 @@ Status EndGraphTaskInfo::Distribute() { | |||
} | |||
REGISTER_TASK_INFO(RT_MODEL_TASK_MODEL_END_GRAPH, EndGraphTaskInfo); | |||
} // namespace ge |
@@ -22,7 +22,7 @@ | |||
namespace ge { | |||
class EndGraphTaskInfo : public TaskInfo { | |||
public: | |||
EndGraphTaskInfo() {} | |||
EndGraphTaskInfo() : model_(0) {} | |||
~EndGraphTaskInfo() override { model_ = nullptr; } | |||
@@ -35,10 +35,10 @@ class EndGraphTaskInfo : public TaskInfo { | |||
uint32_t GetStreamId() override { return stream_id_; } | |||
private: | |||
rtModel_t model_{nullptr}; | |||
DavinciModel *davinci_model_{nullptr}; | |||
uint32_t task_id_{0}; | |||
uint32_t stream_id_{0}; | |||
rtModel_t model_; | |||
DavinciModel *davinci_model_; | |||
uint32_t task_id_; | |||
uint32_t stream_id_; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_END_GRAPH_TASK_INFO_H_ |
@@ -42,7 +42,6 @@ HcclTaskInfo::~HcclTaskInfo() { | |||
davinci_model_ = nullptr; | |||
ops_kernel_store_ = nullptr; | |||
max_node_of_hccl_stream_ = 0; | |||
args_ = nullptr; | |||
} | |||
Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GELOGI("HcclTaskInfo Init Start."); | |||
@@ -61,61 +60,54 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m | |||
GELOGI("HcclTaskInfo Init, op_index is: %u", op_index); | |||
// Get HCCL op | |||
op_desc_ = davinci_model->GetOpByIndex(op_index); | |||
GE_CHECK_NOTNULL(op_desc_); | |||
OpDescPtr op_desc = davinci_model->GetOpByIndex(op_index); | |||
GE_CHECK_NOTNULL(op_desc); | |||
// Create the kernel hccl infos | |||
CreateKernelHcclInfo(op_desc_); | |||
CreateKernelHcclInfo(op_desc); | |||
// Initialize the hccl_type of all kernel hccl info | |||
HcomOmeUtil::GetHcclType(task_def, kernel_hccl_infos_); | |||
// Only in Horovod scenario should get the inputName and GeShape | |||
ret = HcomOmeUtil::GetHorovodInputs(op_desc_, kernel_hccl_infos_); | |||
ret = HcomOmeUtil::GetHorovodInputs(op_desc, kernel_hccl_infos_); | |||
if (ret != SUCCESS) { | |||
GELOGE(FAILED, "davinci_model: GetHorovodInputs fail! domi error: %u", ret); | |||
return FAILED; | |||
} | |||
Status dmrt = HcomOmeUtil::GetHcclDataType(op_desc_, kernel_hccl_infos_); | |||
Status dmrt = HcomOmeUtil::GetHcclDataType(op_desc, kernel_hccl_infos_); | |||
if (dmrt != SUCCESS) { | |||
GELOGE(FAILED, "davinci_model: GetHcomDataType fail! domi error: %u", dmrt); | |||
return FAILED; | |||
} | |||
dmrt = HcomOmeUtil::GetHcclCount(op_desc_, kernel_hccl_infos_); | |||
dmrt = HcomOmeUtil::GetHcclCount(op_desc, kernel_hccl_infos_); | |||
if (dmrt != SUCCESS) { | |||
GELOGE(FAILED, "davinci_model: GetHcomCount fail! domi error: %u", dmrt); | |||
return FAILED; | |||
} | |||
// Only HCOMBROADCAST and HVDCALLBACKBROADCAST need to get the rootId | |||
dmrt = HcomOmeUtil::GetAllRootId(op_desc_, kernel_hccl_infos_); | |||
dmrt = HcomOmeUtil::GetAllRootId(op_desc, kernel_hccl_infos_); | |||
if (dmrt != SUCCESS) { | |||
GELOGE(FAILED, "davinci_model: Get rootId fail! domi error: %u", dmrt); | |||
return FAILED; | |||
} | |||
// GE's new process: hccl declares the number of streams required, creates a stream by GE, and sends it to hccl | |||
ret = SetFollowStream(op_desc_, davinci_model); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "SetStream Fail."); | |||
return ret; | |||
} | |||
if (davinci_model_->IsKnownNode()) { | |||
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); | |||
GELOGI("Known node %s args addr %p, offset %u.", op_desc_->GetName().c_str(), args_, args_offset_); | |||
} | |||
ret = SetAddrs(op_desc_, kernel_hccl_infos_); | |||
ret = SetAddrs(op_desc, kernel_hccl_infos_); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Setaddrs Fail."); | |||
return ret; | |||
} | |||
// GE's new process: hccl declares the need for Workspace size, and GE allocates Workspace | |||
ret = SetWorkspace(op_desc_, kernel_hccl_infos_); | |||
ret = SetWorkspace(op_desc, kernel_hccl_infos_); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "SetWorkspace Fail."); | |||
return ret; | |||
} | |||
// GE's new process: hccl declares the number of streams required, creates a stream by GE, and sends it to hccl | |||
ret = SetFollowStream(op_desc, davinci_model); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "SetStream Fail."); | |||
return ret; | |||
} | |||
GELOGI("HcclTaskInfo Init Success"); | |||
return SUCCESS; | |||
@@ -217,83 +209,40 @@ Status HcclTaskInfo::Distribute() { | |||
GELOGI("HcclTaskInfo Distribute Success."); | |||
return SUCCESS; | |||
} | |||
Status HcclTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GE_CHECK_NOTNULL(davinci_model); | |||
auto hccl_def = task_def.kernel_hccl(); | |||
uint32_t op_index = hccl_def.op_index(); | |||
GELOGI("HcclTaskInfo Init, op_index is: %u", op_index); | |||
// Get HCCL op | |||
auto op_desc = davinci_model->GetOpByIndex(op_index); | |||
GE_CHECK_NOTNULL(op_desc); | |||
GELOGI("Calc opType[%s] args size. Node name is [%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); | |||
// Only need the number of addr to allocate args memory | |||
auto input_size = op_desc->GetInputsSize(); | |||
auto output_size = op_desc->GetOutputsSize(); | |||
auto workspace_size = op_desc->GetWorkspaceBytes().size(); | |||
uint32_t args_size = sizeof(void *) * (input_size + output_size + workspace_size); | |||
args_offset_ = davinci_model->GetTotalArgsSize(); | |||
davinci_model->SetTotalArgsSize(args_size); | |||
GELOGI("Calculate hccl task args , args_size %u, args_offset %u", args_size, args_offset_); | |||
return SUCCESS; | |||
} | |||
Status HcclTaskInfo::UpdateArgs() { | |||
GELOGI("HcclTaskInfo::UpdateArgs in."); | |||
const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); | |||
input_data_addrs_ = ModelUtils::GetInputDataAddrs(rts_param, op_desc_); | |||
output_data_addrs_ = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_); | |||
workspace_data_addrs_ = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_); | |||
vector<void *> io_addrs; | |||
io_addrs.insert(io_addrs.end(), input_data_addrs_.begin(), input_data_addrs_.end()); | |||
io_addrs.insert(io_addrs.end(), output_data_addrs_.begin(), output_data_addrs_.end()); | |||
io_addrs.insert(io_addrs.end(), workspace_data_addrs_.begin(), workspace_data_addrs_.end()); | |||
GE_CHK_STATUS_RET(davinci_model_->UpdateKnownZeroCopyAddr(io_addrs, args_offset_), | |||
"update known node %s zero copy addr failed.", op_desc_->GetName().c_str()); | |||
GELOGI("HcclTaskInfo::UpdateArgs success."); | |||
return SUCCESS; | |||
} | |||
Status HcclTaskInfo::SetAddrs(const std::shared_ptr<OpDesc> &op_desc, | |||
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) { | |||
GE_CHECK_NOTNULL(op_desc); | |||
GE_CHK_STATUS_RET(HcomOmeUtil::CheckKernelHcclInfo(op_desc, kernel_hccl_infos), | |||
"HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); | |||
if (HcomOmeUtil::CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); | |||
return PARAM_INVALID; | |||
} | |||
GELOGI("Set hccl task input output address, node[%s}, type[%s] kernel_hccl_infos.size[%zu].", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); | |||
if (op_desc->GetType() == HVDWAIT) { | |||
return SUCCESS; | |||
} | |||
domi::Status dmrt; | |||
hcclRedOp_t op_type = HCCL_REP_OP_SUM; | |||
GE_CHECK_NOTNULL(davinci_model_); | |||
GELOGI("Calc opType[%s] input address before. Node name[%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); | |||
if (!davinci_model_->IsKnownNode()) { | |||
input_data_addrs_ = ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); | |||
output_data_addrs_ = ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); | |||
} | |||
void *input_data_addr = nullptr; | |||
void *output_data_addr = nullptr; | |||
auto input_data_addr_list = ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); | |||
auto output_data_addr_list = ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); | |||
// initialize every kernel_hccl_info inputDataAddr | |||
for (size_t i = 0; i < kernel_hccl_infos.size(); i++) { | |||
std::string hccl_type = kernel_hccl_infos[i].hccl_type; | |||
if (davinci_model_->IsKnownNode()) { | |||
input_data_addr = reinterpret_cast<void *>(reinterpret_cast<uint64_t *>(args_) + i); | |||
output_data_addr = reinterpret_cast<void *>(reinterpret_cast<uint64_t *>(args_) + op_desc->GetInputsSize() + i); | |||
GELOGI("Hccl task info known input addr %p, output addr %p.", input_data_addr, output_data_addr); | |||
} else { | |||
input_data_addr = input_data_addrs_.empty() ? nullptr : input_data_addrs_[i]; | |||
output_data_addr = output_data_addrs_.empty() ? nullptr : output_data_addrs_[i]; | |||
} | |||
void *input_data_addr = input_data_addr_list.empty() ? nullptr : input_data_addr_list[i]; | |||
kernel_hccl_infos[i].inputDataAddr = input_data_addr; | |||
void *output_data_addr = output_data_addr_list.empty() ? nullptr : output_data_addr_list[i]; | |||
if (hccl_type == HCOMALLGATHER || hccl_type == HCOMRECEIVE || hccl_type == HVDCALLBACKALLGATHER) { | |||
kernel_hccl_infos[i].outputDataAddr = output_data_addr; | |||
} else if (hccl_type == HCOMALLREDUCE || hccl_type == HCOMREDUCESCATTER || hccl_type == HVDCALLBACKALLREDUCE) { | |||
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), | |||
"davinci_model: GetHcomOperationType fail!"); | |||
dmrt = HcomOmeUtil::GetHcclOperationType(op_desc, op_type); | |||
if (dmrt != SUCCESS) { | |||
GELOGE(FAILED, "davinci_model: GetHcomOperationType fail! domi error: %u", dmrt); | |||
return FAILED; | |||
} | |||
kernel_hccl_infos[i].outputDataAddr = output_data_addr; | |||
kernel_hccl_infos[i].opType = op_type; | |||
} | |||
@@ -361,7 +310,6 @@ void HcclTaskInfo::CreateKernelHcclInfo(const ge::ConstOpDescPtr &op_desc) { | |||
Status HcclTaskInfo::SetWorkspace(const std::shared_ptr<OpDesc> &op_desc, | |||
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) { | |||
GE_CHECK_NOTNULL(op_desc); | |||
GE_CHECK_NOTNULL(davinci_model_); | |||
GELOGI("SetWorkspace Node[%s] opType[%s] set workspace.", op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
uint64_t workspace_mem_size = 0; | |||
void *workspace_addr = nullptr; | |||
@@ -371,12 +319,11 @@ Status HcclTaskInfo::SetWorkspace(const std::shared_ptr<OpDesc> &op_desc, | |||
GELOGI("hccl need workSpaceMemSize=%lu", workspace_mem_size_tmp); | |||
if (workspace_mem_size_tmp != 0) { | |||
workspace_mem_size = workspace_mem_size_tmp; | |||
if (davinci_model_->IsKnownNode()) { | |||
workspace_addr = reinterpret_cast<void *>(reinterpret_cast<uint64_t *>(args_) + op_desc->GetInputsSize() + | |||
op_desc->GetOutputsSize()); | |||
} else { | |||
workspace_data_addrs_ = ModelUtils::GetWorkspaceDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); | |||
workspace_addr = workspace_data_addrs_.empty() ? nullptr : workspace_data_addrs_[0]; | |||
vector<void *> workspace_data_addrs = | |||
ModelUtils::GetWorkspaceDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); | |||
if (!workspace_data_addrs.empty()) { | |||
GELOGI("Get workSpaceAddr"); | |||
workspace_addr = workspace_data_addrs[0]; | |||
} | |||
} | |||
} | |||
@@ -34,10 +34,7 @@ class HcclTaskInfo : public TaskInfo { | |||
hccl_stream_list_(), | |||
ops_kernel_store_(nullptr), | |||
private_def_(nullptr), | |||
private_def_len_(0), | |||
op_desc_(nullptr), | |||
args_(nullptr), | |||
args_offset_(0) {} | |||
private_def_len_(0) {} | |||
~HcclTaskInfo() override; | |||
@@ -47,10 +44,6 @@ class HcclTaskInfo : public TaskInfo { | |||
uint32_t GetTaskID() override { return id_; } | |||
Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; | |||
Status UpdateArgs() override; | |||
private: | |||
ge::Status SetAddrs(const std::string &hccl_type, const std::shared_ptr<OpDesc> &op); | |||
@@ -79,12 +72,6 @@ class HcclTaskInfo : public TaskInfo { | |||
static std::mutex hccl_follow_stream_mutex_; | |||
static uint32_t max_node_of_hccl_stream_; | |||
vector<GETaskKernelHcclInfo> kernel_hccl_infos_; | |||
vector<void *> input_data_addrs_; | |||
vector<void *> output_data_addrs_; | |||
vector<void *> workspace_data_addrs_; | |||
OpDescPtr op_desc_; | |||
void *args_; | |||
uint32_t args_offset_; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_HCCL_TASK_INFO_H_ |
@@ -79,9 +79,6 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin | |||
return FAILED;) | |||
} | |||
GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, ext_info_addr_=%p", op_desc_->GetName().c_str(), | |||
op_desc_->GetType().c_str(), ext_info.size(), ext_info_addr_); | |||
// 2.1 get loop cond variable for tensor array write | |||
uint64_t step_id_addr = 0; | |||
OpDescPtr step_id_node = davinci_model_->GetVariableOp(NODE_NAME_GLOBAL_STEP); | |||
@@ -100,11 +97,6 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin | |||
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(), kernel_id) != SUCCESS, | |||
GELOGE(FAILED, "CreateAicpuKernel error."); | |||
return FAILED;) | |||
// 2.3 Create session | |||
GE_CHECK_NOTNULL(ModelManager::GetInstance()); | |||
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS, | |||
GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id); | |||
return FAILED;) | |||
kernel_buf_size_ = sizeof(STR_FWK_OP_KERNEL); | |||
if (davinci_model_->IsKnownNode()) { | |||
@@ -161,8 +153,8 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret); | |||
return FAILED;) | |||
if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), | |||
op_desc->GetName())) { | |||
if (PropertiesManager::Instance().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), | |||
op_desc->GetName())) { | |||
dump_flag_ = RT_KERNEL_DUMPFLAG; | |||
dump_args_ = input_output_addr_; | |||
} | |||
@@ -175,7 +167,12 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin | |||
fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoLen = ext_info.size(); | |||
fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = reinterpret_cast<uintptr_t>(ext_info_addr_); | |||
// 4. Return result | |||
// 4. Create session | |||
GE_CHECK_NOTNULL(ModelManager::GetInstance()); | |||
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS, | |||
GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id); | |||
return FAILED;) | |||
// 5. Return result | |||
rtError_t rt_ret = rtMalloc(&kernel_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc error: 0x%X", rt_ret); return FAILED;) | |||
@@ -183,7 +180,12 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin | |||
sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: Ox%X", rt_ret); return FAILED;) | |||
davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, io_addrs.data(), input_output_addr_, addrs_size, 0); | |||
vector<void *> virtual_io_addrs; // use virtual address for zero copy key. | |||
const vector<void *> virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); | |||
const vector<void *> virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); | |||
davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, io_addrs.data(), input_output_addr_, addrs_size, 0); | |||
GELOGI("KernelExTaskInfo Init Success. session id: %lu", session_id); | |||
return SUCCESS; | |||
@@ -205,55 +207,19 @@ Status KernelExTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciMod | |||
uint32_t mem_size = sizeof(uint64_t) * mem_length; | |||
davinci_model->SetTotalArgsSize(mem_size); | |||
GELOGI("kernel task name %s, args_size %u, args_offset %u", op_desc->GetName().c_str(), mem_size, args_offset_); | |||
// alloc fixed addr | |||
string peer_input_name; | |||
if (AttrUtils::GetStr(op_desc, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) { | |||
uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name); | |||
if (output_index > outputs_size) { | |||
GELOGE(FAILED, "The output size[%zu] and output index[%u] are inconsistent.", outputs_size, output_index); | |||
return FAILED; | |||
} | |||
fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(peer_input_name); | |||
auto tensor_desc = op_desc->GetOutputDesc(output_index); | |||
int64_t tensor_size = 0; | |||
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); | |||
davinci_model->SetTotalFixedAddrsSize(peer_input_name, tensor_size); | |||
GELOGI("Calculate stream switch task args , tensor size is %ld, fixed addr offset %ld", tensor_size, | |||
fixed_addr_offset_); | |||
} | |||
return SUCCESS; | |||
} | |||
Status KernelExTaskInfo::UpdateArgs() { | |||
GELOGI("KernelExTaskInfo::UpdateArgs in."); | |||
const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); | |||
vector<void *> io_addrs; | |||
vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_); | |||
vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_); | |||
vector<void *> io_addrs; | |||
if (!op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) { | |||
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); | |||
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); | |||
} else { | |||
string peer_input_name; | |||
if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) { | |||
uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name); | |||
if (output_index > output_data_addrs.size()) { | |||
GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.", | |||
output_data_addrs.size(), output_index); | |||
return FAILED; | |||
} | |||
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); | |||
for (size_t i = 0; i < output_data_addrs.size(); ++i) { | |||
if (i == output_index) { | |||
void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_); | |||
io_addrs.emplace_back(fixed_addr); | |||
continue; | |||
} | |||
io_addrs.emplace_back(output_data_addrs[i]); | |||
} | |||
} | |||
} | |||
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); | |||
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); | |||
GE_CHK_STATUS_RET(davinci_model_->UpdateKnownZeroCopyAddr(io_addrs, args_offset_), | |||
"update known node %s zero copy addr failed.", op_desc_->GetName().c_str()); | |||
@@ -265,7 +231,7 @@ Status KernelExTaskInfo::CopyTaskInfo(const domi::KernelExDef &kernel_def, const | |||
const OpDescPtr &op_desc) { | |||
// Userspace copy need virtual address. | |||
const vector<int64_t> workspace_data_sizes = ModelUtils::GetWorkspaceSize(op_desc); | |||
const vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc); | |||
const vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc, false); | |||
if (workspace_data_addrs.empty() || workspace_data_sizes.empty()) { | |||
GELOGE(FAILED, "Node:%s invalid workspace, addrs is %zu, size is %zu.", op_desc->GetName().c_str(), | |||
workspace_data_addrs.size(), workspace_data_sizes.size()); | |||
@@ -54,7 +54,6 @@ class KernelExTaskInfo : public TaskInfo { | |||
auto ret = reinterpret_cast<uintptr_t>(dump_args_); | |||
return ret; | |||
} | |||
bool CallSaveDumpInfo() override { return true; }; | |||
private: | |||
Status CopyTaskInfo(const domi::KernelExDef &kernel_def, const RuntimeParam &rts_param, const OpDescPtr &op_desc); | |||
@@ -70,7 +69,6 @@ class KernelExTaskInfo : public TaskInfo { | |||
void *dump_args_; | |||
OpDescPtr op_desc_ = nullptr; | |||
uint32_t args_offset_ = 0; | |||
int64_t fixed_addr_offset_ = 0; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_KERNEL_EX_TASK_INFO_H_ |
@@ -47,16 +47,16 @@ const uint32_t kAddrLen = sizeof(void *); | |||
namespace ge { | |||
KernelTaskInfo::SuperKernelTaskInfo KernelTaskInfo::skt_info_ = { | |||
0, 0, 0, 0, nullptr, nullptr, {}, {}, {}, {}, {}, RT_KERNEL_DEFAULT, kInvalidGroupKey, 0, nullptr}; | |||
0, 0, 0, 0, nullptr, nullptr, {}, {}, RT_KERNEL_DEFAULT, kInvalidGroupKey, 0, nullptr}; | |||
Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
if (davinci_model == nullptr) { | |||
GELOGE(PARAM_INVALID, "davinci model is null!"); | |||
GELOGE(PARAM_INVALID, "davinci_model is null!"); | |||
return PARAM_INVALID; | |||
} | |||
davinci_model_ = davinci_model; | |||
is_l1_fusion_enable_ = davinci_model_->GetL1FusionEnableOption(); | |||
GELOGD("KernelTaskInfo init start, ge.enableL1Fusion in davinci model is %d.", is_l1_fusion_enable_); | |||
GELOGD("KernelTaskInfo Init Start, ge.enableL1Fusion in davinci model is %d.", is_l1_fusion_enable_); | |||
Status ret = SetStream(task_def.stream_id(), davinci_model_->GetStreamList()); | |||
if (ret != SUCCESS) { | |||
@@ -73,7 +73,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci | |||
// get opdesc | |||
op_desc_ = davinci_model_->GetOpByIndex(context.op_index()); | |||
if (op_desc_ == nullptr) { | |||
GELOGE(INTERNAL_ERROR, "Get op desc failed, index is out of range!"); | |||
GELOGE(INTERNAL_ERROR, "Get op_desc failed, index is out of range!"); | |||
return INTERNAL_ERROR; | |||
} | |||
(void)AttrUtils::GetBool(*op_desc_, ATTR_N_BATCH_SPILT, is_n_batch_spilt_); | |||
@@ -138,21 +138,14 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci | |||
ret = InitCceTask(kernel_def); | |||
} | |||
GELOGD("KernelTaskInfo init finish, result=%u.", ret); | |||
GELOGD("KernelTaskInfo Init finish, result=%u.", ret); | |||
return ret; | |||
} | |||
Status KernelTaskInfo::SaveSKTDumpInfo() { | |||
GE_CHECK_NOTNULL(davinci_model_); | |||
if (skt_dump_flag_ == RT_KERNEL_DEFAULT) { | |||
GELOGD("no need save skt dump info"); | |||
return SUCCESS; | |||
} | |||
// all op in super kernel share one taskid and streamid | |||
for (size_t i = 0; i < skt_info_.op_desc_list.size(); i++) { | |||
davinci_model_->SaveDumpTask(skt_info_.last_task_id, skt_info_.last_stream_id, skt_info_.op_desc_list[i], | |||
skt_info_.dump_args_list[i]); | |||
} | |||
davinci_model_->SaveDumpTask(skt_info_.last_task_id, skt_info_.last_stream_id, skt_info_.last_op, | |||
skt_info_.last_dump_args); | |||
return SUCCESS; | |||
} | |||
@@ -194,9 +187,6 @@ Status KernelTaskInfo::SKTFinalize() { | |||
GELOGI("SuperKernel Distribute [skt_id:%u]", skt_id_); | |||
skt_info_.kernel_list.clear(); | |||
skt_info_.arg_list.clear(); | |||
skt_info_.dump_flag_list.clear(); | |||
skt_info_.op_desc_list.clear(); | |||
skt_info_.dump_args_list.clear(); | |||
skt_info_.last_stream = nullptr; | |||
skt_info_.last_block_dim = 0; | |||
skt_info_.last_sm_desc = sm_desc_; | |||
@@ -207,15 +197,6 @@ Status KernelTaskInfo::SKTFinalize() { | |||
return SUCCESS; | |||
} | |||
uint32_t KernelTaskInfo::GetDumpFlag() { | |||
for (auto flag : skt_info_.dump_flag_list) { | |||
if (flag == RT_KERNEL_DUMPFLAG) { | |||
return RT_KERNEL_DUMPFLAG; | |||
} | |||
} | |||
return RT_KERNEL_DEFAULT; | |||
} | |||
Status KernelTaskInfo::SuperKernelLaunch() { | |||
if (skt_info_.kernel_list.empty()) { | |||
GELOGI("SuperKernelLaunch: Skt_kernel_list has no task, just return"); | |||
@@ -225,7 +206,7 @@ Status KernelTaskInfo::SuperKernelLaunch() { | |||
auto &skt_kernel_list = skt_info_.kernel_list; | |||
auto &skt_arg_list = skt_info_.arg_list; | |||
GELOGI("SuperKernelLaunch: Skt_kernel_list size[%d] skt_arg_list[%d]", skt_kernel_list.size(), skt_arg_list.size()); | |||
if (skt_kernel_list.size() == kSKTSingleSize && skt_arg_list.size() == kSKTSingleSize) { | |||
if (skt_kernel_list.size() == kSKTSingleSize) { | |||
rt_ret = rtKernelLaunchWithFlag(skt_info_.kernel_list[0], static_cast<uint32_t>(skt_info_.last_block_dim), | |||
skt_info_.arg_list[0], skt_info_.last_args_size, | |||
static_cast<rtSmDesc_t *>(skt_info_.last_sm_desc), skt_info_.last_stream, | |||
@@ -234,7 +215,6 @@ Status KernelTaskInfo::SuperKernelLaunch() { | |||
GELOGE(RT_FAILED, "SuperKernelLaunch: Call rt api failed, ret: 0x%X", rt_ret); | |||
return RT_FAILED; | |||
} | |||
call_save_dump_ = true; | |||
GE_CHK_STATUS_RET(SKTFinalize(), "Skt finalize failed"); | |||
return SUCCESS; | |||
} | |||
@@ -246,22 +226,18 @@ Status KernelTaskInfo::SuperKernelLaunch() { | |||
return RT_FAILED; | |||
} | |||
// Call the fuse API | |||
std::unique_ptr<skt::SuperKernel> superKernel = nullptr; | |||
skt::SuperKernel *superKernel = nullptr; | |||
if (factory->FuseKernels(skt_kernel_list, skt_arg_list, skt_info_.last_block_dim, superKernel) != SUCCESS) { | |||
GELOGE(RT_FAILED, "SuperKernelLaunch: fuse call failed"); | |||
return RT_FAILED; | |||
} | |||
// Launch a super kernel | |||
skt_dump_flag_ = GetDumpFlag(); | |||
if (superKernel->Launch(skt_info_.last_stream, skt_dump_flag_) != SUCCESS) { | |||
if (superKernel->Launch(skt_info_.last_stream, RT_KERNEL_DUMPFLAG) != SUCCESS) { | |||
GELOGE(RT_FAILED, "SuperKernelLaunch: launch failed"); | |||
return RT_FAILED; | |||
} | |||
GELOGI("SuperKernelLaunch: success[skt_kernel_list size[%zu] skt_arg_list[%zu]]", skt_kernel_list.size(), | |||
skt_arg_list.size()); | |||
// record skt addr for release | |||
superkernel_dev_nav_table_ = superKernel->GetNavTablePtr(); | |||
superkernel_device_args_addr_ = superKernel->GetDeviceArgsPtr(); | |||
GE_CHK_STATUS_RET(SKTFinalize(), "Skt finalize failed"); | |||
return SUCCESS; | |||
} | |||
@@ -274,9 +250,6 @@ Status KernelTaskInfo::SaveSuperKernelInfo() { | |||
skt_info_.last_args_size = args_size_; | |||
skt_info_.last_sm_desc = sm_desc_; | |||
skt_info_.last_dump_flag = dump_flag_; | |||
skt_info_.dump_flag_list.push_back(dump_flag_); | |||
skt_info_.op_desc_list.push_back(op_desc_); | |||
skt_info_.dump_args_list.push_back(reinterpret_cast<uintptr_t>(dump_args_)); | |||
skt_info_.last_group_key = group_key_; | |||
skt_info_.last_dump_args = reinterpret_cast<uintptr_t>(dump_args_); | |||
skt_info_.last_op = op_desc_; | |||
@@ -355,7 +328,6 @@ Status KernelTaskInfo::SuperKernelDistribute() { | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
return FAILED; | |||
} | |||
call_save_dump_ = true; | |||
UpdateTaskId(); | |||
GELOGI("Current Common Task Distribute [taskid:%u]", task_id_); | |||
} else { | |||
@@ -384,7 +356,6 @@ Status KernelTaskInfo::Distribute() { | |||
rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(so_name_.c_str()), | |||
reinterpret_cast<const void *>(kernel_name_.c_str()), 1, args_, args_size_, | |||
nullptr, stream_, dump_flag_); | |||
call_save_dump_ = true; | |||
} else { | |||
/* default: not skt launch */ | |||
GELOGI( | |||
@@ -398,7 +369,6 @@ Status KernelTaskInfo::Distribute() { | |||
// call rtKernelLaunch for current task | |||
rt_ret = rtKernelLaunchWithFlag(stub_func_, block_dim_, args_, args_size_, static_cast<rtSmDesc_t *>(sm_desc_), | |||
stream_, dump_flag_); | |||
call_save_dump_ = true; | |||
} | |||
} | |||
if (rt_ret != RT_ERROR_NONE) { | |||
@@ -422,31 +392,9 @@ Status KernelTaskInfo::UpdateArgs() { | |||
vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_); | |||
vector<void *> io_addrs; | |||
if (!op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) { | |||
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); | |||
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); | |||
io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); | |||
} else { | |||
string peer_input_name; | |||
if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) { | |||
uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name); | |||
if (output_index > output_data_addrs.size()) { | |||
GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.", | |||
output_data_addrs.size(), output_index); | |||
return FAILED; | |||
} | |||
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); | |||
for (size_t i = 0; i < output_data_addrs.size(); ++i) { | |||
if (i == output_index) { | |||
void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_); | |||
io_addrs.emplace_back(fixed_addr); | |||
continue; | |||
} | |||
io_addrs.emplace_back(output_data_addrs[i]); | |||
} | |||
io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); | |||
} | |||
} | |||
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); | |||
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); | |||
io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); | |||
GE_CHK_STATUS_RET(davinci_model_->UpdateKnownZeroCopyAddr(io_addrs, args_offset_), | |||
"update known node %s zero copy addr failed.", op_desc_->GetName().c_str()); | |||
@@ -460,8 +408,6 @@ Status KernelTaskInfo::Release() { | |||
return SUCCESS; | |||
} | |||
FreeRtMem(&args_); | |||
FreeRtMem(&superkernel_device_args_addr_); | |||
FreeRtMem(&superkernel_dev_nav_table_); | |||
FreeRtMem(&flowtable_); | |||
FreeRtMem(&custom_info_.input_descs); | |||
FreeRtMem(&custom_info_.input_addrs); | |||
@@ -526,29 +472,6 @@ Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel | |||
args_offset_ = davinci_model->GetTotalArgsSize(); | |||
davinci_model->SetTotalArgsSize(args_size); | |||
GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_); | |||
// get opcontext stored in model | |||
const domi::KernelContext &context = kernel_def.context(); | |||
// get opdesc | |||
op_desc_ = davinci_model->GetOpByIndex(context.op_index()); | |||
GE_CHECK_NOTNULL(op_desc_); | |||
// alloc fixed addr | |||
string peer_input_name; | |||
if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) { | |||
uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name); | |||
if (output_index > op_desc_->GetOutputsSize()) { | |||
GELOGE(FAILED, "The output size[%zu] and output index[%u] are inconsistent.", op_desc_->GetOutputsSize(), | |||
output_index); | |||
return FAILED; | |||
} | |||
fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(peer_input_name); | |||
auto tensor_desc = op_desc_->GetOutputDesc(output_index); | |||
int64_t tensor_size = 0; | |||
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); | |||
davinci_model->SetTotalFixedAddrsSize(peer_input_name, tensor_size); | |||
GELOGI("Calculate stream switch task args , tensor size is %ld, fixed addr offset %ld", tensor_size, | |||
fixed_addr_offset_); | |||
} | |||
return SUCCESS; | |||
} | |||
@@ -626,8 +549,8 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne | |||
return FAILED; | |||
} | |||
if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), | |||
op_desc->GetName())) { | |||
if (PropertiesManager::Instance().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), | |||
op_desc->GetName())) { | |||
dump_flag_ = RT_KERNEL_DUMPFLAG; | |||
dump_args_ = static_cast<char *>(args_) + offset; | |||
} | |||
@@ -638,8 +561,10 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne | |||
} | |||
vector<void *> virtual_io_addrs; // use virtual address for zero copy key. | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); | |||
const vector<void *> virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); | |||
const vector<void *> virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); | |||
davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_info.data(), args_, args_size_, offset); | |||
GELOGD("Do InitTVMTask end"); | |||
@@ -677,6 +602,7 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel | |||
const std::vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); | |||
Status ret = StoreInputOutputTensor(input_data_addrs, output_data_addrs, ModelUtils::GetInputDescs(op_desc), | |||
ModelUtils::GetOutputDescs(op_desc)); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "StoreInputOutputTensor Failed"); | |||
return ret; | |||
@@ -741,9 +667,11 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel | |||
return RT_FAILED; | |||
} | |||
davinci_model_->SetZeroCopyAddr(op_desc, input_data_addrs, input_data_addrs.data(), custom_info_.input_addrs, | |||
input_data_addrs.size() * kAddrLen, 0); | |||
davinci_model_->SetZeroCopyAddr(op_desc, output_data_addrs, output_data_addrs.data(), custom_info_.output_addrs, | |||
const vector<void *> virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); | |||
const vector<void *> virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); | |||
davinci_model_->SetZeroCopyAddr(op_desc, virtual_in_addrs, input_data_addrs.data(), custom_info_.input_addrs, | |||
virtual_in_addrs.size() * kAddrLen, 0); | |||
davinci_model_->SetZeroCopyAddr(op_desc, virtual_out_addrs, output_data_addrs.data(), custom_info_.output_addrs, | |||
output_data_addrs.size() * kAddrLen, 0); | |||
return SUCCESS; | |||
} | |||
@@ -873,9 +801,6 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k | |||
GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size()); | |||
return init_ret; | |||
} | |||
GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(), | |||
op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_); | |||
aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(aicpu_ext_info_addr_); | |||
aicpu_param_head->extInfoLength = reinterpret_cast<uintptr_t>(ext_info.size()); | |||
@@ -894,13 +819,19 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k | |||
return RT_FAILED; | |||
} | |||
if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), | |||
op_desc->GetName())) { | |||
if (PropertiesManager::Instance().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), | |||
op_desc->GetName())) { | |||
dump_flag_ = RT_KERNEL_DUMPFLAG; | |||
dump_args_ = static_cast<char *>(args_) + sizeof(aicpu::AicpuParamHead); | |||
} | |||
davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, args_addr.get(), args_, args_size_, sizeof(aicpu::AicpuParamHead)); | |||
vector<void *> virtual_io_addrs; // use virtual address for zero copy key. | |||
const vector<void *> virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); | |||
const vector<void *> virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); | |||
virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); | |||
davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_addr.get(), args_, args_size_, | |||
sizeof(aicpu::AicpuParamHead)); | |||
return SUCCESS; | |||
} | |||
@@ -61,8 +61,6 @@ class KernelTaskInfo : public TaskInfo { | |||
sm_desc_ = nullptr; | |||
flowtable_ = nullptr; | |||
args_ = nullptr; | |||
superkernel_device_args_addr_ = nullptr; | |||
superkernel_dev_nav_table_ = nullptr; | |||
} | |||
Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; | |||
@@ -90,8 +88,6 @@ class KernelTaskInfo : public TaskInfo { | |||
uint32_t GetSktTaskID() override { return skt_id_; } | |||
bool CallSaveDumpInfo() override { return call_save_dump_; }; | |||
cce::ccOpContext ctx_; | |||
FusionOpInfo fusion_op_info_; | |||
@@ -134,7 +130,6 @@ class KernelTaskInfo : public TaskInfo { | |||
void UpdateSKTTaskId(); | |||
Status SKTFinalize(); | |||
Status SuperKernelLaunch(); | |||
uint32_t GetDumpFlag(); | |||
Status SaveSuperKernelInfo(); | |||
bool IsMarkedLastNode(); | |||
bool IsMarkedFirstNode(); | |||
@@ -158,8 +153,6 @@ class KernelTaskInfo : public TaskInfo { | |||
OpDescPtr op_desc_; | |||
DavinciModel *davinci_model_; | |||
uint32_t args_offset_ = 0; | |||
int64_t fixed_addr_offset_ = 0; | |||
bool call_save_dump_ = false; | |||
// aicpu ext_info device mem | |||
void *aicpu_ext_info_addr_ = nullptr; | |||
@@ -171,9 +164,6 @@ class KernelTaskInfo : public TaskInfo { | |||
bool is_n_batch_spilt_; | |||
int64_t group_key_; | |||
bool has_group_key_; | |||
uint32_t skt_dump_flag_ = RT_KERNEL_DEFAULT; | |||
void *superkernel_device_args_addr_ = nullptr; | |||
void *superkernel_dev_nav_table_ = nullptr; | |||
struct AICPUCustomInfo { | |||
void *input_descs = nullptr; | |||
@@ -193,9 +183,6 @@ class KernelTaskInfo : public TaskInfo { | |||
void *last_sm_desc; | |||
std::vector<void *> kernel_list; | |||
std::vector<void *> arg_list; | |||
std::vector<uint32_t> dump_flag_list; | |||
std::vector<OpDescPtr> op_desc_list; | |||
std::vector<uintptr_t> dump_args_list; | |||
uint32_t last_dump_flag; | |||
int64_t last_group_key; | |||
uintptr_t last_dump_args; | |||
@@ -16,8 +16,8 @@ | |||
#include "graph/load/new_model_manager/task_info/label_switch_by_index_task_info.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
#include "graph/load/new_model_manager/davinci_model.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
namespace ge { | |||
constexpr uint8_t kLabelSwitchIndexNum = 1; | |||
@@ -59,13 +59,7 @@ Status LabelSwitchByIndexTaskInfo::Init(const domi::TaskDef &task_def, DavinciMo | |||
op_desc->GetName().c_str(), input_data_addr.size(), kLabelSwitchIndexNum); | |||
return INTERNAL_ERROR; | |||
} | |||
if (davinci_model->IsKnownNode()) { | |||
index_value_ = davinci_model->GetCurrentFixedAddr(fixed_addr_offset_); | |||
} else { | |||
index_value_ = input_data_addr[0]; | |||
} | |||
index_value_ = input_data_addr[0]; | |||
davinci_model->DisableZeroCopy(index_value_); | |||
std::vector<uint32_t> label_idx_list; | |||
@@ -130,28 +124,5 @@ Status LabelSwitchByIndexTaskInfo::Distribute() { | |||
return SUCCESS; | |||
} | |||
Status LabelSwitchByIndexTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GE_CHECK_NOTNULL(davinci_model); | |||
auto label_switch = task_def.label_switch_by_index(); | |||
uint32_t op_index = label_switch.op_index(); | |||
GELOGI("Begin to calculate args, op_index is: %u", op_index); | |||
auto op_desc = davinci_model->GetOpByIndex(op_index); | |||
GE_CHECK_NOTNULL(op_desc); | |||
GELOGI("Calc opType[%s] args size. Node name is [%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); | |||
if (op_desc->GetInputsSize() != kLabelSwitchIndexNum) { | |||
GELOGE(FAILED, "Label switch op only have one data input. Now input size is %zu", op_desc->GetInputsSize()); | |||
return FAILED; | |||
} | |||
string input_tensor_name = op_desc->GetInputNameByIndex(0); | |||
fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(input_tensor_name); | |||
auto tensor_desc = op_desc->GetInputDesc(0); | |||
int64_t tensor_size = 0; | |||
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); | |||
davinci_model->SetTotalFixedAddrsSize(input_tensor_name, tensor_size); | |||
GELOGI("Calculate stream switchn task args , tensor_size %ld, fixed_addr_offset %ld", tensor_size, | |||
fixed_addr_offset_); | |||
return SUCCESS; | |||
} | |||
REGISTER_TASK_INFO(RT_MODEL_TASK_STREAM_LABEL_SWITCH_BY_INDEX, LabelSwitchByIndexTaskInfo); | |||
} // namespace ge |
@@ -22,8 +22,7 @@ | |||
namespace ge { | |||
class LabelSwitchByIndexTaskInfo : public TaskInfo { | |||
public: | |||
LabelSwitchByIndexTaskInfo() | |||
: index_value_(nullptr), branch_max_(0), args_(nullptr), args_size_(0), fixed_addr_offset_(0) {} | |||
LabelSwitchByIndexTaskInfo() : index_value_(nullptr), branch_max_(0), args_(nullptr), args_size_(0) {} | |||
~LabelSwitchByIndexTaskInfo() override; | |||
@@ -31,15 +30,13 @@ class LabelSwitchByIndexTaskInfo : public TaskInfo { | |||
Status Distribute() override; | |||
Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; | |||
private: | |||
void *index_value_; // switch index input. | |||
uint32_t branch_max_; // max branch count. | |||
void *args_; // label info memory. | |||
uint32_t args_size_; // label info length. | |||
std::vector<rtLabel_t> label_list_; | |||
int64_t fixed_addr_offset_; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_SWITCH_BY_INDEX_TASK_INFO_H_ |
@@ -21,9 +21,9 @@ | |||
namespace ge { | |||
Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GELOGI("MemcpyAddrAsyncTaskInfo Init Start"); | |||
GELOGI("MemcpyAddrAsyncTaskInfo Init Start."); | |||
if (davinci_model == nullptr) { | |||
GELOGE(PARAM_INVALID, "davinci_model is null"); | |||
GELOGE(PARAM_INVALID, "davinci_model is null!"); | |||
return PARAM_INVALID; | |||
} | |||
@@ -32,27 +32,45 @@ Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel | |||
return ret; | |||
} | |||
const auto &memcpy_async = task_def.memcpy_async(); | |||
OpDescPtr op_desc = davinci_model->GetOpByIndex(memcpy_async.op_index()); | |||
auto memcpy_async_def = task_def.memcpy_async(); | |||
uint32_t op_index = memcpy_async_def.op_index(); | |||
OpDescPtr op_desc = davinci_model->GetOpByIndex(op_index); | |||
if (op_desc == nullptr) { | |||
GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async.op_index()); | |||
GELOGE(INTERNAL_ERROR, "Init MemcpyAddrAsyncTaskInfo error, index is out of range!"); | |||
return INTERNAL_ERROR; | |||
} | |||
ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.src(), src_); | |||
uint64_t logic_dst = memcpy_async_def.dst(); | |||
uint64_t logic_src = memcpy_async_def.src(); | |||
dst_max_ = memcpy_async_def.dst_max(); | |||
uint64_t update_base_addr = 0; | |||
ret = GetUpdateBaseAddr(davinci_model, logic_src, update_base_addr); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
} | |||
src_ = reinterpret_cast<uint8_t *>(update_base_addr + logic_src); | |||
if (src_ == nullptr) { | |||
GELOGE(PARAM_INVALID, "src_ is null!"); | |||
return PARAM_INVALID; | |||
} | |||
ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.dst(), dst_); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
uint64_t mem_base = reinterpret_cast<uint64_t>(davinci_model->MemBase()); | |||
uint64_t logic_mem_base = davinci_model->GetRtBaseAddr(); | |||
dst_ = reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(mem_base + (logic_dst - logic_mem_base))); | |||
if (dst_ == nullptr) { | |||
GELOGE(PARAM_INVALID, "dst_ is null!"); | |||
return PARAM_INVALID; | |||
} | |||
vector<void *> io_addrs; | |||
io_addrs.emplace_back(src_); | |||
io_addrs.emplace_back(dst_); | |||
count_ = memcpy_async_def.count(); | |||
kind_ = memcpy_async_def.kind(); | |||
// malloc args memory | |||
size_t args_size = sizeof(void *) * io_addrs.size(); | |||
rtError_t rt_ret = rtMalloc(&args_, args_size, RT_MEMORY_HBM); | |||
@@ -70,18 +88,20 @@ Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel | |||
return RT_FAILED; | |||
} | |||
count_ = memcpy_async.count(); | |||
kind_ = memcpy_async.kind(); | |||
dst_max_ = memcpy_async.dst_max(); | |||
GELOGI("InitMemcpyAddrAsyncTaskInfo, logic[0x%lx, 0x%lx], src:%p, dst:%p, max:%lu, count:%lu, args:%p, size:%zu", | |||
memcpy_async.src(), memcpy_async.dst(), src_, dst_, dst_max_, count_, args_, args_size); | |||
// Just dest addr need zero copy. | |||
davinci_model->SetZeroCopyAddr(op_desc, {dst_}, io_addrs.data(), args_, args_size, sizeof(void *)); | |||
GELOGI("InitMemcpyAddrAsyncTaskInfo, logic_src:%p, logic_dst:%p, src:%p, dst:%p, src_args:%p, dst_args:%p", | |||
reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(logic_src)), | |||
reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(logic_dst)), src_, dst_, args_, | |||
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(args_) + args_size)); | |||
davinci_model->SetZeroCopyAddr(op_desc, io_addrs, io_addrs.data(), args_, args_size, 0); | |||
return SUCCESS; | |||
} | |||
Status MemcpyAddrAsyncTaskInfo::Distribute() { | |||
GELOGI("MemcpyAddrAsyncTaskInfo Distribute Start, dst_max:%lu, count:%lu, kind:%u", dst_max_, count_, kind_); | |||
GELOGI("MemcpyAddrAsyncTaskInfo Distribute Start."); | |||
GELOGI("Distribute MemcpyAddrAsync, dst_max:%lu, count:%lu, kind:%u.", dst_max_, count_, kind_); | |||
rtError_t rt_ret = rtMemcpyAsync(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(args_) + sizeof(void *)), | |||
dst_max_, args_, count_, static_cast<rtMemcpyKind_t>(kind_), stream_); | |||
@@ -93,5 +113,39 @@ Status MemcpyAddrAsyncTaskInfo::Distribute() { | |||
return SUCCESS; | |||
} | |||
Status MemcpyAddrAsyncTaskInfo::GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, | |||
uint64_t &base_addr) { | |||
GE_CHECK_NOTNULL(davinci_model); | |||
uint64_t data_base_addr = | |||
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->MemBase())) - davinci_model->GetRtBaseAddr(); | |||
uint64_t weight_base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->WeightsMemBase())) - | |||
davinci_model->GetRtWeightAddr(); | |||
uint64_t var_base_addr = | |||
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->VarMemBase())) - davinci_model->GetRtVarAddr(); | |||
uint64_t data_base_addr_start = davinci_model->GetRtBaseAddr(); | |||
uint64_t data_base_addr_end = davinci_model->GetRtBaseAddr() + davinci_model->TotalMemSize(); | |||
uint64_t wight_base_addr_start = davinci_model->GetRtWeightAddr(); | |||
uint64_t wight_base_addr_end = davinci_model->GetRtWeightAddr() + davinci_model->TotalWeightsMemSize(); | |||
uint64_t varible_base_addr_start = davinci_model->GetRtVarAddr(); | |||
uint64_t varible_base_addr_end = davinci_model->GetRtVarAddr() + davinci_model->TotalVarMemSize(); | |||
if ((data_base_addr_start <= update_addr) && (update_addr <= data_base_addr_end)) { | |||
base_addr = data_base_addr; | |||
GELOGI("The update_addr is data address."); | |||
} else if ((wight_base_addr_start <= update_addr) && (update_addr <= wight_base_addr_end)) { | |||
base_addr = weight_base_addr; | |||
GELOGI("The update_addr is weight address."); | |||
} else if ((varible_base_addr_start <= update_addr) && (update_addr <= varible_base_addr_end)) { | |||
base_addr = var_base_addr; | |||
GELOGI("The update_addr is variable address."); | |||
} else if (update_addr != 0) { | |||
base_addr = 0; | |||
GELOGE(PARAM_INVALID, "The update_addr is abnormal."); | |||
return PARAM_INVALID; | |||
} | |||
return SUCCESS; | |||
} | |||
REGISTER_TASK_INFO(RT_MODEL_TASK_MEMCPY_ADDR_ASYNC, MemcpyAddrAsyncTaskInfo); | |||
} // namespace ge |
@@ -16,7 +16,6 @@ | |||
#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ | |||
#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ | |||
#include "graph/load/new_model_manager/task_info/task_info.h" | |||
namespace ge { | |||
@@ -33,8 +32,9 @@ class MemcpyAddrAsyncTaskInfo : public TaskInfo { | |||
if (ret != RT_ERROR_NONE) { | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", ret); | |||
} | |||
args_ = nullptr; | |||
} | |||
args_ = nullptr; | |||
} | |||
Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; | |||
@@ -42,9 +42,11 @@ class MemcpyAddrAsyncTaskInfo : public TaskInfo { | |||
Status Distribute() override; | |||
private: | |||
uint8_t *dst_; | |||
Status GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, uint64_t &base_addr); | |||
void *dst_; | |||
uint64_t dst_max_; | |||
uint8_t *src_; | |||
void *src_; | |||
void *args_; | |||
uint64_t count_; | |||
uint32_t kind_; | |||
@@ -21,9 +21,9 @@ | |||
namespace ge { | |||
Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GELOGI("MemcpyAsyncTaskInfo Init Start"); | |||
GELOGI("MemcpyAsyncTaskInfo Init Start."); | |||
if (davinci_model == nullptr) { | |||
GELOGE(PARAM_INVALID, "davinci_model is null"); | |||
GELOGE(PARAM_INVALID, "davinci_model is null!"); | |||
return PARAM_INVALID; | |||
} | |||
@@ -32,38 +32,35 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da | |||
return ret; | |||
} | |||
memcpy_async = task_def.memcpy_async(); | |||
count_ = memcpy_async.count(); | |||
kind_ = memcpy_async.kind(); | |||
dst_max_ = memcpy_async.dst_max(); | |||
if (davinci_model->IsKnownNode()) { | |||
src_ = reinterpret_cast<uint8_t *>(davinci_model_->GetCurrentArgsAddr(args_offset_)); | |||
dst_ = reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(src_) + sizeof(void *)); | |||
// for zero copy | |||
kind_ = RT_MEMCPY_ADDR_DEVICE_TO_DEVICE; | |||
GELOGI("MemcpyAsyncTaskInfo src_ %p, dst_ %p, args_offset %u.", src_, dst_, args_offset_); | |||
return SUCCESS; | |||
} | |||
ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.src(), src_); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
} | |||
auto memcpy_async_def = task_def.memcpy_async(); | |||
uint64_t logic_dst = memcpy_async_def.dst(); | |||
uint64_t logic_src = memcpy_async_def.src(); | |||
dst_max_ = memcpy_async_def.dst_max(); | |||
ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.dst(), dst_); | |||
uint64_t update_base_addr = 0; | |||
ret = GetUpdateBaseAddr(davinci_model, logic_src, update_base_addr); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
} | |||
src_ = reinterpret_cast<uint8_t *>(update_base_addr + logic_src); | |||
davinci_model->DisableZeroCopy(src_); | |||
GELOGI("MemcpyAsyncTaskInfo Init Success, logic[0x%lx, 0x%lx], src:%p, dst:%p, max:%lu, count:%lu", | |||
memcpy_async.src(), memcpy_async.dst(), src_, dst_, dst_max_, count_); | |||
uint64_t mem_base = reinterpret_cast<uint64_t>(davinci_model->MemBase()); | |||
uint64_t logic_mem_base = davinci_model->GetRtBaseAddr(); | |||
dst_ = reinterpret_cast<uint8_t *>(mem_base + (logic_dst - logic_mem_base)); | |||
count_ = memcpy_async_def.count(); | |||
kind_ = memcpy_async_def.kind(); | |||
GELOGI("MemcpyAsyncTaskInfo Init Success, logic_src:%p, logic_dst:%p, src:%p, dst:%p", | |||
reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(logic_src)), | |||
reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(logic_dst)), src_, dst_); | |||
davinci_model->DisableZeroCopy(src_); | |||
davinci_model->DisableZeroCopy(dst_); | |||
return SUCCESS; | |||
} | |||
Status MemcpyAsyncTaskInfo::Distribute() { | |||
GELOGI("MemcpyAsyncTaskInfo Distribute Start. dst_max:%lu, count:%lu, kind:%u", dst_max_, count_, kind_); | |||
GELOGI("MemcpyAsyncTaskInfo Distribute Start. dst_max:%lu, count:%lu, kind:%u.", dst_max_, count_, kind_); | |||
rtError_t rt_ret = rtMemcpyAsync(dst_, dst_max_, src_, count_, static_cast<rtMemcpyKind_t>(kind_), stream_); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
@@ -71,41 +68,40 @@ Status MemcpyAsyncTaskInfo::Distribute() { | |||
return RT_FAILED; | |||
} | |||
GELOGI("MemcpyAsyncTaskInfo Distribute Success"); | |||
return SUCCESS; | |||
} | |||
Status MemcpyAsyncTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
// the num of src and dst size is 2 | |||
uint32_t args_size = sizeof(void *) * 2; | |||
args_offset_ = davinci_model->GetTotalArgsSize(); | |||
davinci_model->SetTotalArgsSize(args_size); | |||
davinci_model_ = davinci_model; | |||
GELOGI("MemcpyAsyncTaskInfo kernel args_size %u, args_offset %u", args_size, args_offset_); | |||
GELOGI("MemcpyAsyncTaskInfo Distribute Success."); | |||
return SUCCESS; | |||
} | |||
Status MemcpyAsyncTaskInfo::UpdateArgs() { | |||
GELOGI("MemcpyAsyncTaskInfo::UpdateArgs in."); | |||
GE_CHECK_NOTNULL(davinci_model_); | |||
Status ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async.src(), src_); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
} | |||
ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async.dst(), dst_); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
Status MemcpyAsyncTaskInfo::GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, uint64_t &base_addr) { | |||
GE_CHECK_NOTNULL(davinci_model); | |||
uint64_t data_base_addr = | |||
reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->MemBase())) - davinci_model->GetRtBaseAddr(); | |||
uint64_t weight_base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->WeightsMemBase())) - | |||
davinci_model->GetRtWeightAddr(); | |||
uint64_t var_base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->VarMemBase())) - | |||
davinci_model->GetRtVarAddr(); | |||
uint64_t data_base_addr_start = davinci_model->GetRtBaseAddr(); | |||
uint64_t data_base_addr_end = davinci_model->GetRtBaseAddr() + davinci_model->TotalMemSize(); | |||
uint64_t wight_base_addr_start = davinci_model->GetRtWeightAddr(); | |||
uint64_t wight_base_addr_end = davinci_model->GetRtWeightAddr() + davinci_model->TotalWeightsMemSize(); | |||
uint64_t varible_base_addr_start = davinci_model->GetRtVarAddr(); | |||
uint64_t varible_base_addr_end = davinci_model->GetRtVarAddr() + davinci_model->TotalVarMemSize(); | |||
if ((data_base_addr_start <= update_addr) && (update_addr <= data_base_addr_end)) { | |||
base_addr = data_base_addr; | |||
GELOGI("The update_addr is data address."); | |||
} else if ((wight_base_addr_start <= update_addr) && (update_addr <= wight_base_addr_end)) { | |||
base_addr = weight_base_addr; | |||
GELOGI("The update_addr is weight address."); | |||
} else if ((varible_base_addr_start <= update_addr) && (update_addr <= varible_base_addr_end)) { | |||
base_addr = var_base_addr; | |||
GELOGI("The update_addr is variable address."); | |||
} else if (update_addr != 0) { | |||
base_addr = 0; | |||
GELOGE(PARAM_INVALID, "The update_addr is abnormal."); | |||
return PARAM_INVALID; | |||
} | |||
vector<void *> io_addrs; | |||
io_addrs.emplace_back(reinterpret_cast<void *>(src_)); | |||
io_addrs.emplace_back(reinterpret_cast<void *>(dst_)); | |||
GE_CHK_STATUS_RET(davinci_model_->UpdateKnownZeroCopyAddr(io_addrs, args_offset_), | |||
"update memcpyasync in known node zero copy addr failed."); | |||
GELOGI("MemcpyAsyncTaskInfo::UpdateArgs success."); | |||
return SUCCESS; | |||
} | |||
@@ -16,7 +16,6 @@ | |||
#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ASYNC_TASK_INFO_H_ | |||
#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ASYNC_TASK_INFO_H_ | |||
#include "graph/load/new_model_manager/task_info/task_info.h" | |||
namespace ge { | |||
@@ -33,19 +32,14 @@ class MemcpyAsyncTaskInfo : public TaskInfo { | |||
Status Distribute() override; | |||
Status UpdateArgs() override; | |||
Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; | |||
private: | |||
uint8_t *dst_; | |||
Status GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, uint64_t &base_addr); | |||
void *dst_; | |||
uint64_t dst_max_; | |||
uint8_t *src_; | |||
void *src_; | |||
uint64_t count_; | |||
uint32_t kind_; | |||
DavinciModel *davinci_model_ = nullptr; | |||
uint32_t args_offset_ = 0; | |||
domi::MemcpyAsyncDef memcpy_async; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ASYNC_TASK_INFO_H_ |
@@ -42,11 +42,16 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d | |||
auto stream_switch_def = task_def.stream_switch(); | |||
uint32_t op_index = stream_switch_def.op_index(); | |||
// get StreamSwitch op | |||
OpDescPtr op_desc = davinci_model->GetOpByIndex(op_index); | |||
GE_CHECK_NOTNULL(op_desc); | |||
auto input_data_addr = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); | |||
SetInputAndValuePtr(davinci_model, input_data_addr); | |||
if (!input_data_addr.empty() && input_data_addr.size() >= STREAM_SWITCH_INPUT_NUM) { | |||
input_ptr_ = input_data_addr[0]; | |||
value_ptr_ = input_data_addr[1]; | |||
} | |||
uint32_t cond = 0; | |||
if (!AttrUtils::GetInt(op_desc, ATTR_NAME_STREAM_SWITCH_COND, cond)) { | |||
GELOGE(INTERNAL_ERROR, "StreamSwitchOp get attr STREAM_SWITCH_COND fail."); | |||
@@ -110,42 +115,6 @@ Status StreamSwitchTaskInfo::Distribute() { | |||
GELOGI("StreamSwitchTaskInfo Distribute Success. cond:%d, stream:%p, datatype:%d.", cond_, true_stream_, data_type_); | |||
return SUCCESS; | |||
} | |||
Status StreamSwitchTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GE_CHECK_NOTNULL(davinci_model); | |||
auto stream_switch_def = task_def.stream_switch(); | |||
uint32_t op_index = stream_switch_def.op_index(); | |||
GELOGI("Begin to calculate args, op_index is: %u", op_index); | |||
auto op_desc = davinci_model->GetOpByIndex(op_index); | |||
GE_CHECK_NOTNULL(op_desc); | |||
GELOGI("Calc opType[%s] args size. Node name is [%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); | |||
if (op_desc->GetInputsSize() != STREAM_SWITCH_INPUT_NUM) { | |||
GELOGE(FAILED, "Stream switch op only have one data input. Now input size is %zu", op_desc->GetInputsSize()); | |||
return FAILED; | |||
} | |||
for (uint32_t i = 0; i < STREAM_SWITCH_INPUT_NUM; ++i) { | |||
string input_tensor_name = op_desc->GetInputNameByIndex(i); | |||
int64_t fixed_addr_offset = davinci_model->GetFixedAddrsSize(input_tensor_name); | |||
fixed_addr_offset_.emplace_back(fixed_addr_offset); | |||
auto tensor_desc = op_desc->GetInputDesc(i); | |||
int64_t tensor_size = 0; | |||
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); | |||
davinci_model->SetTotalFixedAddrsSize(input_tensor_name, tensor_size); | |||
GELOGI("Calculate stream switch task args , tensor size is %ld, fixed addr[%u] offset %ld", tensor_size, i, | |||
fixed_addr_offset); | |||
} | |||
return SUCCESS; | |||
} | |||
void StreamSwitchTaskInfo::SetInputAndValuePtr(DavinciModel *davinci_model, const vector<void *> &input_data_addrs) { | |||
if (davinci_model->IsKnownNode() && fixed_addr_offset_.size() == STREAM_SWITCH_INPUT_NUM) { | |||
input_ptr_ = davinci_model->GetCurrentFixedAddr(fixed_addr_offset_[0]); | |||
value_ptr_ = davinci_model->GetCurrentFixedAddr(fixed_addr_offset_[1]); | |||
} else { | |||
if (!input_data_addrs.empty() && input_data_addrs.size() >= STREAM_SWITCH_INPUT_NUM) { | |||
input_ptr_ = input_data_addrs[0]; | |||
value_ptr_ = input_data_addrs[1]; | |||
} | |||
} | |||
} | |||
REGISTER_TASK_INFO(RT_MODEL_TASK_STREAM_SWITCH, StreamSwitchTaskInfo); | |||
} // namespace ge |
@@ -39,18 +39,13 @@ class StreamSwitchTaskInfo : public TaskInfo { | |||
Status Distribute() override; | |||
Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; | |||
private: | |||
void SetInputAndValuePtr(DavinciModel *davinci_model, const vector<void *> &input_data_addrs); | |||
void *input_ptr_; | |||
rtCondition_t cond_; | |||
void *value_ptr_; | |||
rtStream_t true_stream_; | |||
uint32_t true_stream_id_; | |||
rtSwitchDataType_t data_type_; | |||
static const uint32_t kInputNum = 2; | |||
vector<int64_t> fixed_addr_offset_; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_STREAM_SWITCH_TASK_INFO_H_ |
@@ -24,15 +24,18 @@ | |||
namespace { | |||
const uint32_t kDynamicBtachParamNum = 1; | |||
const uint32_t kDynamicResolutionParamNum = 2; | |||
const uint8_t kStreamSwitchnInputNum = 1; | |||
} // namespace | |||
namespace ge { | |||
Status StreamSwitchNTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GELOGI("StreamSwitchNTaskInfo Init Start."); | |||
GE_CHECK_NOTNULL(davinci_model); | |||
if (davinci_model == nullptr) { | |||
GELOGE(PARAM_INVALID, "davinci_model is null!"); | |||
return PARAM_INVALID; | |||
} | |||
if (SetStream(task_def.stream_id(), davinci_model->GetStreamList()) != SUCCESS) { | |||
Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); | |||
if (ret != SUCCESS) { | |||
return FAILED; | |||
} | |||
@@ -72,16 +75,14 @@ Status StreamSwitchNTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel * | |||
GELOGE(FAILED, "Get true stream ptr of switchN op failed."); | |||
return FAILED; | |||
} | |||
if (davinci_model->IsKnownNode()) { | |||
input_ptr_ = davinci_model->GetCurrentFixedAddr(args_offset_); | |||
} else { | |||
auto input_data_addr = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); | |||
if (input_data_addr.empty()) { | |||
GELOGE(FAILED, "Input data addr is nullptr."); | |||
return FAILED; | |||
} | |||
input_ptr_ = input_data_addr[0]; | |||
// set input_ptr_ | |||
auto input_data_addr = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); | |||
if (input_data_addr.empty()) { | |||
GELOGE(FAILED, "Input data addr is nullptr."); | |||
return FAILED; | |||
} | |||
input_ptr_ = input_data_addr[0]; | |||
davinci_model->DisableZeroCopy(input_ptr_); | |||
GELOGI("StreamSwitchNTaskInfo Init Success, inputSize:%u, elementSize:%d, trueStreamID:%ld.", input_size_, | |||
element_size_, op_desc->GetStreamId()); | |||
@@ -139,26 +140,5 @@ Status StreamSwitchNTaskInfo::GetTrueStreamPtr(const OpDescPtr &op_desc, Davinci | |||
return SUCCESS; | |||
} | |||
Status StreamSwitchNTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { | |||
GE_CHECK_NOTNULL(davinci_model); | |||
auto stream_switchn_def = task_def.stream_switch_n(); | |||
uint32_t op_index = stream_switchn_def.op_index(); | |||
GELOGI("Begin to calculate args, op_index is: %u", op_index); | |||
auto op_desc = davinci_model->GetOpByIndex(op_index); | |||
GE_CHECK_NOTNULL(op_desc); | |||
GELOGI("Calc opType[%s] args size. Node name is [%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); | |||
if (op_desc->GetInputsSize() != kStreamSwitchnInputNum) { | |||
GELOGE(FAILED, "Stream switchn op only have one data input. Now input size is %zu", op_desc->GetInputsSize()); | |||
return FAILED; | |||
} | |||
string input_tensor_name = op_desc->GetInputNameByIndex(0); | |||
args_offset_ = davinci_model->GetFixedAddrsSize(input_tensor_name); | |||
auto tensor_desc = op_desc->GetInputDesc(0); | |||
int64_t tensor_size = 0; | |||
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); | |||
davinci_model->SetTotalFixedAddrsSize(input_tensor_name, tensor_size); | |||
GELOGI("Calculate stream switchn task args , tensor_size %ld, args_offset %ld", tensor_size, args_offset_); | |||
return SUCCESS; | |||
} | |||
REGISTER_TASK_INFO(RT_MODEL_TASK_STREAM_SWITCH_N, StreamSwitchNTaskInfo); | |||
} // namespace ge |
@@ -29,8 +29,7 @@ class StreamSwitchNTaskInfo : public TaskInfo { | |||
value_ptr_(nullptr), | |||
true_stream_ptr_(nullptr), | |||
element_size_(0), | |||
data_type_(RT_SWITCH_INT64), | |||
args_offset_(0) {} | |||
data_type_(RT_SWITCH_INT64) {} | |||
~StreamSwitchNTaskInfo() override {} | |||
@@ -38,8 +37,6 @@ class StreamSwitchNTaskInfo : public TaskInfo { | |||
Status Distribute() override; | |||
Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; | |||
private: | |||
Status GetTrueStreamPtr(const OpDescPtr &op_desc, DavinciModel *davinci_model); | |||
void *input_ptr_; | |||
@@ -50,7 +47,6 @@ class StreamSwitchNTaskInfo : public TaskInfo { | |||
rtSwitchDataType_t data_type_; | |||
vector<rtStream_t> true_stream_list_; | |||
vector<int64_t> value_list_; | |||
int64_t args_offset_; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_STREAM_SWITCHN_TASK_INFO_H_ |
@@ -34,13 +34,22 @@ class SuperKernel { | |||
public: | |||
SuperKernel(const void *stub, void *ptr, uint64_t sz, uint32_t dim) | |||
: func_stub_(stub), dev_nav_table_(ptr), nav_table_size_(sz), block_dim_(dim) {} | |||
~SuperKernel() = default; | |||
~SuperKernel() { | |||
// free memory when all releasing | |||
if (device_args_addr_ != nullptr) { | |||
GE_CHK_RT(rtFree(device_args_addr_)); | |||
GELOGI("SKT: super_kernel args addr free."); | |||
} | |||
if (dev_nav_table_ != nullptr) { | |||
GE_CHK_RT(rtFree(dev_nav_table_)); | |||
GELOGI("SKT: super_kernel args addr free."); | |||
} | |||
} | |||
Status Launch(rtStream_t stream, uint32_t dump_flag); | |||
const void *GetFuncStub() const { return func_stub_; } | |||
const void *GetNavTablePtr() const { return dev_nav_table_; } | |||
uint64_t GetNavTableSize() const { return nav_table_size_; } | |||
uint32_t GetBlockDim() const { return block_dim_; } | |||
void *GetNavTablePtr() const { return dev_nav_table_; } | |||
void *GetDeviceArgsPtr() const { return device_args_addr_; } | |||
}; | |||
} // namespace skt | |||
} // namespace ge | |||
@@ -42,10 +42,21 @@ Status SuperKernelFactory::Init() { | |||
rt_ret = rtGetAddrByFun(this->func_stub_, &this->func_ptr_); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); | |||
return FAILED;) | |||
GELOGD( | |||
"SKT: fuseKernels super_kernel_template subFunc %p, device func " | |||
"address %p", | |||
this->func_stub_, this->func_ptr_); | |||
if (this->use_physical_address_ != nullptr) { | |||
void *skt_func = nullptr; | |||
rt_ret = rtKernelConfigTransArg(this->func_ptr_, sizeof(uint64_t), 0, &skt_func); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); | |||
return FAILED;) | |||
GELOGD( | |||
"SKT: fuseKernels super_kernel_template subFunc %p, device func " | |||
"address %p, device physic PC %p", | |||
this->func_stub_, this->func_ptr_, skt_func); | |||
} else { | |||
GELOGD( | |||
"SKT: fuseKernels super_kernel_template subFunc %p, device func " | |||
"address %p", | |||
this->func_stub_, this->func_ptr_); | |||
} | |||
} | |||
is_init_ = true; | |||
@@ -60,8 +71,7 @@ Status SuperKernelFactory::Uninitialize() { | |||
} | |||
Status SuperKernelFactory::FuseKernels(const std::vector<void *> &stub_func_list, | |||
const std::vector<void *> &args_addr_list, uint32_t block_dim, | |||
std::unique_ptr<skt::SuperKernel> &h) { | |||
const std::vector<void *> &args_addr_list, uint32_t block_dim, SuperKernel *&h) { | |||
// Iterate through the ops to be fused | |||
// Each subkernel to be fused contains 2 fields: fn address offset, args | |||
// address. | |||
@@ -91,28 +101,70 @@ Status SuperKernelFactory::FuseKernels(const std::vector<void *> &stub_func_list | |||
rtError_t rt_ret; | |||
void *hbm_nav_table_addr = nullptr; | |||
for (unsigned i = 0; i < stub_func_list.size(); i++) { | |||
void *sub_device_func = nullptr; | |||
rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); | |||
return FAILED;) | |||
GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], sub_device_func); | |||
// store two uint64_t address | |||
// address divided by 4 because of 32bits encoding, call offset will *4 when calculating | |||
nav_table[i * 2] = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(sub_device_func)) / 4; | |||
GELOGD("SKT: CALL offet %lu", nav_table[i * 2]); | |||
nav_table[i * 2 + 1] = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(args_addr_list[i])); | |||
GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * 2 + 1]); | |||
if (this->use_physical_address_ != nullptr) { | |||
for (unsigned i = 0; i < stub_func_list.size(); i++) { | |||
void *sub_device_func = nullptr; | |||
rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); | |||
return FAILED;) | |||
void *sub_device_func_pys = nullptr; | |||
void *args_addr_pys = nullptr; | |||
rt_ret = rtKernelConfigTransArg(sub_device_func, sizeof(uint64_t), 0, &sub_device_func_pys); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); | |||
return FAILED;) | |||
rt_ret = rtKernelConfigTransArg(args_addr_list[i], sizeof(uint64_t), 0, &args_addr_pys); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); | |||
return FAILED;) | |||
GELOGD( | |||
"SKT: fuseKernels subFunc %p, device func address %p, device " | |||
"physic func address %p", | |||
stub_func_list[i], sub_device_func, sub_device_func_pys); | |||
// store two uint64_t address | |||
// address divided by 4 because of 32bits encoding, call offset will *4 when calculating | |||
nav_table[i * 2] = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(sub_device_func_pys)) / 4; | |||
GELOGD("SKT: CALL offset %lu", nav_table[i * 2]); | |||
nav_table[i * 2 + 1] = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(args_addr_pys)); | |||
GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * 2 + 1]); | |||
} | |||
void *hbm_nav_table_addr_pys = nullptr; | |||
rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;) | |||
rt_ret = | |||
rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. error: 0x%X", rt_ret); | |||
GE_CHK_RT(rtFree(hbm_nav_table_addr)); return FAILED;) | |||
rt_ret = rtKernelConfigTransArg(hbm_nav_table_addr, sizeof(uint64_t), 0, &hbm_nav_table_addr_pys); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); | |||
GE_CHK_RT(rtFree(hbm_nav_table_addr)); return FAILED;) | |||
GELOGD("SKT: hbm_nav_table_addr %p, hbm_nav_table_addr_pys %p", hbm_nav_table_addr, hbm_nav_table_addr_pys); | |||
// Create the necessary metadata for the super kernel | |||
h = new SuperKernel(this->func_stub_, hbm_nav_table_addr_pys, nav_table_size, block_dim); | |||
} else { | |||
for (unsigned i = 0; i < stub_func_list.size(); i++) { | |||
void *sub_device_func = nullptr; | |||
rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); | |||
return FAILED;) | |||
GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], sub_device_func); | |||
// store two uint64_t address | |||
// address divided by 4 because of 32bits encoding, call offset will *4 when calculating | |||
nav_table[i * 2] = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(sub_device_func)) / 4; | |||
GELOGD("SKT: CALL offet %lu", nav_table[i * 2]); | |||
nav_table[i * 2 + 1] = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(args_addr_list[i])); | |||
GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * 2 + 1]); | |||
} | |||
rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;) | |||
rt_ret = | |||
rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. error: 0x%X", rt_ret); | |||
GE_CHK_RT(rtFree(hbm_nav_table_addr)); return FAILED;) | |||
// Create the necessary metadata for the super kernel | |||
h = new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim); | |||
} | |||
rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;) | |||
rt_ret = | |||
rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); | |||
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. error: 0x%X", rt_ret); | |||
GE_CHK_RT(rtFree(hbm_nav_table_addr)); return FAILED;) | |||
// Create the necessary metadata for the super kernel | |||
h = | |||
std::unique_ptr<skt::SuperKernel>(new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim)); | |||
return SUCCESS; | |||
} | |||
} // namespace skt | |||
@@ -29,6 +29,7 @@ class SuperKernelFactory { | |||
void *func_ptr_ = nullptr; | |||
void *handle_ = nullptr; | |||
std::string sk_stub_name_ = "_Z21super_kernel_templatePmm"; | |||
const char *use_physical_address_ = getenv("GE_USE_PHYSICAL_ADDRESS"); | |||
bool is_init_ = false; | |||
SuperKernelFactory(){}; | |||
~SuperKernelFactory() { | |||
@@ -47,7 +48,7 @@ class SuperKernelFactory { | |||
Status Init(); | |||
Status Uninitialize(); | |||
Status FuseKernels(const std::vector<void *> &stub_func_list, const std::vector<void *> &args_addr_list, | |||
uint32_t block_dim, std::unique_ptr<skt::SuperKernel> &h); | |||
uint32_t block_dim, SuperKernel *&h); | |||
}; | |||
} // namespace skt | |||
} // namespace ge | |||
@@ -72,8 +72,6 @@ class TaskInfo { | |||
virtual uint32_t GetTaskID() { return 0xFFFFFFFF; } | |||
virtual bool CallSaveDumpInfo() { return false; } | |||
virtual uint32_t GetStreamId() { return 0xFFFFFFFF; } | |||
virtual uintptr_t GetDumpArgs() { return 0; } | |||
@@ -86,5 +86,5 @@ class TaskInfoFactory { | |||
return ptr; \ | |||
} \ | |||
TaskInfoFactory::Registerar g_##type##_Task_Info_Creator(type, Creator_##type##_Task_Info); | |||
} // namespace ge | |||
}; // namespace ge | |||
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_TASK_INFO_FACTORY_H_ |
@@ -129,6 +129,12 @@ Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, const DataBuffer &data, | |||
} | |||
auto dst_addr = static_cast<uint8_t *>(data.data); | |||
auto dst_size = static_cast<uint64_t>(data.length); | |||
if (ModelUtils::ConvertVirtualAddressToPhysical(dst_addr, dst_size, dst_addr) != SUCCESS) { | |||
GELOGE(FAILED, "[ZCPY] Convert virtual address to physical for dst_addr failed."); | |||
return FAILED; | |||
} | |||
GELOGI("[ZCPY] %s update task, args: %p, size: %zu, offset: %zu, addr: 0x%lx, length: %u", name_.c_str(), | |||
args_addr_, args_size_, offset, addr, data.length); | |||
*(uintptr_t *)(args_info + offset) = reinterpret_cast<uintptr_t>(dst_addr); | |||
@@ -0,0 +1,175 @@ | |||
/** | |||
* Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#include "graph/load/output/output.h" | |||
#include <memory.h> | |||
#include "common/properties_manager.h" | |||
#include "graph/load/new_model_manager/davinci_model.h" | |||
#include "graph/manager/graph_var_manager.h" | |||
#include "graph/utils/op_desc_utils.h" | |||
#include "graph/utils/tensor_utils.h" | |||
namespace ge { | |||
Output::Output(const OpDescPtr &op_desc, DavinciModel *model) | |||
: base_(nullptr), | |||
var_base_(nullptr), | |||
logic_base_(0), | |||
logic_var_base_(0), | |||
model_(model), | |||
op_desc_(op_desc), | |||
input_num_(0) {} | |||
Output::~Output() { | |||
var_base_ = nullptr; | |||
base_ = nullptr; | |||
model_ = nullptr; | |||
} | |||
/// | |||
/// @ingroup domi | |||
/// @brief Initialize input/output params | |||
/// @return Status | |||
/// | |||
Status Output::Init() { | |||
if (op_desc_ == nullptr || model_ == nullptr) { | |||
GELOGE(INTERNAL_ERROR, "The op_desc_ or model_ is nullptr."); | |||
return INTERNAL_ERROR; | |||
} | |||
base_ = model_->MemBase(); | |||
var_base_ = model_->VarMemBase(); | |||
logic_base_ = model_->GetRtBaseAddr(); | |||
logic_var_base_ = model_->GetRtVarAddr(); | |||
input_num_ = op_desc_->GetInputsSize(); | |||
v_input_size_.clear(); | |||
v_input_data_addr_.clear(); | |||
auto input_vector = op_desc_->GetInputOffset(); | |||
if (input_num_ != input_vector.size()) { | |||
GELOGE(INTERNAL_ERROR, "input desc size: %zu != input offset size: %zu.", input_num_, input_vector.size()); | |||
return INTERNAL_ERROR; | |||
} | |||
for (size_t i = 0; i < input_num_; i++) { | |||
int64_t tensor_size = 0; | |||
auto input_desc = op_desc_->GetInputDescPtr(i); | |||
GE_CHECK_NOTNULL(input_desc); | |||
Status ret = TensorUtils::GetSize(*input_desc, tensor_size); | |||
if (ret != GRAPH_SUCCESS) { | |||
GELOGE(ret, "Get size from TensorDesc failed, op : %s, input index : %zu", op_desc_->GetName().c_str(), i); | |||
return ret; | |||
} | |||
v_input_size_.push_back(tensor_size); | |||
if (VarManager::Instance(model_->SessionId())->IsVarAddr(input_vector[i])) { | |||
v_input_data_addr_.push_back(static_cast<uint8_t *>(var_base_ + input_vector[i] - logic_var_base_)); | |||
} else { | |||
v_input_data_addr_.push_back(static_cast<uint8_t *>(base_ + input_vector[i])); | |||
} | |||
} | |||
GELOGI("Init output:%lu, %lu, %lu", input_num_, v_input_size_.size(), v_input_data_addr_.size()); | |||
return SUCCESS; | |||
} | |||
/// | |||
/// @ingroup domi | |||
/// @brief Copy Op Output to user space. | |||
/// @brief when model running, Add one DataOp as input node, Add one Output Op as output node. | |||
/// @return Status | |||
/// | |||
Status Output::CopyResult(OutputData &rslt, uint32_t data_begin, uint32_t &data_index, bool support_mem_share) { | |||
uint32_t data_count = 0; | |||
if (input_num_ > rslt.blobs.size() - data_begin) { | |||
GELOGE(FAILED, "Tensor num %zu, data_buf num: %zu.", input_num_, rslt.blobs.size() - data_begin); | |||
return FAILED; | |||
} else if (input_num_ < rslt.blobs.size() - data_begin) { | |||
GELOGW("Tensor num %zu, data_buf num: %zu.", input_num_, rslt.blobs.size() - data_begin); | |||
} | |||
for (size_t i = 0; i < input_num_; i++) { | |||
DataBuffer data_buf = rslt.blobs[data_begin + data_count]; | |||
Status ret = SetDataBuf(data_buf, data_count, i, support_mem_share); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Copy data to host error. index: %zu", i); | |||
return ret; | |||
} | |||
data_index = data_begin + data_count; | |||
} | |||
return SUCCESS; | |||
} | |||
Status Output::SetDataBuf(DataBuffer &data_buf, uint32_t &data_count, size_t i, bool support_mem_share) { | |||
if (data_buf.length == 0) { | |||
++data_count; | |||
GELOGD("Length of data_buffer is zero, No need to copy. output op : %s, output tensor index : %zu!", | |||
op_desc_->GetName().c_str(), i); | |||
return SUCCESS; | |||
} | |||
auto tensor_desc = op_desc_->GetInputDescPtr(static_cast<uint32_t>(i)); | |||
if (tensor_desc == nullptr) { | |||
GELOGE(FAILED, "tensor_desc is null"); | |||
return FAILED; | |||
} | |||
if (data_buf.isDataSupportMemShare && support_mem_share) { | |||
GELOGI("No need to copy input data, user's output data buffer can be shared."); | |||
} else { | |||
// Copy result to Databuf | |||
int64_t size = v_input_size_[i]; | |||
GELOGI("Tensor data size before: %ld", size); | |||
graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(*tensor_desc, size); | |||
if (graph_status != ge::GRAPH_SUCCESS) { | |||
GELOGE(graph_status, "GetTensorSizeInBytes failed!"); | |||
return FAILED; | |||
} | |||
if (data_buf.length < size) { | |||
GELOGE(FAILED, "Tensor data size: %ld data_buf length: %ld", size, data_buf.length); | |||
return FAILED; | |||
} else if (data_buf.length > size) { | |||
GELOGW("Tensor data size: %ld data_buf length: %ld", size, data_buf.length); | |||
} | |||
rtError_t rt_ret = rtMemcpy(data_buf.data, size, v_input_data_addr_[i], size, RT_MEMCPY_DEVICE_TO_HOST); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
GELOGE(rt_ret, "rtmemcpy error"); | |||
return FAILED; | |||
} | |||
GELOGI("Tensor data size: %ld data_buf length: %ld", size, data_buf.length); | |||
} | |||
++data_count; | |||
GELOGD("Successfully copy the output tensor memory to buffer, output op : %s, output tensor index : %zu!", | |||
op_desc_->GetName().c_str(), i); | |||
return SUCCESS; | |||
} | |||
void Output::GetOutputData(vector<void *> &v_data_addr, vector<int64_t> &v_data_size) { | |||
for (size_t i = 0; i < input_num_; ++i) { | |||
v_data_addr.push_back(v_input_data_addr_[i]); | |||
v_data_size.push_back(v_input_size_[i]); | |||
} | |||
} | |||
} // namespace ge |
@@ -0,0 +1,94 @@ | |||
/** | |||
* Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#ifndef GE_GRAPH_LOAD_OUTPUT_OUTPUT_H_ | |||
#define GE_GRAPH_LOAD_OUTPUT_OUTPUT_H_ | |||
#include <string> | |||
#include <vector> | |||
#include "common/debug/log.h" | |||
#include "common/op/attr_value_util.h" | |||
#include "common/op/ge_op_utils.h" | |||
#include "common/types.h" | |||
#include "common/util.h" | |||
#include "common/ge_types.h" | |||
#include "graph/load/new_model_manager/davinci_model.h" | |||
#include "graph/op_desc.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
namespace ge { | |||
using std::string; | |||
using std::vector; | |||
// The base class for all op | |||
class Output { | |||
public: | |||
Output(const OpDescPtr &op_desc, DavinciModel *model); | |||
virtual ~Output(); | |||
/// | |||
/// @ingroup domi | |||
/// @brief Initialize input/output params | |||
/// @return Status | |||
/// | |||
virtual Status Init(); | |||
/// | |||
/// @ingroup domi | |||
/// @brief Copy Op Output to user space. | |||
/// @brief when model running, Add one DataOp as input node, Add one Output Op as output node. | |||
/// @return Status | |||
/// | |||
virtual Status CopyResult(OutputData &rslt, uint32_t data_begin, uint32_t &data_index, bool support_mem_share); | |||
/// | |||
/// @ingroup domi | |||
/// @brief Trans Output data to fp16 | |||
/// @return Status | |||
/// | |||
Status SetDataBuf(DataBuffer &data_buf, uint32_t &data_count, size_t i, bool support_mem_share); | |||
/// | |||
/// @ingroup domi | |||
/// @brief Get Output data and size. | |||
/// @return void | |||
/// | |||
void GetOutputData(vector<void *> &v_data_addr, vector<int64_t> &v_data_size); | |||
// Copy assignment operator and copy constructor are deleted | |||
Output &operator=(const Output &output) = delete; | |||
Output(const Output &output) = delete; | |||
protected: | |||
// Model's base address | |||
uint8_t *base_; | |||
uint8_t *var_base_; | |||
uint64_t logic_base_; | |||
uint64_t logic_var_base_; | |||
// The DavinciModel which ops belong to | |||
DavinciModel *model_; | |||
ConstOpDescPtr op_desc_; | |||
// Input descriptions | |||
size_t input_num_; | |||
vector<void *> v_input_data_addr_; // init as:buf_base + op_def_->input(i)); | |||
vector<int64_t> v_input_size_; | |||
}; | |||
} // namespace ge | |||
#endif // GE_GRAPH_LOAD_OUTPUT_OUTPUT_H_ |
@@ -34,6 +34,9 @@ const size_t bin_ranges[kNumBins] = {kRoundBlockSize * kKByteSize, | |||
26 * kGByteSize}; | |||
static bool BlockComparator(const Block *left, const Block *right) { | |||
if (left->device_id != right->device_id) { | |||
return left->device_id < right->device_id; | |||
} | |||
if (left->size != right->size) { | |||
return left->size < right->size; | |||
} | |||
@@ -264,20 +267,20 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { | |||
return ge::FAILED; | |||
} | |||
} | |||
if (AddToBlockBin(memory_addr, memory_size, device_id) != ge::SUCCESS) { | |||
if (AddToBlockBin(memory_addr, memory_size) != ge::SUCCESS) { | |||
(void)memory_allocator_->FreeMemory(memory_addr); | |||
return ge::FAILED; | |||
} | |||
return ge::SUCCESS; | |||
} | |||
Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) { | |||
Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size) { | |||
BlockBin *bin = GetBlockBin(size); | |||
if (bin == nullptr) { | |||
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); | |||
return ge::FAILED; | |||
} | |||
Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr); | |||
Block *block = new (std::nothrow) Block(0, size, bin, nullptr); | |||
if (block == nullptr) { | |||
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); | |||
return ge::FAILED; | |||
@@ -336,4 +339,5 @@ void CachingAllocator::FreeBlockBins() { | |||
} | |||
} | |||
} | |||
} // namespace ge |
@@ -32,6 +32,7 @@ | |||
#include "runtime/mem.h" | |||
namespace ge { | |||
constexpr size_t kRoundBlockSize = 512; // all block sizes are rounded to at least 512 bytes | |||
constexpr double kSplitThreshold = 0.75; // split when malloc size <= small block size * kSpliThreshold | |||
constexpr size_t kKByteSize = 1024; | |||
@@ -68,10 +69,6 @@ class CachingAllocator { | |||
public: | |||
explicit CachingAllocator(rtMemType_t memory_type); | |||
CachingAllocator(const CachingAllocator &) = delete; | |||
CachingAllocator &operator=(const CachingAllocator &) = delete; | |||
virtual ~CachingAllocator() = default; | |||
/// | |||
@@ -140,10 +137,9 @@ class CachingAllocator { | |||
/// @brief add memory to right bin based on size | |||
/// @param [in] memory ptr | |||
/// @param [in] memory size | |||
/// @param [in] device_id device id | |||
/// @return Status result of function | |||
/// | |||
Status AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id); | |||
Status AddToBlockBin(uint8_t *ptr, size_t size); | |||
/// | |||
/// @ingroup ge_graph | |||
@@ -210,5 +206,7 @@ class CachingAllocator { | |||
// block bins by different block size | |||
BlockBin *free_block_bins_[kNumBins]; | |||
}; | |||
} // namespace ge | |||
}; // namespace ge | |||
#endif // GE_GRAPH_MANAGER_GRAPH_CACHING_ALLOCATOR_H_ |
@@ -57,6 +57,7 @@ | |||
#include "graph/passes/flow_ctrl_pass.h" | |||
#include "graph/passes/hccl_group_pass.h" | |||
#include "graph/passes/hccl_memcpy_pass.h" | |||
#include "graph/passes/identify_reference_pass.h" | |||
#include "graph/passes/identity_pass.h" | |||
#include "graph/passes/iterator_op_pass.h" | |||
#include "graph/passes/link_gen_mask_nodes_pass.h" | |||
@@ -73,9 +74,7 @@ | |||
#include "graph/passes/switch_data_edges_bypass.h" | |||
#include "graph/passes/switch_dead_branch_elimination.h" | |||
#include "graph/passes/switch_logic_remove_pass.h" | |||
#include "graph/passes/merge_to_stream_merge_pass.h" | |||
#include "graph/passes/switch_to_stream_switch_pass.h" | |||
#include "graph/passes/attach_stream_label_pass.h" | |||
#include "graph/passes/switch_op_pass.h" | |||
#include "graph/passes/transop_breadth_fusion_pass.h" | |||
#include "graph/passes/transop_depth_fusion_pass.h" | |||
#include "graph/passes/transop_nearby_allreduce_fusion_pass.h" | |||
@@ -84,7 +83,6 @@ | |||
#include "graph/passes/transpose_transdata_pass.h" | |||
#include "graph/passes/variable_op_pass.h" | |||
#include "graph/passes/variable_prepare_op_pass.h" | |||
#include "graph/passes/ref_identity_delete_op_pass.h" | |||
#include "graph/passes/variable_ref_delete_op_pass.h" | |||
#include "graph/passes/variable_ref_useless_control_out_delete_pass.h" | |||
#include "graph/utils/tensor_adapter.h" | |||
@@ -349,13 +347,12 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr | |||
return SUCCESS; | |||
} | |||
#define GM_RUN_AND_DUMP_PERF(name, func, ...) \ | |||
#define GM_RUN_AND_DUMP(name, func, ...) \ | |||
do { \ | |||
GE_RUN_PERF(GraphManager, func, __VA_ARGS__); \ | |||
GE_RUN(GraphManager, func, __VA_ARGS__); \ | |||
GE_DUMP(compute_graph, "PreRunAfter" name); \ | |||
GELOGI("Run %s on graph %s(%u) success.", name, compute_graph->GetName().c_str(), graph_node->GetGraphId()); \ | |||
} while (0) | |||
Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, | |||
GeRootModelPtr &ge_root_model, uint64_t session_id) { | |||
GE_CHECK_NOTNULL(graph_node); | |||
@@ -368,30 +365,30 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<Ge | |||
compute_graph->GetName().c_str()); | |||
GE_DUMP(compute_graph, "PreRunBegin"); | |||
GM_RUN_AND_DUMP_PERF("OptimizeGraphPrepare", graph_optimize_.OptimizeOriginalGraphForQuantize, compute_graph); | |||
GM_RUN_AND_DUMP_PERF("HandleSummaryOp", graph_optimize_.HandleSummaryOp, compute_graph); | |||
GM_RUN_AND_DUMP_PERF("Prepare", graph_preparer_.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph, | |||
session_id); | |||
GM_RUN_AND_DUMP_PERF("OptimizeOriginalGraph", graph_optimize_.OptimizeOriginalGraph, compute_graph); | |||
GM_RUN_AND_DUMP("OptimizeGraphPrepare", graph_optimize_.OptimizeOriginalGraphForQuantize, compute_graph); | |||
GM_RUN_AND_DUMP("HandleSummaryOp", graph_optimize_.HandleSummaryOp, compute_graph); | |||
GM_RUN_AND_DUMP("Prepare", graph_preparer_.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph, | |||
session_id); | |||
GM_RUN_AND_DUMP("OptimizeOriginalGraph", graph_optimize_.OptimizeOriginalGraph, compute_graph); | |||
GM_RUN_AND_DUMP_PERF("PrepareRunningFormatRefiner", graph_preparer_.PrepareRunningFormatRefiner); | |||
GM_RUN_AND_DUMP_PERF("RefineRunningFormat", graph_optimize_.OptimizeOriginalGraphJudgeInsert, compute_graph); | |||
GM_RUN_AND_DUMP("PrepareRunningFormatRefiner", graph_preparer_.PrepareRunningFormatRefiner); | |||
GM_RUN_AND_DUMP("RefineRunningFormat", graph_optimize_.OptimizeOriginalGraphJudgeInsert, compute_graph); | |||
GE_RUN(GraphManager, graph_preparer_.RecordAIPPInfo, compute_graph); | |||
if (IsTailingOptimization()) { | |||
GM_RUN_AND_DUMP_PERF("OptimizeSwitchOp", graph_preparer_.SwitchOpOptimize, compute_graph); | |||
GM_RUN_AND_DUMP("OptimizeSwitchOp", graph_preparer_.SwitchOpOptimize, compute_graph); | |||
} | |||
GM_RUN_AND_DUMP_PERF("Optimize1", OptimizeStage1, compute_graph); | |||
GM_RUN_AND_DUMP_PERF("InferShape2", compute_graph->InferShapeInNeed); | |||
GM_RUN_AND_DUMP("Optimize1", OptimizeStage1, compute_graph); | |||
GM_RUN_AND_DUMP("InferShape2", compute_graph->InferShapeInNeed); | |||
const char *unknown_shape_skip = std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION"); | |||
if (unknown_shape_skip != nullptr) { | |||
PassManager graph_pass; | |||
GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) | |||
GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); | |||
} | |||
GE_CHK_STATUS_RET(graph_optimize_.IdentifyReference(compute_graph), "Identify reference failed."); | |||
GM_RUN_AND_DUMP_PERF("OptimizeSubgraph", OptimizeSubgraph, graph_node, compute_graph, session_id); | |||
GM_RUN_AND_DUMP_PERF("Optimize2", OptimizeStage2, compute_graph); | |||
GM_RUN_AND_DUMP_PERF("Build", Build, graph_node, compute_graph, ge_root_model, session_id); | |||
GM_RUN_AND_DUMP("OptimizeSubgraph", OptimizeSubgraph, graph_node, compute_graph, session_id); | |||
GM_RUN_AND_DUMP("Optimize2", OptimizeStage2, compute_graph); | |||
GM_RUN_AND_DUMP("Build", Build, graph_node, compute_graph, ge_root_model, session_id); | |||
// when set incre build, save om model and var manager | |||
GeModelPtr ge_model = nullptr; | |||
@@ -400,7 +397,7 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<Ge | |||
GELOGW("Fail to save cache."); | |||
} | |||
// release rts generate context | |||
RtContextUtil::GetInstance().DestroyRtContexts(session_id); | |||
RtContextUtil::GetInstance().DestroyrtContexts(); | |||
GEEVENT("[GEPERFTRACE] GE PreRun End"); | |||
return SUCCESS; | |||
} | |||
@@ -474,7 +471,7 @@ Status GraphManager::LoadGraph(const GeRootModelPtr &ge_root_model, const GraphN | |||
} | |||
GE_TIMESTAMP_START(LoadGraph); | |||
Status ret = GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, model_listener); | |||
GE_TIMESTAMP_EVENT_END(LoadGraph, "GraphManager::LoadGraph"); | |||
GE_TIMESTAMP_END(LoadGraph, "GraphManager::LoadGraph"); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "[StartForRunGraph] LoadGraph Failed"); | |||
graph_node->SetRunFlag(false); | |||
@@ -637,7 +634,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vector<GeTenso | |||
graph_optimize_.TranFrameOp(compute_graph_tmp); | |||
} | |||
GeRootModelPtr ge_root_model = nullptr; | |||
GeRootModelPtr ge_root_model; | |||
ret = StartForRunGraph(graph_node, inputs, ge_root_model, session_id); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "[RunGraph] StartForRunGraph failed!"); | |||
@@ -1616,6 +1613,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { | |||
SwitchDeadBranchElimination switch_dead_branch_elimination; | |||
SwitchLogicRemovePass switch_logic_remove_pass; | |||
MergePass merge_pass; | |||
IdentifyReferencePass identify_reference_pass; | |||
CastRemovePass cast_remove_pass; | |||
TransposeTransDataPass transpose_transdata_pass; | |||
TransOpSymmetryEliminationPass symmetry_elimination_pass; | |||
@@ -1624,6 +1622,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { | |||
names_to_passes.emplace_back("SwitchDeadBranchElimination", &switch_dead_branch_elimination); | |||
names_to_passes.emplace_back("SwitchLogicRemovePass", &switch_logic_remove_pass); | |||
names_to_passes.emplace_back("MergePass", &merge_pass); | |||
names_to_passes.emplace_back("IdentifyReferencePass", &identify_reference_pass); | |||
names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass); | |||
names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass); | |||
names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass); | |||
@@ -1639,32 +1638,14 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { | |||
GELOGE(ret, "Run passes when OptimizeStage1_2 failed, ret:%u.", ret); | |||
return ret; | |||
} | |||
// Calculate Op/Fe constantfolding cost | |||
uint64_t op_constant_folding_cost = 0; | |||
for (auto &it : constant_folding_pass.GetOpConstantFoldingPerfStatistic()) { | |||
op_constant_folding_cost += it.second.second; | |||
GELOGI("The time cost of %s constant folding is [%lu] micro second, calls is %lu.", it.first.c_str(), | |||
it.second.second, it.second.first); | |||
} | |||
GEEVENT("[GEPERFTRACE] The time cost of extern constant folding is [%lu] micro second.", op_constant_folding_cost); | |||
for (auto &it : constant_folding_pass.GetGeConstantFoldingPerfStatistic()) { | |||
op_constant_folding_cost += it.second.second; | |||
GELOGI("The time cost of %s constant folding is [%lu] micro second, calls is %lu.", it.first.c_str(), | |||
it.second.second, it.second.first); | |||
} | |||
GraphUtils::DumpGEGraphToOnnx(*compute_graph, "OptimizeStage1_2"); | |||
PassManager graph_pass; | |||
// the prune pass should between SwitchPass and SwitchToStreamSwitchPass | |||
// the prune pass should between SwtichPass and SwitchOpPass | |||
GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::PrunePass", new (std::nothrow) PrunePass)) | |||
GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::NextIterationPass", new (std::nothrow) NextIterationPass)) | |||
GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::ControlTriggerPass", new (std::nothrow) ControlTriggerPass)) | |||
GE_CHK_STATUS_RET( | |||
graph_pass.AddPass("OptimizeStage1_3::MergeToStreamMergePass", new (std::nothrow) MergeToStreamMergePass)) | |||
GE_CHK_STATUS_RET( | |||
graph_pass.AddPass("OptimizeStage1_3::SwitchToStreamSwitchPass", new (std::nothrow) SwitchToStreamSwitchPass)) | |||
GE_CHK_STATUS_RET( | |||
graph_pass.AddPass("OptimizeStage1_3::AttachStreamLabelPass", new (std::nothrow) AttachStreamLabelPass)) | |||
GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::SwitchOpPass", new (std::nothrow) SwitchOpPass)) | |||
GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::IteratorOpPass", new (std::nothrow) IteratorOpPass)) | |||
GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::VariableRefUselessControlOutDeletePass", | |||
new (std::nothrow) VariableRefUselessControlOutDeletePass)) | |||
@@ -1679,7 +1660,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { | |||
NamesToPass identity_remove_pass; | |||
GE_TIMESTAMP_START(identity_remove_pass); | |||
IdentityPass identity_force_pass(true); // after SwitchToStreamSwitchPass | |||
IdentityPass identity_force_pass(true); // after SwitchOpPass | |||
identity_remove_pass.emplace_back("IdentityPass", &identity_force_pass); | |||
ret = GEPass(compute_graph).Run(identity_remove_pass); | |||
GE_TIMESTAMP_END(identity_remove_pass, "GraphPrepare::IdentityRemovePass"); | |||
@@ -1739,8 +1720,6 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) { | |||
GE_CHK_STATUS_RET(pass_for_control_attr_optimize.AddPass("OptimizeStage2::ControlAttrOptimize::MultiBatchPass", | |||
new (std::nothrow) MultiBatchPass)) | |||
GE_CHK_STATUS_RET(pass_for_control_attr_optimize.AddPass("OptimizeStage2::AfterMergePasses::RefIdentityDeleteOpPass", | |||
new (std::nothrow) RefIdentityDeleteOpPass)) | |||
// the value of the attr is the original variable name the ref-variable ref from. | |||
// The attr will be used when allocating memory, | |||
// the node marked attr will be output to a variable instead of new-allocated memory. | |||
@@ -1798,6 +1777,8 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra | |||
GEPass ge_passes_for_shape(compute_graph); | |||
NamesToPass names_to_passes_for_shape; | |||
IdentifyReferencePass identify_reference_pass; | |||
names_to_passes_for_shape.emplace_back("IdentifyReferencePass", &identify_reference_pass); | |||
CastRemovePass cast_remove_pass; | |||
names_to_passes_for_shape.emplace_back("CastRemovePass", &cast_remove_pass); | |||
TransposeTransDataPass transpose_transdata_pass; | |||
@@ -1885,10 +1866,7 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra | |||
GE_CHK_STATUS_RET(ret, "Remove isolated Constant failed, ret:%d.", ret); | |||
PassManager pass_for_optimize; | |||
const char *unknown_shape_skip = std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION"); | |||
if (unknown_shape_skip == nullptr) { | |||
GE_CHK_STATUS_RET(pass_for_optimize.AddPass("SubgraphPass", new (std::nothrow) SubgraphPass)); | |||
} | |||
GE_CHK_STATUS_RET(pass_for_optimize.AddPass("SubgraphPass", new (std::nothrow) SubgraphPass)); | |||
GE_CHK_STATUS_RET(pass_for_optimize.AddPass("MultiBatchPass", new (std::nothrow) MultiBatchPass)); | |||
GE_CHK_STATUS_RET(pass_for_optimize.AddPass("CompileNodesPass", new (std::nothrow) CompileNodesPass)); | |||
GE_TIMESTAMP_START(pass_for_optimize); | |||
@@ -1928,7 +1906,7 @@ Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const G | |||
GE_CHECK_NOTNULL(graph_node->graph_run_async_listener_); | |||
Status ret = | |||
GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, graph_node->graph_run_async_listener_); | |||
GE_TIMESTAMP_EVENT_END(LoadGraph, "GraphManager::LoadGraphAsync"); | |||
GE_TIMESTAMP_END(LoadGraph, "GraphManager::LoadGraphAsync"); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "[LoadGraphAsync] LoadGraphAsync Failed"); | |||
graph_node->SetRunFlag(false); | |||
@@ -2331,21 +2309,21 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra | |||
GELOGE(FAILED, "failed get dynamic shape partitioned flag on partitioned graph."); | |||
return FAILED; | |||
} | |||
GE_TIMESTAMP_EVENT_END(GraphPartitionDynamicShape, "OptimizeSubgraph::GraphPartitionDynamicShape"); | |||
GE_TIMESTAMP_END(GraphPartitionDynamicShape, "OptimizeSubgraph::GraphPartitionDynamicShape"); | |||
GE_TIMESTAMP_START(GraphPartition); | |||
ret = graph_partitioner_.Partition(compute_graph, GraphPartitioner::kPartitioning); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Graph partition Failed"); | |||
return ret; | |||
} | |||
GE_TIMESTAMP_EVENT_END(GraphPartition, "OptimizeSubgraph::Partition1"); | |||
GE_TIMESTAMP_END(GraphPartition, "OptimizeSubgraph::Partition1"); | |||
GE_TIMESTAMP_START(SetSubgraph); | |||
ret = SetSubgraph(session_id, compute_graph); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Graph set subgraph Failed"); | |||
return ret; | |||
} | |||
GE_TIMESTAMP_EVENT_END(SetSubgraph, "OptimizeSubgraph::SetSubGraph"); | |||
GE_TIMESTAMP_END(SetSubgraph, "OptimizeSubgraph::SetSubGraph"); | |||
ComputeGraphPtr merged_compute_graph = nullptr; | |||
std::vector<ComputeGraphPtr> merged_sub_graph_list; | |||
@@ -2364,7 +2342,7 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra | |||
sub_graph->SetSessionID(session_id); | |||
sub_graph->SetGraphID(graph_node->GetGraphId()); | |||
} | |||
GE_TIMESTAMP_EVENT_END(MergeSubgraph, "OptimizeSubgraph::MergeSubGraph"); | |||
GE_TIMESTAMP_END(MergeSubgraph, "OptimizeSubgraph::MergeSubGraph"); | |||
GE_DUMP(merged_compute_graph, "mergedComputeGraph"); | |||
compute_graph = merged_compute_graph; | |||
if (!AttrUtils::SetBool(*compute_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, dynamic_shape_partitioned)) { | |||
@@ -2390,7 +2368,8 @@ Status GraphManager::Build(const GraphNodePtr &graph_node, ComputeGraphPtr &comp | |||
} | |||
bool is_always_dump = false; | |||
if (!PropertiesManager::Instance().GetDumpProperties(session_id).GetDumpPath().empty()) { | |||
PropertiesManager &properties_manager = PropertiesManager::Instance(); | |||
if (!properties_manager.GetDumpOutputPath().empty()) { | |||
is_always_dump = true; | |||
} | |||
@@ -327,6 +327,6 @@ class GraphManager { | |||
std::mutex run_mutex_; | |||
}; | |||
} // namespace ge | |||
}; // namespace ge | |||
#endif // GE_GRAPH_MANAGER_GRAPH_MANAGER_H_ |
@@ -190,6 +190,6 @@ class MemManager { | |||
std::map<rtMemType_t, CachingAllocator *> caching_allocator_map_; | |||
std::recursive_mutex allocator_mutex_; | |||
}; | |||
} // namespace ge | |||
}; // namespace ge | |||
#endif // GE_GRAPH_MANAGER_GRAPH_MEM_ALLOCATOR_H_ |
@@ -91,7 +91,7 @@ ge::Status VarResource::SaveVarAddr(const std::string &var_name, const ge::GeTen | |||
std::string var_key = VarKey(var_name, tensor_desc); | |||
GELOGD("VarResource::SaveVarAddr, var_key = %s", var_key.c_str()); | |||
if (var_addr_mgr_map_.count(var_key) == 0) { | |||
uint64_t logic_address = VarManager::Instance(session_id_)->GetVarMemLogicBase() + | |||
uint64_t logic_address = VarManager::Instance(0)->GetVarMemLogicBase() + | |||
reinterpret_cast<uint64_t>(reinterpret_cast<std::uintptr_t>(address)); | |||
GELOGI("SaveVarAddr node_name %s, tensor_desc format %s, type %s.", var_name.c_str(), | |||
TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), | |||
@@ -274,7 +274,7 @@ MemResource::MemResource() : total_size_(0), var_mem_size_(0) {} | |||
Status MemResource::AssignVarMem(const std::string &var_name, uint64_t size, uint64_t session_id, size_t &mem_offset) { | |||
size = (size + kSessionMemAlignSize - 1) / kSessionMemAlignSize * kSessionMemAlignSize; | |||
uint64_t real_size = size; | |||
total_size_ = VarManager::Instance(session_id)->GetVarMemMaxSize(); | |||
total_size_ = VarManager::Instance(0)->GetVarMemMaxSize(); | |||
if (total_size_ < var_mem_size_) { | |||
GELOGE(PARAM_INVALID, "total_size_: %lu is smaller than var_mem_size_: %lu", total_size_, var_mem_size_); | |||
return PARAM_INVALID; | |||
@@ -684,8 +684,7 @@ uint8_t *VarManager::GetVarMemoryAddr(uint8_t *logic_addr, rtMemType_t memory_ty | |||
if (mem_base == nullptr) { | |||
return nullptr; | |||
} | |||
uint8_t *mem_addr = | |||
logic_addr + reinterpret_cast<intptr_t>(mem_base) - VarManager::Instance(session_id_)->GetVarMemLogicBase(); | |||
uint8_t *mem_addr = logic_addr + reinterpret_cast<intptr_t>(mem_base) - VarManager::Instance(0)->GetVarMemLogicBase(); | |||
return mem_addr; | |||
} | |||
@@ -309,5 +309,5 @@ class VarManagerPool { | |||
std::mutex var_manager_mutex_; | |||
map<uint64_t, VarManager *> var_manager_map_; | |||
}; | |||
} // namespace ge | |||
}; // namespace ge | |||
#endif // GE_GRAPH_MANAGER_GRAPH_VAR_MANAGER_H_ |
@@ -92,6 +92,6 @@ class EventManager { | |||
std::vector<rtEvent_t> event_list_; | |||
bool inited_; | |||
uint32_t current_idx_; | |||
}; // EventManager | |||
} // namespace ge | |||
}; // EventManager | |||
}; // namespace ge | |||
#endif // GE_GRAPH_MANAGER_MODEL_MANAGER_EVENT_MANAGER_H_ |
@@ -397,11 +397,10 @@ Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeT | |||
uint8_t *src_addr = nullptr; | |||
GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr)); | |||
uint8_t *mem_addr = | |||
src_addr - | |||
static_cast<int64_t>(reinterpret_cast<uintptr_t>(VarManager::Instance(session_id)->GetVarMemLogicBase())) + | |||
static_cast<int64_t>( | |||
reinterpret_cast<uintptr_t>(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); | |||
uint8_t *mem_addr = src_addr - | |||
static_cast<int64_t>(reinterpret_cast<uintptr_t>(VarManager::Instance(0)->GetVarMemLogicBase())) + | |||
static_cast<int64_t>( | |||
reinterpret_cast<uintptr_t>(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); | |||
GE_CHK_RT_RET(rtMallocHost(reinterpret_cast<void **>(host_addr), src_tensor_size)); | |||
GE_CHK_RT_RET(rtMemcpy(*host_addr, src_tensor_size, mem_addr, src_tensor_size, RT_MEMCPY_DEVICE_TO_HOST)); | |||
@@ -414,11 +413,10 @@ Status TransVarDataUtils::SyncTensorToDevice(const string &var_name, const uint8 | |||
const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { | |||
uint8_t *dst_addr = nullptr; | |||
GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, dst_tensor_desc, &dst_addr)); | |||
uint8_t *mem_addr = | |||
dst_addr - | |||
static_cast<int64_t>(reinterpret_cast<uintptr_t>(VarManager::Instance(session_id)->GetVarMemLogicBase())) + | |||
static_cast<int64_t>( | |||
reinterpret_cast<uintptr_t>(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); | |||
uint8_t *mem_addr = dst_addr - | |||
static_cast<int64_t>(reinterpret_cast<uintptr_t>(VarManager::Instance(0)->GetVarMemLogicBase())) + | |||
static_cast<int64_t>( | |||
reinterpret_cast<uintptr_t>(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); | |||
GE_CHK_RT_RET(rtMemcpy(mem_addr, addr_size, host_addr, addr_size, RT_MEMCPY_HOST_TO_DEVICE)); | |||
GELOGI("SyncTensorToDevice var_name %s, addr_size %u", var_name.c_str(), addr_size); | |||
@@ -24,6 +24,7 @@ | |||
#include "graph/utils/type_utils.h" | |||
namespace ge { | |||
Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) { | |||
GE_CHECK_NOTNULL(op_desc); | |||
@@ -100,12 +101,6 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType | |||
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), | |||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||
// dynamic shape hccl op get size from output tensor desc | |||
if (op_desc->HasAttr(ATTR_NAME_IS_UNKNOWN_SHAPE)) { | |||
GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), input_size), | |||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||
} | |||
GE_IF_BOOL_EXEC( | |||
op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; | |||
@@ -119,8 +114,6 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType | |||
total_size = total_size + block_size; continue;); | |||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | |||
GELOGD("hcom util node %s inputsize %ld, shapesize %ld, datasize %d.", op_desc->GetName().c_str(), input_size, | |||
shape_size, size); | |||
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), | |||
"Product of shape size and size beyond INT64_MAX"); | |||
GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;); | |||
@@ -144,6 +144,8 @@ class HcomOmeUtil { | |||
/// | |||
static Status GetHorovodInputs(const ge::ConstOpDescPtr &op_desc, | |||
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos); | |||
private: | |||
/// | |||
/// @ingroup domi_ome | |||
/// @brief GetHcomCount | |||
@@ -152,8 +154,6 @@ class HcomOmeUtil { | |||
/// | |||
static Status GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather, | |||
int &count); | |||
private: | |||
/// | |||
/// @ingroup domi_ome | |||
/// @brief GetHorovodCount | |||
@@ -19,30 +19,13 @@ | |||
#include "framework/common/debug/ge_log.h" | |||
namespace ge { | |||
void RtContextUtil::AddRtContext(uint64_t session_id, rtContext_t context) { | |||
std::lock_guard<std::mutex> lock(ctx_mutex_); | |||
rt_contexts_[session_id].emplace_back(context); | |||
} | |||
void RtContextUtil::DestroyRtContexts(uint64_t session_id) { | |||
std::lock_guard<std::mutex> lock(ctx_mutex_); | |||
auto &contexts = rt_contexts_[session_id]; | |||
DestroyRtContexts(session_id, contexts); | |||
} | |||
void RtContextUtil::DestroyAllRtContexts() { | |||
std::lock_guard<std::mutex> lock(ctx_mutex_); | |||
for (auto &ctx_pair : rt_contexts_) { | |||
DestroyRtContexts(ctx_pair.first, ctx_pair.second); | |||
} | |||
rt_contexts_.clear(); | |||
} | |||
void RtContextUtil::AddrtContext(rtContext_t context) { rtContexts_.emplace_back(context); } | |||
void RtContextUtil::DestroyRtContexts(uint64_t session_id, std::vector<rtContext_t> &contexts) { | |||
GELOGI("Runtime context handle number of session %lu is %zu.", session_id, contexts.size()); | |||
for (auto &rtContext : contexts) { | |||
void RtContextUtil::DestroyrtContexts() { | |||
GELOGI("The size of runtime context handle is %zu.", rtContexts_.size()); | |||
for (auto &rtContext : rtContexts_) { | |||
(void)rtCtxDestroy(rtContext); | |||
} | |||
contexts.clear(); | |||
rtContexts_.clear(); | |||
} | |||
} // namespace ge |
@@ -18,8 +18,6 @@ | |||
#define GE_GRAPH_MANAGER_UTIL_RT_CONTEXT_UTIL_H_ | |||
#include <vector> | |||
#include <map> | |||
#include <mutex> | |||
#include "runtime/context.h" | |||
@@ -31,14 +29,13 @@ class RtContextUtil { | |||
return instance; | |||
} | |||
void AddRtContext(uint64_t session_id, rtContext_t context); | |||
void AddrtContext(rtContext_t context); | |||
const rtContext_t GetNormalModeContext() const { return before_prerun_ctx_; } | |||
void SetNormalModeContext(rtContext_t context) { before_prerun_ctx_ = context; } | |||
void DestroyRtContexts(uint64_t session_id); | |||
void DestroyAllRtContexts(); | |||
void DestroyrtContexts(); | |||
RtContextUtil &operator=(const RtContextUtil &) = delete; | |||
RtContextUtil(const RtContextUtil &RtContextUtil) = delete; | |||
@@ -47,12 +44,8 @@ class RtContextUtil { | |||
RtContextUtil() = default; | |||
~RtContextUtil() {} | |||
void DestroyRtContexts(uint64_t session_id, std::vector<rtContext_t> &contexts); | |||
std::map<uint64_t, std::vector<rtContext_t>> rt_contexts_; | |||
std::vector<rtContext_t> rtContexts_; | |||
rtContext_t before_prerun_ctx_ = nullptr; | |||
std::mutex ctx_mutex_; | |||
}; | |||
} // namespace ge | |||
@@ -299,36 +299,4 @@ void GraphOptimize::TranFrameOp(ComputeGraphPtr &compute_graph) { | |||
} | |||
} | |||
} | |||
Status GraphOptimize::IdentifyReference(ComputeGraphPtr &compute_graph) { | |||
for (auto &node : compute_graph->GetAllNodes()) { | |||
GE_CHECK_NOTNULL(node); | |||
auto op_desc = node->GetOpDesc(); | |||
GE_CHECK_NOTNULL(op_desc); | |||
auto input_name_index = op_desc->GetAllInputName(); | |||
bool is_ref = false; | |||
for (const auto &name_index : input_name_index) { | |||
const int out_index = op_desc->GetOutputIndexByName(name_index.first); | |||
if (out_index != -1) { | |||
auto input_desc = op_desc->GetInputDesc(name_index.second); | |||
input_desc.SetRefPortByIndex({name_index.second}); | |||
op_desc->UpdateInputDesc(name_index.second, input_desc); | |||
GELOGI("SetRefPort: set op[%s] input desc[%u-%s] ref.", op_desc->GetName().c_str(), name_index.second, | |||
name_index.first.c_str()); | |||
auto output_desc = op_desc->GetOutputDesc(static_cast<uint32_t>(out_index)); | |||
output_desc.SetRefPortByIndex({name_index.second}); | |||
op_desc->UpdateOutputDesc(static_cast<uint32_t>(out_index), output_desc); | |||
GELOGI("SetRefPort: set op[%s] output desc[%u-%s] ref.", op_desc->GetName().c_str(), out_index, | |||
name_index.first.c_str()); | |||
is_ref = true; | |||
} | |||
} | |||
if (is_ref) { | |||
AttrUtils::SetBool(op_desc, ATTR_NAME_REFERENCE, is_ref); | |||
GELOGI("param [node] %s is reference node, set attribute %s to be true.", node->GetName().c_str(), | |||
ATTR_NAME_REFERENCE.c_str()); | |||
} | |||
} | |||
return SUCCESS; | |||
} | |||
} // namespace ge |
@@ -67,9 +67,6 @@ class GraphOptimize { | |||
// handle summary node before preRun graph | |||
Status HandleSummaryOp(ComputeGraphPtr &compute_graph); | |||
// Identify reference node before optimize subgraph | |||
Status IdentifyReference(ComputeGraphPtr &compute_graph); | |||
void TranFrameOp(ComputeGraphPtr &compute_graph); | |||
private: | |||
@@ -88,5 +85,5 @@ class GraphOptimize { | |||
std::map<uint32_t, std::map<string, size_t>> summary_output_indexes_ = {}; | |||
std::string func_bin_path_; | |||
}; | |||
} // namespace ge | |||
}; // namespace ge | |||
#endif // GE_GRAPH_OPTIMIZE_GRAPH_OPTIMIZE_H_ |
@@ -80,8 +80,7 @@ Status GraphOptimize::HandleSummaryOp(ComputeGraphPtr &compute_graph) { | |||
del_nodes.emplace_back(node_ptr); | |||
} | |||
} | |||
GE_IF_BOOL_EXEC(!summary_output_indexes.empty(), | |||
summary_output_indexes_.insert({compute_graph->GetGraphID(), summary_output_indexes})); | |||
summary_output_indexes_.insert({compute_graph->GetGraphID(), summary_output_indexes}); | |||
// add output nodes for summary | |||
std::vector<std::pair<NodePtr, int32_t>> out_nodes_info; | |||
@@ -62,16 +62,15 @@ Status DynamicShapePartitioner::Partition() { | |||
} | |||
GELOGD("Start dynamic shape partition graph %s.", root_graph_->GetName().c_str()); | |||
REQUIRE_SUCCESS(MarkUnknownShapeNodes(), "Failed mark unknown shape nodes, root grah name:%s.", | |||
root_graph_->GetName().c_str()); | |||
REQUIRE_SUCCESS(MarkUnknownShapeNodes(), "Failed mark unknown shape nodes."); | |||
if (unknown_shape_nodes_.empty()) { | |||
GELOGD("Skip dynamic shape partition of graph %s as all nodes are known shape.", root_graph_->GetName().c_str()); | |||
REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, false), | |||
"Failed set dynamic shape partitioned flag on root graph %s.", root_graph_->GetName().c_str()); | |||
"Failed set dynamic shape partitioned flag on root graph."); | |||
return SUCCESS; | |||
} | |||
REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, true), | |||
"Failed set dynamic shape partitioned flag on root graph %s.", root_graph_->GetName().c_str()); | |||
"Failed set dynamic shape partitioned flag on root graph."); | |||
DumpGraph("_Before_DSP"); | |||
auto status = PartitionImpl(); | |||
@@ -108,21 +107,21 @@ void DynamicShapePartitioner::PruneUniqueClusters() { | |||
} | |||
Status DynamicShapePartitioner::BuildPartitionFrame() { | |||
for (const auto &cluster : unique_clusters_) { | |||
for (auto cluster : unique_clusters_) { | |||
REQUIRE_SUCCESS(cluster->BuildFrame(), "Failed build frame of cluster[%lu].", cluster->Id()); | |||
} | |||
return SUCCESS; | |||
} | |||
Status DynamicShapePartitioner::CombinePartitionFrame() { | |||
for (const auto &cluster : unique_clusters_) { | |||
for (auto cluster : unique_clusters_) { | |||
REQUIRE_SUCCESS(cluster->CombinePartitionFrame(), "Failed combine frame of cluster[%lu].", cluster->Id()); | |||
} | |||
return SUCCESS; | |||
} | |||
Status DynamicShapePartitioner::BuildPartitionSubgraph() { | |||
for (const auto &cluster : unique_clusters_) { | |||
for (auto cluster : unique_clusters_) { | |||
REQUIRE_SUCCESS(cluster->BuildPartitionSubgraph(), "Failed build subgraph of cluster[%lu].", cluster->Id()); | |||
} | |||
return SUCCESS; | |||
@@ -135,10 +134,10 @@ std::string DynamicShapePartitioner::DebugString() const { | |||
size_t netoutput = 0; | |||
std::stringstream ss; | |||
ss << "All unknown shape nodes:" << std::endl; | |||
for (const auto &node : unknown_shape_nodes_) { | |||
for (auto node : unknown_shape_nodes_) { | |||
ss << " [" << node->GetName() << "](" << node->GetType() << ")" << std::endl; | |||
} | |||
for (const auto &cluster : unique_clusters_) { | |||
for (auto cluster : unique_clusters_) { | |||
if (cluster->IsUnknownShape()) { | |||
unknown++; | |||
} else if (cluster->IsKnownShape()) { | |||
@@ -151,7 +150,7 @@ std::string DynamicShapePartitioner::DebugString() const { | |||
} | |||
ss << "All clusters:" << unique_clusters_.size() << ", data:" << data << ", known:" << known | |||
<< ", unknown:" << unknown << ", netoutput:" << netoutput << std::endl; | |||
for (const auto &cluster : unique_clusters_) { | |||
for (auto cluster : unique_clusters_) { | |||
ss << " " << cluster->DebugString() << std::endl; | |||
} | |||
return ss.str(); | |||
@@ -159,13 +158,13 @@ std::string DynamicShapePartitioner::DebugString() const { | |||
void DynamicShapePartitioner::DumpGraph(const std::string &suffix) { | |||
GraphUtils::DumpGEGraphToOnnx(*root_graph_, root_graph_->GetName() + suffix); | |||
for (const auto &sub_graph : root_graph_->GetAllSubgraphs()) { | |||
for (auto sub_graph : root_graph_->GetAllSubgraphs()) { | |||
GraphUtils::DumpGEGraphToOnnx(*sub_graph, sub_graph->GetName() + suffix); | |||
} | |||
} | |||
void DynamicShapePartitioner::ClearResource() { | |||
for (const auto &cluster : unique_clusters_) { | |||
for (auto cluster : unique_clusters_) { | |||
cluster->Clear(); | |||
} | |||
node_2_cluster_.clear(); | |||
@@ -176,7 +175,8 @@ void DynamicShapePartitioner::ClearResource() { | |||
} | |||
Status DynamicShapePartitioner::MarkUnknownShapeNodes() { | |||
for (auto &node : root_graph_->GetDirectNode()) { | |||
auto graph = root_graph_; | |||
for (auto &node : graph->GetDirectNode()) { | |||
REQUIRE_SUCCESS(CollectSpreadUnknownShapeNodes(node), "Failed collect spread unknown shape nodes %s.", | |||
node->GetName().c_str()); | |||
} | |||
@@ -186,7 +186,7 @@ Status DynamicShapePartitioner::MarkUnknownShapeNodes() { | |||
Status DynamicShapePartitioner::InitClusters() { | |||
auto graph = root_graph_; | |||
size_t rank = 0; | |||
for (const auto &node : graph->GetDirectNode()) { | |||
for (const auto node : graph->GetDirectNode()) { | |||
Cluster::Type type = Cluster::DATA; | |||
if (node->GetType() == DATA) { | |||
type = Cluster::DATA; | |||
@@ -208,7 +208,7 @@ Status DynamicShapePartitioner::InitClusters() { | |||
cluster->AddInput(node_2_cluster_[parent]); | |||
} | |||
} | |||
for (const auto &node : graph->GetDirectNode()) { | |||
for (const auto node : graph->GetDirectNode()) { | |||
GELOGD("Make cluster for node %s : %s.", node->GetName().c_str(), node_2_cluster_[node]->DebugString().c_str()); | |||
} | |||
return SUCCESS; | |||
@@ -220,8 +220,8 @@ Status DynamicShapePartitioner::TopologicalSortClusters() { | |||
std::queue<ClusterPtr> ready_clusters; | |||
std::unordered_map<ClusterPtr, size_t> cluster_pending_count; | |||
std::unordered_set<ClusterPtr> seen_clusters; | |||
for (auto &iter : node_2_cluster_) { | |||
auto cluster = iter.second; | |||
for (auto iter = node_2_cluster_.begin(); iter != node_2_cluster_.end(); iter++) { | |||
auto cluster = iter->second; | |||
if (seen_clusters.count(cluster) != 0) { | |||
continue; | |||
} | |||
@@ -242,7 +242,7 @@ Status DynamicShapePartitioner::TopologicalSortClusters() { | |||
if (cluster->IsKnownShape()) { | |||
ordered_cluster_.push_back(cluster); | |||
} | |||
for (const auto &out_cluster : cluster->Outputs()) { | |||
for (auto out_cluster : cluster->Outputs()) { | |||
if (cluster_pending_count[out_cluster] > 0 && --cluster_pending_count[out_cluster] == 0) { | |||
ready_clusters.push(out_cluster); | |||
} | |||
@@ -273,16 +273,16 @@ static std::string ToString(const std::vector<ClusterPtr> &clusters) { | |||
Status DynamicShapePartitioner::MergeClusters() { | |||
// Merge unknown shape clusters | |||
for (const auto &cluster : ordered_cluster_) { | |||
for (const auto &in_cluster : cluster->Inputs()) { | |||
for (auto cluster : ordered_cluster_) { | |||
for (auto in_cluster : cluster->Inputs()) { | |||
if (!in_cluster->IsUnknownShape()) { | |||
continue; | |||
} | |||
auto merged_clusters = cluster->MergeAllPathFrom(in_cluster); | |||
GELOGD("Merge all path cluster from %lu to %lu %s.", in_cluster->Id(), cluster->Id(), | |||
ToString(merged_clusters).c_str()); | |||
for (const auto &merged_cluster : merged_clusters) { | |||
for (const auto &node : merged_cluster->Nodes()) { | |||
for (auto merged_cluster : merged_clusters) { | |||
for (auto node : merged_cluster->Nodes()) { | |||
node_2_cluster_[node] = cluster; | |||
} | |||
} | |||
@@ -291,7 +291,7 @@ Status DynamicShapePartitioner::MergeClusters() { | |||
REQUIRE_SUCCESS(TopologicalSortClusters(), "Failed topological sort clusters after merge unknown shape clusters."); | |||
// Merge known shape clusters | |||
for (const auto &cluster : ordered_cluster_) { | |||
for (auto cluster : ordered_cluster_) { | |||
if (cluster->IsRefVariable() && cluster->Inputs().size() == 1) { | |||
auto in_cluster = *(cluster->Inputs().begin()); | |||
in_cluster->Merge(cluster); | |||
@@ -299,13 +299,13 @@ Status DynamicShapePartitioner::MergeClusters() { | |||
continue; | |||
} | |||
for (const auto &in_cluster : cluster->Inputs()) { | |||
for (auto in_cluster : cluster->Inputs()) { | |||
if (!in_cluster->IsKnownShape()) { | |||
continue; | |||
} | |||
if (cluster->TryMerge(in_cluster)) { | |||
GELOGD("Success merge known shape cluster from %lu to %lu.", in_cluster->Id(), cluster->Id()); | |||
for (const auto &node : in_cluster->Nodes()) { | |||
for (auto node : in_cluster->Nodes()) { | |||
node_2_cluster_[node] = cluster; | |||
} | |||
} | |||
@@ -333,7 +333,7 @@ Status DynamicShapePartitioner::CollectSpreadUnknownShapeNodes(NodePtr node) { | |||
if (IsUnknownShapeTensor(out_tensor)) { | |||
GELOGD("Collect node %s as unknown as output %lu is unknown.", node->GetName().c_str(), anchor_index); | |||
is_unknown = true; | |||
auto anchor = node->GetOutDataAnchor(static_cast<int>(anchor_index)); | |||
auto anchor = node->GetOutDataAnchor(anchor_index); | |||
for (const auto peer_anchor : anchor->GetPeerInDataAnchors()) { | |||
if (peer_anchor != nullptr) { | |||
GELOGD("Collect node %s as has unknown input from %s:%lu.", peer_anchor->GetOwnerNode()->GetName().c_str(), | |||
@@ -349,7 +349,7 @@ Status DynamicShapePartitioner::CollectSpreadUnknownShapeNodes(NodePtr node) { | |||
if (IsUnknownShapeTensor(in_tensor)) { | |||
GELOGD("Collect node %s as unknown as input %lu is unknown.", node->GetName().c_str(), anchor_index); | |||
is_unknown = true; | |||
auto anchor = node->GetInDataAnchor(static_cast<int>(anchor_index)); | |||
auto anchor = node->GetInDataAnchor(anchor_index); | |||
const auto peer_anchor = anchor->GetPeerOutAnchor(); | |||
if (peer_anchor != nullptr) { | |||
GELOGD("Collect node %s as has unknown output to %s:%lu.", peer_anchor->GetOwnerNode()->GetName().c_str(), | |||
@@ -453,15 +453,15 @@ std::string Cluster::DebugString() const { | |||
} | |||
ss << "[" << id_ << "](size:" << nodes_.size() << ")"; | |||
ss << "(" << min_ << "," << max_ << ")("; | |||
for (const auto &cluster : in_clusters_) { | |||
for (auto cluster : in_clusters_) { | |||
ss << cluster->id_ << ","; | |||
} | |||
ss << ")->("; | |||
for (const auto &cluster : out_clusters_) { | |||
for (auto cluster : out_clusters_) { | |||
ss << cluster->id_ << ","; | |||
} | |||
ss << ")|"; | |||
for (const auto &node : nodes_) { | |||
for (auto node : nodes_) { | |||
ss << (node->GetName() + "|"); | |||
} | |||
return ss.str(); | |||
@@ -507,12 +507,12 @@ void Cluster::Merge(ClusterPtr other) { | |||
in_clusters_.erase(other); | |||
out_clusters_.erase(other); | |||
auto in_clusters = other->in_clusters_; | |||
for (const auto &cluster : in_clusters) { | |||
for (auto cluster : in_clusters) { | |||
cluster->RemoveOutput(other); | |||
cluster->AddOutput(shared_from_this()); | |||
} | |||
auto out_clusters = other->out_clusters_; | |||
for (const auto &cluster : out_clusters) { | |||
for (auto cluster : out_clusters) { | |||
cluster->RemoveInput(other); | |||
cluster->AddInput(shared_from_this()); | |||
} | |||
@@ -529,7 +529,7 @@ bool Cluster::TryMerge(ClusterPtr other) { | |||
while (!forward_reached.empty()) { | |||
auto current_cluster = forward_reached.front(); | |||
forward_reached.pop(); | |||
for (const auto &cluster : current_cluster->out_clusters_) { | |||
for (auto cluster : current_cluster->out_clusters_) { | |||
if (cluster->max_ == max_ && current_cluster != other) { | |||
return false; | |||
} else if (cluster->min_ < max_) { | |||
@@ -557,7 +557,7 @@ std::vector<ClusterPtr> Cluster::MergeAllPathFrom(ClusterPtr other) { | |||
while (!forward_reached_queue.empty()) { | |||
auto current_cluster = forward_reached_queue.front(); | |||
forward_reached_queue.pop(); | |||
for (const auto &cluster : current_cluster->out_clusters_) { | |||
for (auto cluster : current_cluster->out_clusters_) { | |||
if (cluster->min_ < max_ && cluster->max_ != max_ && forward_reached_clusters.count(cluster) == 0) { | |||
forward_reached_clusters.insert(cluster); | |||
forward_reached_queue.push(cluster); | |||
@@ -567,7 +567,7 @@ std::vector<ClusterPtr> Cluster::MergeAllPathFrom(ClusterPtr other) { | |||
while (!backward_reached_queue.empty()) { | |||
auto current_cluster = backward_reached_queue.front(); | |||
backward_reached_queue.pop(); | |||
for (const auto &cluster : current_cluster->in_clusters_) { | |||
for (auto cluster : current_cluster->in_clusters_) { | |||
if (cluster->max_ > other->min_ && cluster->max_ != other->max_ && | |||
backward_reached_clusters.count(cluster) == 0) { | |||
backward_reached_clusters.insert(cluster); | |||
@@ -578,7 +578,7 @@ std::vector<ClusterPtr> Cluster::MergeAllPathFrom(ClusterPtr other) { | |||
} | |||
} | |||
} | |||
for (const auto &cluster : path_clusters) { | |||
for (auto cluster : path_clusters) { | |||
Merge(cluster); | |||
} | |||
return path_clusters; | |||
@@ -598,11 +598,11 @@ void Cluster::AddFrameOutput(OutDataAnchorPtr anchor) { | |||
}; | |||
InDataAnchorPtr Cluster::GetFrameInDataAnchor(InDataAnchorPtr anchor) { | |||
return partition_node_->GetInDataAnchor(static_cast<int>(inputs_index_[anchor])); | |||
return partition_node_->GetInDataAnchor(inputs_index_[anchor]); | |||
}; | |||
OutDataAnchorPtr Cluster::GetFrameOutDataAnchor(OutDataAnchorPtr anchor) { | |||
return partition_node_->GetOutDataAnchor(static_cast<int>(outputs_index_[anchor])); | |||
return partition_node_->GetOutDataAnchor(outputs_index_[anchor]); | |||
}; | |||
InControlAnchorPtr Cluster::GetFrameInControlAnchor() { return partition_node_->GetInControlAnchor(); }; | |||
@@ -616,25 +616,22 @@ Status Cluster::BuildFrame() { | |||
auto node = nodes_.front(); | |||
auto in_control_anchor = node->GetInControlAnchor(); | |||
if (in_control_anchor != nullptr) { | |||
for (const auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) { | |||
for (auto peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) { | |||
auto src_cluster = partitioner_->node_2_cluster_[peer_out_control_anchor->GetOwnerNode()]; | |||
if (src_cluster->id_ != id_) { | |||
REQUIRE_GRAPH_SUCCESS( | |||
GraphUtils::RemoveEdge(peer_out_control_anchor, in_control_anchor), | |||
"Failed remove edge from node %s index %d to node %s index %d.", | |||
peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), AnchorUtils::GetIdx(peer_out_control_anchor), | |||
in_control_anchor->GetOwnerNode()->GetName().c_str(), AnchorUtils::GetIdx(in_control_anchor)); | |||
auto src_cluster = partitioner_->node_2_cluster_[peer_out_control_anchor->GetOwnerNode()]; | |||
GraphUtils::RemoveEdge(peer_out_control_anchor, in_control_anchor); | |||
control_inputs_.insert(src_cluster); | |||
src_cluster->control_outputs_.insert(peer_out_control_anchor); | |||
} | |||
} | |||
} | |||
if (IsData()) { | |||
for (const auto &anchor : node->GetAllOutDataAnchors()) { | |||
for (auto anchor : node->GetAllOutDataAnchors()) { | |||
AddFrameOutput(anchor); | |||
} | |||
} else { | |||
for (const auto &anchor : node->GetAllInDataAnchors()) { | |||
for (auto anchor : node->GetAllInDataAnchors()) { | |||
AddFrameInput(anchor); | |||
} | |||
} | |||
@@ -663,7 +660,7 @@ Status Cluster::BuildPartitionFrame() { | |||
"Failed set shape flag."); | |||
REQUIRE_GRAPH_SUCCESS(GraphUtils::RemoveJustNode(graph, node), "Failed remove root graph node."); | |||
REQUIRE_GRAPH_SUCCESS(node->SetOwnerComputeGraph(subgraph_), "Failed set owner graph."); | |||
for (const auto &anchor : node->GetAllInDataAnchors()) { | |||
for (auto anchor : node->GetAllInDataAnchors()) { | |||
auto peer_out_anchor = anchor->GetPeerOutAnchor(); | |||
if (peer_out_anchor == nullptr) { | |||
continue; // Skip overhang input. | |||
@@ -677,7 +674,7 @@ Status Cluster::BuildPartitionFrame() { | |||
} | |||
auto in_control_anchor = node->GetInControlAnchor(); | |||
if (in_control_anchor != nullptr) { | |||
for (const auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) { | |||
for (auto peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) { | |||
if (peer_out_control_anchor == nullptr) { | |||
continue; | |||
} | |||
@@ -692,9 +689,9 @@ Status Cluster::BuildPartitionFrame() { | |||
} | |||
} | |||
} | |||
for (const auto &anchor : node->GetAllOutDataAnchors()) { | |||
for (auto anchor : node->GetAllOutDataAnchors()) { | |||
auto peer_in_anchors = anchor->GetPeerInDataAnchors(); | |||
for (const auto &peer_in_anchor : peer_in_anchors) { | |||
for (auto peer_in_anchor : peer_in_anchors) { | |||
auto src_cluster = partitioner_->node_2_cluster_[peer_in_anchor->GetOwnerNode()]; | |||
if (src_cluster->id_ != id_) { | |||
AddFrameOutput(anchor); | |||
@@ -720,7 +717,7 @@ Status Cluster::BuildPartitionFrame() { | |||
} | |||
Status Cluster::CombinePartitionFrame() { | |||
for (const auto &anchor : inputs_) { | |||
for (auto anchor : inputs_) { | |||
auto peer_out_anchor = anchor->GetPeerOutAnchor(); | |||
auto src_cluster = partitioner_->node_2_cluster_[peer_out_anchor->GetOwnerNode()]; | |||
auto src_anchor = src_cluster->GetFrameOutDataAnchor(peer_out_anchor); | |||
@@ -732,7 +729,7 @@ Status Cluster::CombinePartitionFrame() { | |||
src_anchor->GetOwnerNode()->GetName().c_str(), src_anchor->GetIdx(), | |||
dst_anchor->GetOwnerNode()->GetName().c_str(), dst_anchor->GetIdx()); | |||
} | |||
for (const auto &src_cluster : control_inputs_) { | |||
for (auto src_cluster : control_inputs_) { | |||
auto src_anchor = src_cluster->GetFrameOutControlAnchor(); | |||
auto dst_anchor = GetFrameInControlAnchor(); | |||
REQUIRE_GRAPH_SUCCESS(GraphUtils::AddEdge(src_anchor, dst_anchor), "Failed add edge from %s:%d to %s:%d.", | |||
@@ -777,8 +774,8 @@ Status Cluster::BuildPartitionSubgraph() { | |||
REQUIRE_NOT_NULL(net_output_node, "Failed add netoutput node to subgraph."); | |||
REQUIRE_GRAPH_SUCCESS(net_output_node->SetOwnerComputeGraph(subgraph_), "Failed set owner graph of netoutput node."); | |||
parent_node_index = 0; | |||
for (const auto &anchor : outputs_) { | |||
auto output_desc = anchor->GetOwnerNode()->GetOpDesc()->GetOutputDesc(static_cast<uint32_t>(anchor->GetIdx())); | |||
for (auto anchor : outputs_) { | |||
auto output_desc = anchor->GetOwnerNode()->GetOpDesc()->GetOutputDesc(anchor->GetIdx()); | |||
REQUIRE(AttrUtils::SetInt(output_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_node_index), | |||
"Failed set parent_node_index on subgraph netoutput's input."); | |||
REQUIRE_GRAPH_SUCCESS(net_output_op->UpdateInputDesc(parent_node_index, output_desc), | |||
@@ -789,7 +786,7 @@ Status Cluster::BuildPartitionSubgraph() { | |||
anchor->GetIdx()); | |||
parent_node_index++; | |||
} | |||
for (const auto &anchor : control_outputs_) { | |||
for (auto anchor : control_outputs_) { | |||
REQUIRE_GRAPH_SUCCESS(GraphUtils::AddEdge(anchor, net_output_node->GetInControlAnchor()), | |||
"Faile add control edge from %s:%d to netoutput node.", | |||
anchor->GetOwnerNode()->GetName().c_str(), anchor->GetIdx()); | |||
@@ -38,7 +38,6 @@ Status EnginePlacer::Run() { | |||
return FAILED; | |||
} | |||
// Assign engine for each node in the graph | |||
instance_ptr->DNNEngineManagerObj().InitPerformanceStaistic(); | |||
for (const auto &node_ptr : compute_graph_->GetDirectNode()) { | |||
GE_CHECK_NOTNULL(node_ptr); | |||
GE_CHECK_NOTNULL(node_ptr->GetOpDesc()); | |||
@@ -61,15 +60,12 @@ Status EnginePlacer::Run() { | |||
return FAILED; | |||
} | |||
} | |||
for (auto &it : instance_ptr->DNNEngineManagerObj().GetCheckSupportCost()) { | |||
GEEVENT("The time cost of %s::CheckSupported is [%lu] micro second.", it.first.c_str(), it.second); | |||
} | |||
GELOGI("Engine placer ends."); | |||
return SUCCESS; | |||
} | |||
Status EnginePlacer::AssignEngineAndLog(ge::ConstNodePtr node_ptr, const std::string &engine_name) { | |||
if ((node_ptr == nullptr) || (node_ptr->GetOpDesc() == nullptr)) { | |||
if (node_ptr == nullptr || node_ptr->GetOpDesc() == nullptr) { | |||
GELOGE(FAILED, "node_ptr is null."); | |||
return FAILED; | |||
} | |||
@@ -25,7 +25,6 @@ | |||
#include "framework/common/types.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
#include "graph/manager/graph_manager_utils.h" | |||
#include "graph/common/ge_call_wrapper.h" | |||
#include "graph/utils/graph_utils.h" | |||
#include "graph/utils/op_desc_utils.h" | |||
#include "graph/utils/type_utils.h" | |||
@@ -232,33 +231,33 @@ Status ge::GraphPartitioner::MergeSubGraph(ge::ComputeGraphPtr &output_merged_co | |||
ComputeGraphPtr new_sub_graph = MakeShared<ComputeGraph>(original_compute_graph->GetName()); | |||
GE_CHECK_NOTNULL(new_sub_graph); | |||
output_merged_compute_graph = new_sub_graph; | |||
GE_TIMESTAMP_START(MergeSubGraphRemoveNode); | |||
GE_TIMESTAMP_START(MergeGraphRemoveNode); | |||
if (RemoveNodeAndEdgeBetweenEndPld(output_merged_compute_graph, sub_graph_list) != ge::SUCCESS) { | |||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: merging sub-graphs failed"); | |||
return FAILED; | |||
} | |||
GE_TIMESTAMP_END(MergeSubGraphRemoveNode, "GraphPartitioner::MergeGraphRemoveNodeAndEdge"); | |||
GE_TIMESTAMP_START(MergeSubGraphTopologicalSorting); | |||
GE_TIMESTAMP_END(MergeGraphRemoveNode, "GraphPartitioner::MergeGraphRemoveNodeAndEdge"); | |||
GE_TIMESTAMP_START(MergeGraphTopologicalSorting); | |||
Status ret = output_merged_compute_graph->TopologicalSorting(); | |||
if (ret != SUCCESS) { | |||
GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[GraphPartitioner]: output_merged_compute_graph->TopologicalSorting failed"); | |||
return FAILED; | |||
} | |||
GE_TIMESTAMP_END(MergeSubGraphTopologicalSorting, "GraphPartitioner::MergeGraphTopologicalSorting"); | |||
GE_TIMESTAMP_END(MergeGraphTopologicalSorting, "GraphPartitioner::MergeGraphTopologicalSorting"); | |||
// flush all nodes' engine of merged graph | |||
GE_TIMESTAMP_START(MergeSubGraphEnginePlacerRun); | |||
GE_TIMESTAMP_START(MergeGraphEnginePlacerRun); | |||
graph_info_.engine_placer_.SetComputeGraph(output_merged_compute_graph); | |||
if (graph_info_.engine_placer_.Run() != SUCCESS) { | |||
GELOGE(GE_GRAPH_INIT_FAILED, "[GraphPartitioner]: engine_placer run failed"); | |||
return FAILED; | |||
} | |||
GE_TIMESTAMP_END(MergeSubGraphEnginePlacerRun, "GraphPartitioner::MergeGraphEnginePlacerRun"); | |||
GE_TIMESTAMP_END(MergeGraphEnginePlacerRun, "GraphPartitioner::MergeGraphEnginePlacerRun"); | |||
GELOGI("Graph merge ends."); | |||
return SUCCESS; | |||
} | |||
Status ge::GraphPartitioner::UpdatePldOpDesc(const NodePtr &dst_node, int input_index, OpDescPtr &pld_op_desc) { | |||
if ((dst_node == nullptr) || (pld_op_desc == nullptr) || (dst_node->GetOpDesc() == nullptr)) { | |||
if (dst_node == nullptr || pld_op_desc == nullptr || dst_node->GetOpDesc() == nullptr) { | |||
GELOGE(FAILED, "parameter ptr is null."); | |||
return FAILED; | |||
} | |||
@@ -276,7 +275,7 @@ Status ge::GraphPartitioner::UpdatePldOpDesc(const NodePtr &dst_node, int input_ | |||
} | |||
Status ge::GraphPartitioner::UpdateEndOpDesc(const NodePtr &src_node, int output_index, OpDescPtr &end_op_desc) { | |||
if ((src_node == nullptr) || (end_op_desc == nullptr) || (src_node->GetOpDesc() == nullptr)) { | |||
if (src_node == nullptr || end_op_desc == nullptr || src_node->GetOpDesc() == nullptr) { | |||
GELOGE(FAILED, "parameter ptr is null."); | |||
return FAILED; | |||
} | |||
@@ -297,9 +296,9 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr | |||
const AnchorPtr &peer_in_anchor, | |||
const ge::ComputeGraphPtr &pld_graph, | |||
const ge::ComputeGraphPtr &end_graph) { | |||
GE_CHECK_NOTNULL(out_anchor); | |||
GE_CHECK_NOTNULL(peer_in_anchor); | |||
GE_CHECK_NOTNULL(pld_graph); | |||
GE_CHECK_NOTNULL(out_anchor); | |||
GE_CHECK_NOTNULL(end_graph); | |||
const auto &src_node = out_anchor->GetOwnerNode(); | |||
const auto &dst_node = peer_in_anchor->GetOwnerNode(); | |||
@@ -314,7 +313,6 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr | |||
GELOGW("SetInt peerIndex failed");) | |||
GE_IF_BOOL_EXEC(!AttrUtils::SetStr(end_op_desc, "parentOpType", dst_node->GetType()), | |||
GELOGW("SetStr parentOpType failed");) | |||
GE_IF_BOOL_EXEC(!end_op_desc->SetExtAttr("parentNode", dst_node), GELOGW("SetEndExtAttr parentNode failed");) | |||
// replace input_desc of end with owner node's desc | |||
int output_index = ge::AnchorUtils::GetIdx(out_anchor); | |||
bool is_need_update_desc = (output_index >= 0) && (graph_info_.mode_ == kPartitioning); | |||
@@ -363,7 +361,6 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr | |||
GELOGW("SetStr parentId failed");) | |||
GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "anchorIndex", AnchorUtils::GetIdx(out_anchor)), | |||
GELOGW("SetInt anchorIndex failed");) | |||
GE_IF_BOOL_EXEC(!pld_op_desc->SetExtAttr("parentNode", src_node), GELOGW("SetPldExtAttr parentNode failed");) | |||
// do not care over flow | |||
graph_info_.num_of_pld_end_++; | |||
// replace output_desc of pld with input node's output desc | |||
@@ -398,14 +395,14 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr | |||
return FAILED; | |||
} | |||
graph_info_.index_2_end_[graph_info_.num_of_pld_end_] = new_end_node; | |||
graph_info_.pld_2_end_[new_pld_node] = new_end_node; | |||
graph_info_.end_2_pld_[new_end_node] = new_pld_node; | |||
graph_info_.pld_2_end_[new_pld_node] = new_end_node; | |||
return SUCCESS; | |||
} | |||
Status ge::GraphPartitioner::LinkInput2EndRemoveOrginalLink(ge::NodePtr input_node, ge::ComputeGraphPtr src_graph, | |||
ge::ComputeGraphPtr dst_graph) { | |||
if ((input_node == nullptr) || (src_graph == nullptr) || (dst_graph == nullptr)) { | |||
if (input_node == nullptr || src_graph == nullptr || dst_graph == nullptr) { | |||
GELOGE(FAILED, "parameter ptr is null."); | |||
return FAILED; | |||
} | |||
@@ -445,7 +442,7 @@ Status ge::GraphPartitioner::LinkInput2EndRemoveOrginalLink(ge::NodePtr input_no | |||
Status ge::GraphPartitioner::PutInputNodesInSubGraph(const ge::ComputeGraphPtr &src_graph, | |||
const ge::ComputeGraphPtr &dst_graph) { | |||
if ((src_graph == nullptr) || (dst_graph == nullptr)) { | |||
if (src_graph == nullptr || dst_graph == nullptr) { | |||
GELOGE(FAILED, "parameter ptr is null."); | |||
return FAILED; | |||
} | |||
@@ -852,34 +849,34 @@ Status ge::GraphPartitioner::PartitionSubGraph(ge::ComputeGraphPtr compute_graph | |||
GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[GraphPartitioner]: subGraphPtr->TopologicalSorting failed"); | |||
return FAILED; | |||
} | |||
GE_TIMESTAMP_START(PartitionSubGraphInitialize); | |||
GE_TIMESTAMP_START(GraphPartitionInitialize); | |||
if (Initialize(compute_graph) != SUCCESS) { | |||
GELOGE(GE_GRAPH_INIT_FAILED, "[GraphPartitioner]: initialize failed"); | |||
return FAILED; | |||
} | |||
GE_TIMESTAMP_END(PartitionSubGraphInitialize, "GraphPartitioner::PartitionInitialize"); | |||
GE_TIMESTAMP_START(PartitionSubGraphMarkClusters); | |||
GE_TIMESTAMP_END(GraphPartitionInitialize, "GraphPartitioner::PartitionInitialize"); | |||
GE_TIMESTAMP_START(GraphPartitionMarkClusters); | |||
MarkClusters(); | |||
GE_TIMESTAMP_END(PartitionSubGraphMarkClusters, "GraphPartitioner::PartitionMarkClusters"); | |||
GE_TIMESTAMP_START(PartitionSubGraphSplitSubGraphs); | |||
GE_TIMESTAMP_END(GraphPartitionMarkClusters, "GraphPartitioner::PartitionMarkClusters"); | |||
GE_TIMESTAMP_START(GraphPartitionSplitSubGraphs); | |||
if (SplitSubGraphs(compute_graph) != SUCCESS) { | |||
GELOGE(FAILED, "[GraphPartitioner]: SplitSubGraphs failed"); | |||
return FAILED; | |||
} | |||
GE_TIMESTAMP_END(PartitionSubGraphSplitSubGraphs, "GraphPartitioner::PartitionSplitSubGraphs"); | |||
GE_TIMESTAMP_START(PartitionSubGraphSortSubGraphs); | |||
GE_TIMESTAMP_END(GraphPartitionSplitSubGraphs, "GraphPartitioner::PartitionSplitSubGraphs"); | |||
GE_TIMESTAMP_START(GraphPartitionSortSubGraphs); | |||
if (SortSubGraphs(compute_graph) != ge::SUCCESS) { | |||
GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "Graph Partition SortSubGraphs failed."); | |||
return ge::FAILED; | |||
} | |||
GE_TIMESTAMP_END(PartitionSubGraphSortSubGraphs, "GraphPartitioner::PartitionSortSubGraphs"); | |||
GE_TIMESTAMP_START(PartitionSubGraphAddPartitionsToGraphNode); | |||
GE_TIMESTAMP_END(GraphPartitionSortSubGraphs, "GraphPartitioner::PartitionSortSubGraphs"); | |||
GE_TIMESTAMP_START(GraphPartitionAddPartitionsToGraphNode); | |||
vector<ge::SubGraphInfoPtr> output_subgraphs; | |||
if (AddPartitionsToGraphNode(output_subgraphs, compute_graph) != ge::SUCCESS) { | |||
GELOGE(GE_GRAPH_EMPTY_PARTITION, "Graph Partition AddPartitionsToGraphNode failed."); | |||
return ge::FAILED; | |||
} | |||
GE_TIMESTAMP_END(PartitionSubGraphAddPartitionsToGraphNode, "GraphPartitioner::PartitionAddPartitionsToGraphNode"); | |||
GE_TIMESTAMP_END(GraphPartitionAddPartitionsToGraphNode, "GraphPartitioner::PartitionAddPartitionsToGraphNode"); | |||
GELOGI("Graph Partition ends. Adding partitions to SubGraphInfo, got %zu sub graphs", output_subgraphs.size()); | |||
graph_info_.mode_ = kMerging; | |||
// do not care over flow | |||
@@ -926,7 +923,7 @@ Status ge::GraphPartitioner::AddPlaceHolderEnd(const AnchorPtr &out_anchor, cons | |||
Status ge::GraphPartitioner::SortSubGraphs(const ge::ComputeGraphPtr &compute_graph) { | |||
uint32_t rank = kRankOne; // rank 0 for data graph | |||
ComputeGraphPtr new_input_nodes_sub_graph = MakeShared<ComputeGraph>("inputNodeGraph"); | |||
if ((new_input_nodes_sub_graph == nullptr) || (compute_graph == nullptr)) { | |||
if (new_input_nodes_sub_graph == nullptr || compute_graph == nullptr) { | |||
GELOGE(FAILED, "[GraphPartitioner]: new_input_nodes_sub_graph or compute_graph is null."); | |||
return FAILED; | |||
} | |||
@@ -968,7 +965,7 @@ Status ge::GraphPartitioner::SortSubGraphs(const ge::ComputeGraphPtr &compute_gr | |||
} | |||
AnchorPtr ge::GraphPartitioner::GetEndInAnchor(const AnchorPtr &src_anchor, const NodePtr &end_node) { | |||
if ((src_anchor == nullptr) || (end_node == nullptr)) { | |||
if (src_anchor == nullptr || end_node == nullptr) { | |||
GELOGE(FAILED, "parameter ptr is null."); | |||
return nullptr; | |||
} | |||
@@ -982,7 +979,7 @@ AnchorPtr ge::GraphPartitioner::GetEndInAnchor(const AnchorPtr &src_anchor, cons | |||
} | |||
AnchorPtr ge::GraphPartitioner::GetPldOutAnchor(const NodePtr &pld_node, const AnchorPtr &dst_anchor) { | |||
if ((pld_node == nullptr) || (dst_anchor == nullptr)) { | |||
if (pld_node == nullptr || dst_anchor == nullptr) { | |||
GELOGE(FAILED, "parameter ptr is null."); | |||
return nullptr; | |||
} | |||
@@ -995,16 +992,16 @@ AnchorPtr ge::GraphPartitioner::GetPldOutAnchor(const NodePtr &pld_node, const A | |||
return pld_out_anchor; | |||
} | |||
void ge::GraphPartitioner::AddEndPldInformationToSubGraphInfo(ge::SubGraphInfoPtr &subgraph_info) { | |||
if (subgraph_info == nullptr) { | |||
void ge::GraphPartitioner::AddEndPldInformationToSubGraphInfo(ge::SubGraphInfoPtr &sub_graph_info) { | |||
if (sub_graph_info == nullptr) { | |||
GELOGE(FAILED, "parameter ptr is null."); | |||
return; | |||
} | |||
auto subgraph = subgraph_info->GetSubGraph(); | |||
GE_CHECK_NOTNULL_JUST_RETURN(subgraph); | |||
auto sub_graph = sub_graph_info->GetSubGraph(); | |||
GE_CHECK_NOTNULL_JUST_RETURN(sub_graph); | |||
NodetoNodeMap end_map; | |||
NodetoNodeMap pld_map; | |||
for (const auto &node : subgraph->GetDirectNode()) { | |||
for (const auto &node : sub_graph->GetDirectNode()) { | |||
if (node->GetType() == kEndType) { | |||
end_map[node] = graph_info_.end_2_pld_.at(node); | |||
} | |||
@@ -1012,8 +1009,8 @@ void ge::GraphPartitioner::AddEndPldInformationToSubGraphInfo(ge::SubGraphInfoPt | |||
pld_map[node] = graph_info_.pld_2_end_.at(node); | |||
} | |||
} | |||
subgraph_info->SetEnd2PldMap(end_map); | |||
subgraph_info->SetPld2EndMap(pld_map); | |||
sub_graph_info->SetEnd2PldMap(end_map); | |||
sub_graph_info->SetPld2EndMap(pld_map); | |||
} | |||
const Graph2SubGraphInfoList &ge::GraphPartitioner::GetSubGraphMap() { return graph_2_subgraph_list_; } | |||
@@ -22,12 +22,16 @@ | |||
#include <sstream> | |||
#include <vector> | |||
#include "framework/common/debug/ge_log.h" | |||
#include "common/ge_inner_error_codes.h" | |||
#include "common/ge/ge_util.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
#include "graph/utils/node_utils.h" | |||
#include "init/gelib.h" | |||
namespace { | |||
bool is_loop_graph = false; | |||
} | |||
namespace ge { | |||
namespace { | |||
bool GraphShouldBeSkip(const ge::ComputeGraphPtr &graph) { | |||
@@ -40,6 +44,7 @@ bool GraphShouldBeSkip(const ge::ComputeGraphPtr &graph) { | |||
} // namespace | |||
Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { | |||
GE_TIMESTAMP_START(AtomicAddrCleanPass); | |||
if (graph == nullptr) { | |||
GELOGE(PARAM_INVALID, "param [graph] must not be null."); | |||
return PARAM_INVALID; | |||
@@ -66,10 +71,10 @@ Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { | |||
} | |||
atomic_node_vec.push_back(node); | |||
} | |||
if (!is_loop_graph_ && node->GetType() == LOOPCOND) { | |||
if (!is_loop_graph && node->GetType() == LOOPCOND) { | |||
// there is loop in this graph | |||
GELOGD("There is no loop node. It will insert clean node follow atomic node."); | |||
is_loop_graph_ = true; | |||
is_loop_graph = true; | |||
} | |||
} | |||
if (atomic_node_vec.empty()) { | |||
@@ -78,7 +83,7 @@ Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { | |||
} | |||
// 2.Insert clean node and link to atomic node | |||
Status ret; | |||
if (is_loop_graph_) { | |||
if (is_loop_graph) { | |||
ret = HandleLoopGraph(graph, atomic_node_vec); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
@@ -90,6 +95,7 @@ Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { | |||
} | |||
} | |||
GELOGD("AtomicAddrCleanPass end."); | |||
GE_TIMESTAMP_END(AtomicAddrCleanPass, "GraphManager::AtomicAddrCleanPass"); | |||
return SUCCESS; | |||
} | |||
@@ -166,14 +172,12 @@ NodePtr AtomicAddrCleanPass::InsertAtomicAddrCleanNode(ComputeGraphPtr &graph) { | |||
if (!session_graph_id.empty()) { | |||
(void)AttrUtils::SetStr(op_desc, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id); | |||
} | |||
string node_name = op_desc->GetName(); | |||
// Only flush subgraph name | |||
if (graph->GetParentGraph() != nullptr) { | |||
node_name = graph->GetName() + "_" + node_name; | |||
} | |||
string node_name = (graph->GetParentGraph() != nullptr) | |||
? (graph->GetName() + "_" + op_desc->GetName() + session_graph_id) | |||
: (op_desc->GetName() + session_graph_id); | |||
string name = node_name + session_graph_id; | |||
op_desc->SetName(name); | |||
op_desc->SetName(node_name); | |||
GELOGI("Create cleanAddr op:%s.", op_desc->GetName().c_str()); | |||
// To avoid same name between graphs, set session graph id to this node | |||
NodePtr clean_addr_node = graph->AddNodeFront(op_desc); | |||
@@ -199,7 +203,7 @@ Status AtomicAddrCleanPass::LinkToAtomicNode(const NodePtr &atomic_node, NodePtr | |||
} | |||
GELOGD("Graph add cleanAddrNode op out ctrl edge, dst node: %s.", atomic_node->GetName().c_str()); | |||
std::string stream_label; | |||
if (is_loop_graph_ && AttrUtils::GetStr(atomic_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) { | |||
if (is_loop_graph && AttrUtils::GetStr(atomic_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) { | |||
if (!AttrUtils::SetStr(atomic_clean_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) { | |||
GELOGW("LinkToAtomicNode: SetStr failed"); | |||
return INTERNAL_ERROR; | |||
@@ -258,7 +262,7 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) { | |||
return true; | |||
} | |||
/// | |||
/// @brief Clear Status, used for subgraph pass | |||
/// @brief Clear Status, used for subgraph pass
/// @return SUCCESS | |||
/// | |||
Status AtomicAddrCleanPass::ClearStatus() { | |||
@@ -75,7 +75,6 @@ class AtomicAddrCleanPass : public GraphPass { | |||
bool IsAtomicOp(const NodePtr &node); | |||
vector<NodePtr> hcom_node_vec_; | |||
bool is_loop_graph_ = false; | |||
}; | |||
} // namespace ge | |||
@@ -1,319 +0,0 @@ | |||
/** | |||
* Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#include "graph/passes/attach_stream_label_pass.h" | |||
#include "ge/ge_api_types.h" | |||
#include "graph/common/omg_util.h" | |||
namespace ge { | |||
Status AttachStreamLabelPass::Run(ComputeGraphPtr graph) {
  GELOGD("AttachStreamLabelPass Enter.");
  // Collect StreamSwitch / StreamMerge / Enter nodes and the branch-head map.
  FindNodes(graph);
  // Attach a stream label to every candidate node that does not carry one yet.
  for (const auto &node : need_label_nodes_) {
    const OpDescPtr op_desc = node->GetOpDesc();
    GE_CHECK_NOTNULL(op_desc);
    if (op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)) {
      continue;  // Already labelled; nothing to update for this node.
    }
    GE_CHK_STATUS_RET(UpdateCondBranch(node), "Update cond branch failed, start node:%s.", node->GetName().c_str());
  }
  // Propagate labels starting from the collected Enter nodes.
  GE_CHK_STATUS_RET(UpdateEnterNode(), "UpdateEnterNode failed.");
  GELOGD("AttachStreamLabelPass Leave.");
  return SUCCESS;
}
/// | |||
/// @brief Clear Status, used for subgraph pass | |||
/// @return | |||
/// | |||
Status AttachStreamLabelPass::ClearStatus() {
  // Reset all per-graph bookkeeping so the pass instance can be reused.
  branch_head_nodes_.clear();
  enter_nodes_.clear();
  need_label_nodes_.clear();
  stream_switch_nodes_.clear();
  return SUCCESS;
}
/// | |||
/// @brief Find StreamSwitch / StreamMerge / Enter node | |||
/// @param [in] graph | |||
/// @return void | |||
/// | |||
void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) { | |||
for (const NodePtr &node : graph->GetDirectNode()) { | |||
const std::string &type = node->GetType(); | |||
if (type == STREAMSWITCH) { | |||
stream_switch_nodes_.emplace_back(node); | |||
} else if (type == STREAMMERGE) { | |||
if ((node->GetOpDesc() != nullptr) && !node->GetOpDesc()->HasAttr(ATTR_NAME_NEXT_ITERATION)) { | |||
need_label_nodes_.emplace_back(node); | |||
} | |||
} else if ((type == ENTER) || (type == REFENTER)) { | |||
enter_nodes_.emplace_back(node); | |||
} | |||
} | |||
for (const auto &node : stream_switch_nodes_) { | |||
for (const auto &out_ctrl_node : node->GetOutControlNodes()) { | |||
MarkHeadNodes(out_ctrl_node, node); | |||
} | |||
need_label_nodes_.emplace_back(node); | |||
} | |||
} | |||
/// | |||
/// @brief Mark node as head_node of stream_switch | |||
/// @param [in] node | |||
/// @param [in] stream_switch | |||
/// @return void | |||
/// | |||
void AttachStreamLabelPass::MarkHeadNodes(const NodePtr &node, const NodePtr &stream_switch) { | |||
static const std::set<std::string> bypass_type_set = {IDENTITY, IDENTITYN, CAST, TRANSDATA, | |||
TRANSPOSE, TRANSPOSED, RESHAPE}; | |||
std::stack<NodePtr> nodes; | |||
nodes.push(node); | |||
std::set<NodePtr> visited; | |||
while (!nodes.empty()) { | |||
NodePtr cur_node = nodes.top(); | |||
nodes.pop(); | |||
if (visited.count(cur_node) > 0) { | |||
continue; | |||
} | |||
GELOGD("branch_head_node %s of stream_switch %s.", cur_node->GetName().c_str(), stream_switch->GetName().c_str()); | |||
branch_head_nodes_[cur_node] = stream_switch; | |||
if (bypass_type_set.count(cur_node->GetType()) > 0) { | |||
for (const auto &out_node : cur_node->GetOutAllNodes()) { | |||
nodes.push(out_node); | |||
} | |||
} | |||
visited.insert(cur_node); | |||
} | |||
} | |||
/// | |||
/// @brief update cond branch | |||
/// @param [in] node | |||
/// @return Status | |||
/// | |||
Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) {
  // Traverse the cond branch rooted at `node`, collecting downstream nodes and
  // deriving the stream label (filled in by AttachFlag along the way).
  std::string stream_label;
  std::unordered_set<NodePtr> branch_nodes;
  std::unordered_set<NodePtr> visited;
  std::stack<NodePtr> nodes;
  nodes.push(node);
  // Traversal stops when it reaches one of these op types.
  static const std::set<std::string> end_type_set = {STREAMSWITCH, STREAMMERGE, MERGE};
  bool merge_flag = false;
  bool exit_flag = false;
  bool net_output_flag = false;
  while (!nodes.empty()) {
    NodePtr cur_node = nodes.top();
    nodes.pop();
    if (visited.count(cur_node) > 0) {
      continue;
    }
    // Updates stream_label / merge_flag / exit_flag / net_output_flag
    // according to cur_node's type.
    if (AttachFlag(cur_node, stream_label, merge_flag, exit_flag, net_output_flag) != SUCCESS) {
      GELOGE(FAILED, "Attach flag for node %s failed.", cur_node->GetName().c_str());
      return FAILED;
    }
    const std::string &type = cur_node->GetType();
    for (const auto &out_node : cur_node->GetOutAllNodes()) {
      const std::string &out_type = out_node->GetType();
      // Stop expanding at end-type nodes, at heads that belong to a different
      // StreamSwitch, and past (Ref)Enter nodes except via STREAMACTIVE.
      bool stop_flag = (end_type_set.count(out_type) > 0) ||
                       ((branch_head_nodes_.count(out_node) > 0) && (branch_head_nodes_[out_node] != node)) ||
                       (((type == ENTER) || (type == REFENTER)) && (out_type != STREAMACTIVE));
      if (!stop_flag) {
        nodes.push(out_node);
        GELOGD("Insert branch node %s.", out_node->GetName().c_str());
        branch_nodes.insert(out_node);
      }
    }
    visited.insert(cur_node);
  }
  if (node->GetType() == STREAMSWITCH) {
    GE_CHK_STATUS_RET(SetActiveLabelList(node, {stream_label}), "set active_label_list failed.");
  }
  // If the branch reached a Merge/Exit AND a NetOutput, labelling stops here.
  bool attach_flag = (merge_flag || exit_flag) && net_output_flag;
  if (attach_flag) {
    GELOGI("No need to keep on attaching label.");
    return SUCCESS;
  }
  // Otherwise attach the derived label to every collected branch node.
  for (const NodePtr &tmp_node : branch_nodes) {
    GELOGD("Attach label %s to node: %s.", stream_label.c_str(), tmp_node->GetName().c_str());
    GE_CHK_STATUS_RET(SetStreamLabel(tmp_node, stream_label), "Set stream label failed.");
  }
  return SUCCESS;
}
/// | |||
/// @brief attach flag | |||
/// @param [in] node | |||
/// @param [out] stream_label | |||
/// @param [out] merge_flag | |||
/// @param [out] exit_flag | |||
/// @param [out] net_output_flag | |||
/// @return Status | |||
/// | |||
Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &stream_label, bool &merge_flag,
                                         bool &exit_flag, bool &net_output_flag) {
  // Sets the out-flags / stream_label according to node type; types not
  // handled below leave all outputs untouched.
  const std::string &type = node->GetType();
  if (type == STREAMSWITCH) {
    if (node->GetInDataNodes().empty()) {
      GELOGE(INTERNAL_ERROR, "node %s has no input_data_node.", node->GetName().c_str());
      return INTERNAL_ERROR;
    }
    // Label is derived from the switch's first data input; note the node's own
    // label is set BEFORE the _t/_f branch suffix is appended.
    stream_label = node->GetInDataNodes().at(0)->GetName();
    GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed.");
    bool value = false;
    OpDescPtr op_desc = node->GetOpDesc();
    GE_CHECK_NOTNULL(op_desc);
    GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), return FAILED,
                     "StreamSwitch get attr TRUE_BRANCH_STREAM failed.");
    // Suffix distinguishes the true branch from the false branch.
    stream_label += (value ? "_t" : "_f");
  } else if (type == STREAMMERGE) {
    stream_label = node->GetName();
    GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed.");
    merge_flag = true;
  } else if ((type == EXIT) || (type == REFEXIT)) {
    // Exit nodes reuse the label accumulated so far by the caller's traversal.
    GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed.");
    exit_flag = true;
  } else if (type == NETOUTPUT) {
    net_output_flag = true;
  }
  return SUCCESS;
}
/// | |||
/// @brief Update stream_label start with enter nodes | |||
/// @return Status | |||
/// | |||
Status AttachStreamLabelPass::UpdateEnterNode() {
  // Group enter nodes by the STREAMACTIVE node they control.
  std::unordered_map<NodePtr, std::vector<NodePtr>> enter_active_map;
  for (const auto &enter_node : enter_nodes_) {
    for (const auto &out_ctrl_node : enter_node->GetOutControlNodes()) {
      if (out_ctrl_node->GetType() != STREAMACTIVE) {
        continue;
      }
      auto iter = enter_active_map.find(out_ctrl_node);
      if (iter == enter_active_map.end()) {
        enter_active_map[out_ctrl_node] = {enter_node};
      } else {
        iter->second.emplace_back(enter_node);
      }
    }
  }
  // For each active node: label its enter nodes, then propagate the active
  // label down the loop branch.
  for (const auto &pair : enter_active_map) {
    if (SetEnterLabel(pair.second, pair.first) != SUCCESS) {
      GELOGE(FAILED, "Set stream_label for enter_nodes failed.");
      return FAILED;
    }
    NodePtr active_node = pair.first;
    GE_CHECK_NOTNULL(active_node);
    // The active node is expected to carry exactly one non-empty active label.
    std::vector<std::string> active_label_list;
    if (!AttrUtils::GetListStr(active_node->GetOpDesc(), ATTR_NAME_ACTIVE_LABEL_LIST, active_label_list) ||
        (active_label_list.size() != 1) || active_label_list[0].empty()) {
      GELOGE(INTERNAL_ERROR, "Get attr ATTR_NAME_ACTIVE_LABEL_LIST failed, node: %s.", active_node->GetName().c_str());
      return INTERNAL_ERROR;
    }
    std::stack<NodePtr> enter_nodes;
    for (const auto &enter_node : pair.second) {
      enter_nodes.emplace(enter_node);
    }
    if (UpdateLoopBranch(enter_nodes, active_label_list[0]) != SUCCESS) {
      GELOGE(FAILED, "Update stream_label for loop_branch failed.");
      return FAILED;
    }
  }
  return SUCCESS;
}
/// | |||
/// @brief Set stream_label for enter_nodes | |||
/// @param [in] enter_nodes | |||
/// @param [in] active_node | |||
/// @return Status | |||
/// | |||
Status AttachStreamLabelPass::SetEnterLabel(const std::vector<NodePtr> &enter_nodes, const NodePtr &active_node) {
  // Start from the active node's existing label (may be empty).
  std::string stream_label;
  GE_CHECK_NOTNULL(active_node);
  (void)AttrUtils::GetStr(active_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label);
  // same_flag stays true while every labelled enter node agrees with
  // stream_label (unlabelled enter nodes are ignored).
  bool same_flag = true;
  for (const auto &enter_node : enter_nodes) {
    std::string tmp_label;
    (void)AttrUtils::GetStr(enter_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, tmp_label);
    if (tmp_label.empty() || (stream_label == tmp_label)) {
      continue;
    }
    same_flag = false;
    break;
  }
  if (stream_label.empty()) {
    if (same_flag) {
      // No label anywhere yet: derive one from the active node's name.
      stream_label = active_node->GetName();
    } else {
      // Conflicting enter labels with an unlabelled active node: leave as-is.
      GELOGW("stream_label of enter_active is empty while stream_label of some enter_node is not.");
      return SUCCESS;
    }
  }
  // Apply the resolved label to all enter nodes and the active node itself.
  for (const auto &enter_node : enter_nodes) {
    GE_CHK_STATUS_RET(SetStreamLabel(enter_node, stream_label), "Set stream label failed.");
  }
  GE_CHK_STATUS_RET(SetStreamLabel(active_node, stream_label), "Set stream label failed.");
  return SUCCESS;
}
/// | |||
/// @brief Update stream_label for loop_branch | |||
/// @param [in] enter_nodes | |||
/// @param [in] stream_label | |||
/// @return Status | |||
/// | |||
Status AttachStreamLabelPass::UpdateLoopBranch(const std::stack<NodePtr> &enter_nodes,
                                               const std::string &stream_label) {
  // Walk downstream from every enter node, labelling unlabelled successors.
  std::stack<NodePtr> pending(enter_nodes);
  while (!pending.empty()) {
    const NodePtr cur = pending.top();
    pending.pop();
    for (const NodePtr &succ : cur->GetOutAllNodes()) {
      OpDescPtr succ_desc = succ->GetOpDesc();
      GE_CHECK_NOTNULL(succ_desc);
      std::string succ_type = succ_desc->GetType();
      // Stop at nodes that already carry a label and at (Ref)Enter nodes.
      if (succ_desc->HasAttr(ATTR_NAME_STREAM_LABEL) || (succ_type == ENTER) || (succ_type == REFENTER)) {
        continue;
      }
      GELOGD("Attach label %s to node: %s.", stream_label.c_str(), succ->GetName().c_str());
      GE_CHK_STATUS_RET(SetStreamLabel(succ, stream_label), "Set stream label failed.");
      pending.push(succ);
    }
  }
  return SUCCESS;
}
} // namespace ge |
@@ -1,97 +0,0 @@ | |||
/** | |||
* Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#ifndef GE_GRAPH_PASSES_ATTACH_STREAM_LABEL_PASS_H_ | |||
#define GE_GRAPH_PASSES_ATTACH_STREAM_LABEL_PASS_H_ | |||
#include <stack> | |||
#include "inc/graph_pass.h" | |||
namespace ge { | |||
// Graph pass that attaches stream labels to nodes reachable from
// StreamSwitch / StreamMerge / Enter nodes, so downstream stream assignment
// can place cond/loop branches on the right streams.
class AttachStreamLabelPass : public GraphPass {
 public:
  Status Run(ComputeGraphPtr graph);

  ///
  /// @brief Clear Status, used for subgraph pass
  /// @return
  ///
  Status ClearStatus() override;

 private:
  ///
  /// @brief Find StreamSwitch / StreamMerge / Enter node
  /// @param [in] graph
  /// @return void
  ///
  void FindNodes(const ComputeGraphPtr &graph);

  ///
  /// @brief Mark node as head_node of stream_switch
  /// @param [in] node
  /// @param [in] stream_switch
  /// @return void
  ///
  void MarkHeadNodes(const NodePtr &node, const NodePtr &stream_switch);

  ///
  /// @brief update cond branch
  /// @param [in] node
  /// @return Status
  ///
  Status UpdateCondBranch(const NodePtr &node);

  ///
  /// @brief attach flag
  /// @param [in] node
  /// @param [out] stream_label
  /// @param [out] merge_flag
  /// @param [out] exit_flag
  /// @param [out] net_output_flag
  /// @return Status
  ///
  static Status AttachFlag(const NodePtr &node, std::string &stream_label, bool &merge_flag, bool &exit_flag,
                           bool &net_output_flag);

  ///
  /// @brief Update stream_label for loop_branch
  /// @param [in] enter_nodes
  /// @param [in] stream_label
  /// @return Status
  ///
  static Status UpdateLoopBranch(const std::stack<NodePtr> &enter_nodes, const std::string &stream_label);

  ///
  /// @brief Update stream_label start with enter nodes
  /// @return Status
  ///
  Status UpdateEnterNode();

  ///
  /// @brief Set stream_label for enter_nodes
  /// @param [in] enter_nodes
  /// @param [in] active_node
  /// @return Status
  ///
  static Status SetEnterLabel(const std::vector<NodePtr> &enter_nodes, const NodePtr &active_node);

  // StreamSwitch nodes collected by FindNodes.
  std::vector<NodePtr> stream_switch_nodes_;
  // Nodes that need a stream label (merge nodes first, then switch nodes).
  std::vector<NodePtr> need_label_nodes_;
  // Enter / RefEnter nodes collected by FindNodes.
  std::vector<NodePtr> enter_nodes_;
  // Maps each branch-head node to the StreamSwitch that owns it.
  std::unordered_map<NodePtr, NodePtr> branch_head_nodes_;
};
} // namespace ge | |||
#endif // GE_GRAPH_PASSES_ATTACH_STREAM_LABEL_PASS_H_ |
@@ -69,6 +69,7 @@ bool CastRemovePass::HasSameDataType(OpDescPtr &begin_op_desc, OpDescPtr &end_op | |||
auto begin_out_desc = begin_op_desc->MutableOutputDesc(0); | |||
DataType begin_out_datatype = begin_out_desc->GetDataType(); | |||
if (begin_out_datatype == end_out_datatype && (begin_out_datatype == DT_FLOAT16 || begin_out_datatype == DT_FLOAT)) { | |||
type = begin_out_datatype; | |||
return true; | |||
@@ -83,7 +83,6 @@ Status CommonSubexpressionEliminationPass::Run(ComputeGraphPtr graph) { | |||
continue; | |||
} | |||
auto key = GetCseKey(node); | |||
GELOGD("The node %s cse key %s", node->GetName().c_str(), key.c_str()); | |||
auto iter = keys_to_node.find(key); | |||
if (iter == keys_to_node.end()) { | |||
keys_to_node[key] = node; | |||
@@ -23,7 +23,6 @@ | |||
#include "common/ge_inner_error_codes.h" | |||
#include "framework/common/debug/ge_log.h" | |||
#include "graph/debug/ge_attr_define.h" | |||
#include "graph/common/ge_call_wrapper.h" | |||
#include "graph/op_desc.h" | |||
using domi::ImplyType; | |||
@@ -79,7 +78,7 @@ graphStatus CompileNodesPass::Run(ComputeGraphPtr graph) { | |||
return result; | |||
} | |||
GELOGI("[CompileNodesPass]: Optimize success."); | |||
GE_TIMESTAMP_EVENT_END(CompileNodesPass, "OptimizeStage2::ControlAttrOptimize::CompileNodesPass"); | |||
GE_TIMESTAMP_END(CompileNodesPass, "GraphManager::CompileNodesPass"); | |||
return GRAPH_SUCCESS; | |||
} | |||
@@ -102,6 +101,7 @@ graphStatus CompileNodesPass::GetSupportedKernel(const NodePtr &node, const std: | |||
} | |||
} | |||
OpsKernelInfoStorePtr kernel_info = instance->OpsKernelManagerObj().GetOpsKernelInfoStore(kernel_lib_name); | |||
if (kernel_info == nullptr) { | |||
GELOGE(ge::GE_GRAPH_PARAM_NULLPTR, "Get op %s ops kernel info store failed", node->GetName().c_str()); | |||
return ge::GE_GRAPH_PARAM_NULLPTR; | |||
@@ -226,7 +226,7 @@ Status CondPass::HandleScalarCond(const ComputeGraphPtr &graph, const OutDataAnc | |||
return FAILED; | |||
} | |||
if (GraphUtils::InsertNodeAfter(out_anchor, {in_anchor}, cast_node) != GRAPH_SUCCESS) { | |||
if (GraphUtils::InsertNodeBefore(out_anchor, {in_anchor}, cast_node) != GRAPH_SUCCESS) { | |||
GELOGE(FAILED, "Insert Cast node %s between %s->%s failed.", cast_node->GetName().c_str(), | |||
out_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str()); | |||
return FAILED; | |||
@@ -271,7 +271,7 @@ Status CondPass::InsertNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr | |||
} | |||
AddRePassNode(new_node); | |||
if (GraphUtils::InsertNodeAfter(out_anchor, {in_anchor}, new_node) != GRAPH_SUCCESS) { | |||
if (GraphUtils::InsertNodeBefore(out_anchor, {in_anchor}, new_node) != GRAPH_SUCCESS) { | |||
GELOGE(FAILED, "Insert %s node %s between %s->%s failed.", type.c_str(), new_node->GetName().c_str(), | |||
out_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str()); | |||
return FAILED; | |||