diff --git a/inc/common/optimizer/graph_optimizer.h b/inc/common/optimizer/graph_optimizer.h
index 2c0cebe6..253aaae1 100644
--- a/inc/common/optimizer/graph_optimizer.h
+++ b/inc/common/optimizer/graph_optimizer.h
@@ -62,6 +62,9 @@ class GraphOptimizer {
   // optimize streamed Graph
   virtual Status OptimizeStreamGraph(ComputeGraph &graph, const RunContext &context) { return SUCCESS; }
+
+  // optimize fused graph after graph slice (op compile)
+  virtual Status OptimizeFusedGraphAfterGraphSlice(ComputeGraph &graph) { return SUCCESS; }
 };
 }  // namespace ge
 /*lint +e148*/
diff --git a/inc/common/util/ai_core/common/aicore_util_attr_define.h b/inc/common/util/ai_core/common/aicore_util_attr_define.h
index 6c20c470..ba28d7b3 100644
--- a/inc/common/util/ai_core/common/aicore_util_attr_define.h
+++ b/inc/common/util/ai_core/common/aicore_util_attr_define.h
@@ -35,5 +35,7 @@ static const std::string ATTR_NAME_L2_FUSION_EXTEND_PTR = "l2_fusion_extend_cont
 static const std::string L1_OPTIMIZED = "l1_optimized";
 
 static const std::string L2_OPTIMIZED = "l2_optimized";
+
+static const std::string OP_SLICE_INFO = "_op_slice_info";
 }  // namespace fe
 #endif
diff --git a/inc/common/util/ai_core/common/scope_allocator.h b/inc/common/util/ai_core/common/scope_allocator.h
index 3b264425..6cebb286 100644
--- a/inc/common/util/ai_core/common/scope_allocator.h
+++ b/inc/common/util/ai_core/common/scope_allocator.h
@@ -34,6 +34,7 @@ class ScopeAllocator {
   bool HasScopeAttr(ge::ConstOpDescPtr opdef);
   bool GetScopeAttr(ge::ConstOpDescPtr opdef, int64_t& scopeId);
   bool SetScopeAttr(ge::OpDescPtr opdef, int64_t scopeId);
+  bool ResetScopeId(int64_t scopeId);
 
  private:
   int64_t scopeId;
diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h
index 619812d7..68743bc8 100644
--- a/inc/external/ge/ge_api_types.h
+++ b/inc/external/ge/ge_api_types.h
@@ -40,6 +40,7 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode";
 const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile";
 const char *const GE_AICPU_FLAG = "ge.aicpuFlag";
 const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath";
+// Dump flags and parameters
 const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump";
 const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath";
 const char *const OPTION_EXEC_DUMP_STEP = "ge.exec.dumpStep";
@@ -48,7 +49,10 @@ const char *const OPTION_EXEC_ENABLE_DUMP_DEBUG = "ge.exec.enableDumpDebug";
 const char *const OPTION_EXEC_DUMP_DEBUG_MODE = "ge.exec.dumpDebugMode";
 const char *const OPTION_EXEC_ENABLE_INCRE_BUILD = "ge.exec.enableIncreBuild";
 const char *const OPTION_EXEC_INCRE_BUILD_CACHE_PATH = "ge.exec.increBuildCachePath";
+const char *const OPTION_EXEC_ENABLE_EXCEPTION_DUMP = "ge.exec.enable_exception_dump";
 const char *const OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES = "ge.exec.enableScopeFusionPasses";
+const char *const OPTION_EXEC_PROFILING_FPPONIT_OPTIONS = "ge.exec.profilingFpPointOptions";
+const char *const OPTION_EXEC_PROFILING_BPPONIT_OPTIONS = "ge.exec.profilingBpPointOptions";
 // profiling flag
 const char *const OPTION_EXEC_PROFILING_MODE = "ge.exec.profilingMode";
 const char *const OPTION_EXEC_PROFILING_OPTIONS = "ge.exec.profilingOptions";
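The new exception-dump switch and the FP/BP profiling-point options are plain string options like the existing ones. A minimal sketch of wiring them up (the option keys are the constants above; the init call and node-name values are illustrative, not part of this patch):

    std::map<std::string, std::string> options;
    options["ge.exec.enable_exception_dump"] = "1";          // dump op inputs/outputs when an exception occurs
    options["ge.exec.profilingFpPointOptions"] = "fp_node";  // forward-propagation point (hypothetical node name)
    options["ge.exec.profilingBpPointOptions"] = "bp_node";  // backward-propagation point (hypothetical node name)
    // e.g. passed to ge::GEInitialize(options) or a Session, depending on deployment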
diff --git a/inc/external/graph/operator_reg.h b/inc/external/graph/operator_reg.h
index f0e1e84a..759c70f2 100644
--- a/inc/external/graph/operator_reg.h
+++ b/inc/external/graph/operator_reg.h
@@ -223,6 +223,7 @@ class OpReg {
                                                 \
  private:                                       \
   void __dy_input_##x() {                       \
+    Operator::DynamicInputRegister(#x, 0, true); \
     (void)OpReg()

 #define DYNAMIC_OUTPUT(x, t) \
@@ -242,6 +243,7 @@ class OpReg {
                                                 \
  private:                                       \
   void __dy_output_##x() {                      \
+    Operator::DynamicOutputRegister(#x, 0, true); \
     (void)OpReg()

 #define GRAPH(x) \
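With this change a dynamic port is registered (with zero instances) as soon as the operator object is constructed, instead of waiting for an explicit create_dynamic_input/output call. For reference, a declaration that exercises the macro might look like this (the op name and dtypes are illustrative):

    REG_OP(MyConcat)
        .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32}))  // expands __dy_input_x(), now pre-registering "x"
        .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32}))
        .ATTR(N, Int, 1)
        .OP_END_FACTORY_REG(MyConcat)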
diff --git a/inc/external/register/register.h b/inc/external/register/register.h
index e905e8d4..f3091fae 100644
--- a/inc/external/register/register.h
+++ b/inc/external/register/register.h
@@ -55,6 +55,28 @@ class Message;
 }  // namespace google

 namespace domi {
+const int64_t kMaxNameLength = 1048576;  // 1M
+
+enum DynamicType { kInvalid = 0, kInput = 1, kOutput = 2 };
+struct DynamicInputOutputInfo {
+  DynamicType type;  // input/output
+  const char *port_name;
+  int64_t port_name_len;
+  const char *attr_name;
+  int64_t attr_name_len;
+  DynamicInputOutputInfo()
+      : type(kInvalid), port_name(nullptr), port_name_len(0), attr_name(nullptr), attr_name_len(0) {}
+  DynamicInputOutputInfo(DynamicType type, const char *port_name, int64_t port_name_len, const char *attr_name,
+                         int64_t attr_name_len)
+      : type(type),
+        port_name(port_name),
+        port_name_len(port_name_len),
+        attr_name(attr_name),
+        attr_name_len(attr_name_len) {}
+};
+Status AutoMappingByOpFn(const ge::Operator &op_src, ge::Operator &op);
+Status AutoMappingByOpFnDynamic(const ge::Operator &op_src, ge::Operator &op,
+                                const vector<DynamicInputOutputInfo> &dynamic_name_attr_value);
 Status AutoMappingFn(const google::protobuf::Message *op_src, ge::Operator &op);
 Status AutoMappingFnDynamic(const google::protobuf::Message *op_src, ge::Operator &op,
                             std::map<std::string, std::pair<std::string, std::string>> dynamic_name_attr_value,
@@ -71,6 +93,7 @@ using ParseParamFunc = std::function<domi::Status(const google::protobuf::Messa
 using FusionParseParamFunc =
     std::function<domi::Status(const google::protobuf::Message *, std::vector<ge::Operator> &, ge::Operator &)>;
+using FusionParseParamByOpFunc = std::function<domi::Status(const std::vector<ge::Operator> &, ge::Operator &)>;
 using ParseSubgraphFunc = std::function<Status(const std::string &subgraph_name, const ge::Graph &graph)>;

 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData {
@@ -91,6 +114,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData {

   OpRegistrationData &FusionParseParamsFn(const FusionParseParamFunc &fusionParseParamFn);

+  OpRegistrationData &FusionParseParamsFn(const FusionParseParamByOpFunc &fusion_parse_param_fn);
+
   OpRegistrationData &ParseSubgraphPostFn(const ParseSubgraphFunc &subgraph_post_fn);

   OpRegistrationData &ImplyType(const domi::ImplyType &imply_type);
@@ -108,6 +133,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData {
   ParseParamFunc GetParseParamFn() const;
   ParseParamByOpFunc GetParseParamByOperatorFn() const;
   FusionParseParamFunc GetFusionParseParamFn() const;
+  FusionParseParamByOpFunc GetFusionParseParamByOpFn() const;
   ParseSubgraphFunc GetParseSubgraphPostFn() const;

  private:
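AutoMappingByOpFnDynamic() is the operator-based counterpart of AutoMappingFnDynamic(): it maps attributes straight from a source ge::Operator instead of a protobuf message, with the dynamic ports described by DynamicInputOutputInfo. A minimal sketch of a parser callback using it (op, port, and attr names are illustrative):

    #include <cstring>
    domi::Status ParseParamsMyConcat(const ge::Operator &op_src, ge::Operator &op) {
      // dynamic input port "x" whose instance count comes from attr "N"
      std::vector<domi::DynamicInputOutputInfo> dynamic_info;
      dynamic_info.emplace_back(domi::kInput, "x", strlen("x"), "N", strlen("N"));
      return domi::AutoMappingByOpFnDynamic(op_src, op, dynamic_info);
    }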
diff --git a/inc/external/register/scope/scope_fusion_pass_register.h b/inc/external/register/scope/scope_fusion_pass_register.h
index 77be4b8c..8e5605a7 100644
--- a/inc/external/register/scope/scope_fusion_pass_register.h
+++ b/inc/external/register/scope/scope_fusion_pass_register.h
@@ -21,6 +21,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include <unordered_map>
 #include "ge/ge_api_error_codes.h"
 #include "register/register_error_codes.h"
 #include "register/register_types.h"
@@ -52,15 +53,16 @@ class ScopePassManager;

 class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY Scope {
  public:
-  explicit Scope(const std::string &name, const std::string &sub_type = "", Scope *father_scope = nullptr);
+  Scope();
+  Status Init(const std::string &name, const std::string &sub_type = "", Scope *father_scope = nullptr);
   ~Scope();

-  std::string Name() const;
-  std::string SubType() const;
-  std::map<std::string, ge::OperatorPtr> AllNodesMap() const;
+  const std::string &Name() const;
+  const std::string &SubType() const;
+  const std::unordered_map<std::string, ge::OperatorPtr> &AllNodesMap() const;
   Scope *GetSubScope(const std::string &scope_name) const;
-  std::string LastName() const;
-  std::vector<Scope *> GetAllSubScopes() const;
+  const std::string LastName() const;
+  const std::vector<Scope *> &GetAllSubScopes() const;
   const Scope *GetFatherScope() const;

  private:
@@ -76,12 +78,13 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY Scope {
 class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY FusionScopesResult {
  public:
   FusionScopesResult();
+  Status Init();
   ~FusionScopesResult();
   void SetName(const std::string &name);
   void SetType(const std::string &type);
   void SetDescription(const std::string &description);
-  std::string Name() const;
-  std::vector<ge::OperatorPtr> Nodes() const;
+  const std::string &Name() const;
+  const std::vector<ge::OperatorPtr> &Nodes() const;
   void InsertInputs(const std::string &inner_op_name, const std::vector<int32_t> &index_map);
   void InsertOutputs(const std::string &inner_op_name, const std::vector<int32_t> &index_map);
@@ -136,7 +139,7 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY ScopeTree {
   ScopeTree &operator=(const ScopeTree &scopetree) = delete;
   ~ScopeTree();

-  std::vector<Scope *> GetAllScopes() const;
+  const std::vector<Scope *> &GetAllScopes() const;

  private:
  class ScopeTreeImpl;
@@ -154,7 +157,7 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY ScopeGraph {
   ~ScopeGraph();

   const ScopeTree *GetScopeTree() const;
-  std::map<std::string, ge::NodePtr> GetNodesMap() const;
+  const std::unordered_map<std::string, ge::NodePtr> &GetNodesMap() const;

  private:
  class ScopeGraphImpl;
@@ -203,7 +206,7 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY NodeOpTypeFeature : ScopeBa
 class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY NodeAttrFeature : ScopeBaseFeature {
  public:
-  NodeAttrFeature(std::string nodeType, std::string attr_name, ge::DataType datatype, ScopeAttrValue attr_value);
+  NodeAttrFeature(std::string nodeType, std::string attr_name, ge::DataType datatype, ScopeAttrValue &attr_value);
   NodeAttrFeature(NodeAttrFeature const &feature);
   NodeAttrFeature &operator=(NodeAttrFeature const &feature);
   ~NodeAttrFeature();
diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h
index 3686befc..36c1a0bf 100644
--- a/inc/framework/common/ge_types.h
+++ b/inc/framework/common/ge_types.h
@@ -258,16 +258,19 @@ struct ComputeGraphDescInfo {
 struct OpDescInfo {
   std::string op_name;
+  std::string op_type;
   uint32_t task_id;
   uint32_t stream_id;
   std::vector<Format> input_format;
   std::vector<std::vector<int64_t>> input_shape;
   std::vector<DataType> input_data_type;
   std::vector<void *> input_addrs;
+  std::vector<int64_t> input_size;
   std::vector<Format> output_format;
   std::vector<std::vector<int64_t>> output_shape;
   std::vector<DataType> output_data_type;
   std::vector<void *> output_addrs;
+  std::vector<int64_t> output_size;
 };
 struct ModelDumpConfig {
   std::string model_name;
diff --git a/inc/framework/common/helper/model_helper.h b/inc/framework/common/helper/model_helper.h
index 3671f970..fbe7e73f 100644
--- a/inc/framework/common/helper/model_helper.h
+++ b/inc/framework/common/helper/model_helper.h
@@ -64,6 +64,7 @@ class ModelHelper {
   Status LoadWeights(OmFileLoadHelper& om_load_helper);
   Status LoadTask(OmFileLoadHelper& om_load_helper);
   Status LoadTBEKernelStore(OmFileLoadHelper& om_load_helper);
+  Status LoadCustAICPUKernelStore(OmFileLoadHelper& om_load_helper);
   Status ReleaseLocalModelData() noexcept;
   Status SaveModelPartition(std::shared_ptr<OmFileSaveHelper>& om_file_save_helper, ModelPartitionType type,
                             const uint8_t* data, size_t size);
diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h
index 189c63c3..ad284d07 100644
--- a/inc/framework/common/types.h
+++ b/inc/framework/common/types.h
@@ -851,9 +851,9 @@ static constexpr int32_t PARTITION_TYPE_WEIGHTS = 1;
 static constexpr int32_t PARTITION_TYPE_TASK_INFO = 2;

 // number of partitions in the current model
-static constexpr uint32_t PARTITION_SIZE = 4;
+static constexpr uint32_t PARTITION_SIZE = 5;

-enum ModelPartitionType { MODEL_DEF = 0, WEIGHTS_DATA, TASK_INFO, TBE_KERNELS };
+enum ModelPartitionType { MODEL_DEF = 0, WEIGHTS_DATA, TASK_INFO, TBE_KERNELS, CUST_AICPU_KERNELS };

 struct ModelPartitionMemInfo {
   ModelPartitionType type;
diff --git a/inc/framework/executor/ge_executor.h b/inc/framework/executor/ge_executor.h
index f9fa4ce9..00846112 100644
--- a/inc/framework/executor/ge_executor.h
+++ b/inc/framework/executor/ge_executor.h
@@ -108,11 +108,11 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor {
   /// @ingroup ge
   /// @brief Get current dynamic dims info by combined dims
   /// @param [in] model_id: model id allocate from manager
-  /// @param [in] combined_dims: array of combined dimensions
+  /// @param [in] dynamic_dims: dynamic dims value of the current gear
   /// @param [out] cur_dynamic_dims: current dynamic dims
   /// @return execute result
   ///
-  ge::Status GetCurDynamicDims(uint32_t model_id, const std::vector<uint64_t> &combined_dims,
+  ge::Status GetCurDynamicDims(uint32_t model_id, const std::vector<uint64_t> &dynamic_dims,
                                std::vector<uint64_t> &cur_dynamic_dims);

   ///
diff --git a/inc/framework/generator/ge_generator.h b/inc/framework/generator/ge_generator.h
index d3f472e9..37bca897 100644
--- a/inc/framework/generator/ge_generator.h
+++ b/inc/framework/generator/ge_generator.h
@@ -28,6 +28,7 @@
 #include "graph/graph.h"
 #include "graph/op_desc.h"
 #include "graph/detail/attributes_holder.h"
+#include "omg/omg_inner_types.h"

 namespace ge {
 class GeGenerator {
@@ -45,6 +46,7 @@ class GeGenerator {
   GeGenerator &operator=(const GeGenerator &) = delete;

   Status Initialize(const std::map<std::string, std::string> &options);
+  Status Initialize(const std::map<std::string, std::string> &options, OmgContext &context);

   Status Finalize();
diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h
index 80361232..2f91d7aa 100644
--- a/inc/framework/omg/omg_inner_types.h
+++ b/inc/framework/omg/omg_inner_types.h
@@ -98,24 +98,14 @@ struct OmgContext {
   std::vector<std::string> out_top_names;
   // path for the aicpu custom operator so_file
   std::vector<std::string> aicpu_op_run_paths;
-  // ddk version
-  std::string ddk_version;
   // preferential format used by the entire network
   domiTensorFormat_t net_format = DOMI_TENSOR_RESERVED;
   domi::FrameworkType type = domi::FRAMEWORK_RESERVED;
   RunMode run_mode = ONLY_PRE_CHECK;
   bool train_flag = false;
-  // whether to use FP16 high precision
-  int32_t fp16_high_precision = HIGH_PRECISION_DEFAULT;
   std::string output_type;
-  // Save the name of the entire network: Some special operators are used to determine a network. Some operators in the
-  // network require special processing based on the specific network. e.g.: for faster-rcnn, the FirstStageProcessor
-  // module is determined as the Faster-R-CNN network based on the scope fusion. Then, the conv+reshape operators in the
-  // FirstStageBoxPredictor/BoxEncodingPredictor scope are combined. The convolution kernel rearrangement reshape
-  // operator needs to be deleted for the convolution kernel.
-  std::string net_name;
   // Whether to use dynamic batch size or dynamic image size
   bool is_dynamic_input = false;
   std::string dynamic_batch_size;
diff --git a/inc/graph/compute_graph.h b/inc/graph/compute_graph.h
index 8d3db43c..2ec6b663 100644
--- a/inc/graph/compute_graph.h
+++ b/inc/graph/compute_graph.h
@@ -93,6 +93,7 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A
   NodePtr AddNodeFront(const OpDescPtr &op);
   NodePtr AddInputNode(NodePtr node);
   NodePtr AddOutputNode(NodePtr node);
+  NodePtr AddOutputNodeByIndex(NodePtr node, int32_t index);
   // insert node with specific pre_node
   NodePtr AddNodeAfter(OpDescPtr &op, const NodePtr &pre_node);
   NodePtr AddNodeAfter(NodePtr node, const NodePtr &pre_node);
@@ -138,6 +139,7 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A
   graphStatus TopologicalSorting();
   bool IsValid() const;
+  void InValid() { is_valid_flag_ = false; }
   void Dump() const;

   void Swap(ComputeGraph &graph);
@@ -268,6 +270,7 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A
   friend class ModelSerializeImp;
   friend class GraphDebugImp;
   friend class OnnxUtils;
+  friend class TuningUtils;

   std::string name_;
   uint32_t graph_id_ = 0;
diff --git a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h
index 714375e4..b0bf8ce9 100644
--- a/inc/graph/debug/ge_attr_define.h
+++ b/inc/graph/debug/ge_attr_define.h
@@ -1031,6 +1031,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_FLAG;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_ADDR;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_VALID_SIZE;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ENGINE_NAME_FOR_LX;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NEED_LX_FUSION;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OPTIMIZE_GROUP;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_COMPILE_STRATEGY;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TBE_KERNEL_NAME;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TBE_KERNEL_BUFFER;

 // for unregistered op
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_UNREGST_OPPATH;
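These attributes carry per-op lx-fusion hints through the graph. They are ordinary string attributes, so a fusion pass can tag an op and the compile side can read the hint back; a minimal sketch (the strategy payload and engine name are illustrative):

    // assuming an OpDescPtr op_desc obtained from a node
    (void)ge::AttrUtils::SetStr(op_desc, ge::ATTR_NAME_OP_COMPILE_STRATEGY, "my_strategy");  // illustrative payload
    (void)ge::AttrUtils::SetStr(op_desc, ge::ATTR_NAME_ENGINE_NAME_FOR_LX, "AIcoreEngine");
    std::string strategy;
    if (ge::AttrUtils::GetStr(op_desc, ge::ATTR_NAME_OP_COMPILE_STRATEGY, strategy)) {
      // the op compiler picks the hint back up here
    }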
diff --git a/inc/graph/node.h b/inc/graph/node.h
index 2629f525..f4a1c6a8 100644
--- a/inc/graph/node.h
+++ b/inc/graph/node.h
@@ -174,6 +174,9 @@ class Node : public std::enable_shared_from_this<Node> {
     fusion_output_dataflow_list_ = fusion_output_list;
   }

+  bool GetHostNode() const { return host_node_; }
+  void SetHostNode(bool is_host) { host_node_ = is_host; }
+
   void SetOrigNode(const NodePtr &orignode) { orig_node_ = orignode; }
   NodePtr GetOrigNode() { return orig_node_; }

@@ -192,6 +195,7 @@ class Node : public std::enable_shared_from_this<Node> {
   OutControlAnchorPtr out_control_anchor_;
   map<string, GeAttrValue> attrs_;  // lint !e1073
   bool has_init_{false};
+  bool host_node_{false};
   bool anchor_status_updated_{false};
   std::vector<uint32_t> send_event_id_list_;
   std::vector<uint32_t> recv_event_id_list_;
@@ -202,6 +206,7 @@ class Node : public std::enable_shared_from_this<Node> {
   NodePtr orig_node_;
   friend class NodeUtils;
   friend class OnnxUtils;
+  friend class TuningUtils;
 };
 }  // namespace ge
diff --git a/inc/graph/op_desc.h b/inc/graph/op_desc.h
index 27c91efc..c7da30b7 100644
--- a/inc/graph/op_desc.h
+++ b/inc/graph/op_desc.h
@@ -18,6 +18,7 @@
 #define INC_GRAPH_OP_DESC_H_

 #include <functional>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_set>
@@ -87,6 +88,8 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {

   graphStatus AddInputDescMiddle(const string &name, const unsigned int num, size_t index);

+  graphStatus AddOutputDescMiddle(const string &name, const unsigned int num, size_t index);
+
   graphStatus AddOutputDescForward(const string &name, const unsigned int num);

   graphStatus AddOptionalInputDesc(const string &name, const GeTensorDesc &input_desc);
@@ -187,6 +190,14 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {

   graphStatus CommonVerify() const;

+  graphStatus AddRegisterInputName(const string &name);
+
+  graphStatus AddRegisterOutputName(const string &name);
+
+  vector<string> GetRegisterInputName() const;
+
+  vector<string> GetRegisterOutputName() const;
+
   using AttrHolder::AddRequiredAttr;
   using AttrHolder::DelAttr;
   using AttrHolder::GetAllAttrNames;
@@ -297,9 +308,11 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {
   vector<GeTensorDescPtr> inputs_desc_{};
   map<string, uint32_t> input_name_idx_{};
+  vector<string> register_input_name_{};
   std::unordered_set<string> optional_input_names_{};
   vector<GeTensorDescPtr> outputs_desc_{};
   map<string, uint32_t> output_name_idx_{};
+  vector<string> register_output_name_{};
   std::function<graphStatus(Operator &)> infer_func_ = nullptr;
   std::function<graphStatus(Operator &)> infer_format_func_ = nullptr;
   std::function<graphStatus(Operator &)> verifier_func_ = nullptr;
diff --git a/inc/graph/op_kernel_bin.h b/inc/graph/op_kernel_bin.h
index e81d79d0..3970460a 100644
--- a/inc/graph/op_kernel_bin.h
+++ b/inc/graph/op_kernel_bin.h
@@ -42,6 +42,7 @@ class OpKernelBin {

 using OpKernelBinPtr = std::shared_ptr<OpKernelBin>;
 const char *const OP_EXTATTR_NAME_TBE_KERNEL = "tbeKernel";
+const char *const OP_EXTATTR_CUSTAICPU_KERNEL = "cust_aicpu_kernel";
 }  // namespace ge

 #endif  // INC_GRAPH_OP_KERNEL_BIN_H_
diff --git a/inc/graph/opsproto_manager.h b/inc/graph/opsproto_manager.h
index 46b722ec..06846573 100644
--- a/inc/graph/opsproto_manager.h
+++ b/inc/graph/opsproto_manager.h
@@ -23,6 +23,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <mutex>

 namespace ge {
 class OpsProtoManager {
@@ -30,14 +31,15 @@ class OpsProtoManager {
   static OpsProtoManager *Instance();

   bool Initialize(const std::map<std::string, std::string> &options);
-
   void Finalize();
+
+ private:
   void LoadOpsProtoPluginSo(std::string &path);

- private:
   std::string pluginPath_;
   std::vector<void *> handles_;
+  bool is_init_ = false;
+  std::mutex mutex_;
 };
 }  // namespace ge
diff --git a/inc/graph/tuning_utils.h b/inc/graph/tuning_utils.h
new file mode 100644
index 00000000..98262a23
--- /dev/null
+++ b/inc/graph/tuning_utils.h
@@ -0,0 +1,130 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MAIN_TUNING_UTILS_H
+#define MAIN_TUNING_UTILS_H
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <queue>
+#include <string>
+
+#include <graph/anchor.h>
+#include <graph/detail/attributes_holder.h>
+#include <graph/ge_attr_value.h>
+#include <graph/ge_tensor.h>
+#include <graph/graph.h>
+#include <graph/model.h>
+#include <graph/node.h>
+#include <graph/utils/graph_utils.h>
+
+#include "framework/common/debug/ge_log.h"
+#include "utils/attr_utils.h"
+#include "utils/node_utils.h"
+#include "external/ge/ge_api_types.h"
+#include "graph/debug/ge_attr_define.h"
+#include "graph/utils/op_desc_utils.h"
+#include "graph/utils/tensor_utils.h"
+namespace ge {
+// Configure build mode, default value is "normal"
+const char *const BUILD_MODE = "ge.buildMode";
+const char *const BUILD_STEP = "ge.buildStep";
+// Configure tuning path
+const char *const TUNING_PATH = "ge.tuningPath";
+// for interface: aclgrphBuildModel
+const std::set<std::string> ir_builder_supported_options_for_lx_fusion = {BUILD_MODE, BUILD_STEP, TUNING_PATH};
+
+// Build model
+const char *const BUILD_MODE_NORMAL = "normal";
+const char *const BUILD_MODE_TUNING = "tuning";
+const char *const BUILD_MODE_BASELINE = "baseline";
+const std::set<std::string> build_mode_options = {BUILD_MODE_NORMAL, BUILD_MODE_TUNING, BUILD_MODE_BASELINE};
+
+// Build step
+const char *const BUILD_STEP_BEFORE_UB_MATCH = "before_ub_match";
+const char *const BUILD_STEP_AFTER_UB_MATCH = "after_ub_match";
+const char *const BUILD_STEP_AFTER_BUILDER = "after_builder";
+const char *const BUILD_STEP_AFTER_BUILDER_SUB = "after_builder_sub";
+const char *const BUILD_STEP_AFTER_MERGE = "after_merge";
+const std::set<std::string> build_step_options = {BUILD_STEP_BEFORE_UB_MATCH, BUILD_STEP_AFTER_UB_MATCH,
+                                                  BUILD_STEP_AFTER_BUILDER, BUILD_STEP_AFTER_BUILDER_SUB,
+                                                  BUILD_STEP_AFTER_MERGE};
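+// A typical tuning build passes these keys in the options map of the IR build
+// interface named above (a sketch; the path value is illustrative):
+//   std::map<std::string, std::string> build_options = {{BUILD_MODE, BUILD_MODE_TUNING},
+//                                                       {BUILD_STEP, BUILD_STEP_BEFORE_UB_MATCH},
+//                                                       {TUNING_PATH, "/tmp/tuning_graphs"}};
+//   e.g. aclgrphBuildModel(graph, build_options, model);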
+using SubgraphCreateOutNode = std::unordered_map<ComputeGraphPtr, NodePtr>;
+using NodetoNodeMap = std::unordered_map<NodePtr, NodePtr>;
+using NodeSet = std::set<NodePtr>;
+using NodeNametoNodeNameMap = std::unordered_map<std::string, std::string>;
+using NodetoNodeNameMap = std::unordered_map<NodePtr, std::string>;
+class TuningUtils {
+ public:
+  TuningUtils() = default;
+  ~TuningUtils() = default;
+  // Dump all the subgraphs and modify
+  // the subgraphs in them to be executable subgraphs if exe_flag is true
+  // `tuning_path` means path to save the graphs
+  static graphStatus ConvertGraphToFile(std::vector<ComputeGraphPtr> tuning_subgraphs,
+                                        std::vector<ComputeGraphPtr> non_tuning_subgraphs = {}, bool exe_flag = false,
+                                        const std::string &path = "", const std::string &user_path = "");
+  // Recover `graph` from the graph dump files configured in options
+  static graphStatus ConvertFileToGraph(const map<int64_t, string> &options, ge::Graph &graph);
+
+ private:
+  // part 1
+  struct HelpInfo {
+    int64_t index;
+    bool exe_flag;
+    bool is_tuning_graph;
+    const std::string &path;
+    const std::string &user_path;
+  };
+  static graphStatus MakeExeGraph(ComputeGraphPtr &exe_graph, const HelpInfo &help_info);
+  static graphStatus HandlePld(NodePtr &node);
+  static graphStatus HandleEnd(NodePtr &node);
+  static graphStatus ChangePld2Data(NodePtr &node, NodePtr &data_node);
+  static graphStatus ChangeEnd2NetOutput(NodePtr &node, NodePtr &out_node);
+  static graphStatus LinkEnd2NetOutput(NodePtr &node, NodePtr &out_node);
+  static graphStatus CreateDataNode(NodePtr &node, NodePtr &data_node);
+  static graphStatus CreateNetOutput(NodePtr &node, NodePtr &out_node);
+  static graphStatus AddAttrToDataNodeForMergeGraph(const NodePtr &pld, NodePtr &data_node);
+  static graphStatus AddAttrToNetOutputForMergeGraph(const NodePtr &end, NodePtr &out_node);
+  static void DumpGraphToPath(ComputeGraphPtr &exe_graph, int64_t index, bool is_tuning_graph, std::string path);
+
+  static SubgraphCreateOutNode create_output_;
+  // part 2
+  static graphStatus MergeAllSubGraph(std::vector<ComputeGraphPtr> &graphs, ComputeGraphPtr &graph);
+  static graphStatus MergeSubGraph(ComputeGraphPtr &graph);
+  // Deletes the Data and NetOutput nodes that were added by the MakeExeGraph() call in part 1
+  static graphStatus RemoveDataNetoutputEdge(ComputeGraphPtr &graph);
+  static graphStatus GetInAndOutAnchorPair(NodePtr &data_node, NodePtr &out_node, AnchorPtr &dest_in_anchor,
+                                           AnchorPtr &src_out_anchor);
+  static NodeNametoNodeNameMap data_2_netoutput_;
+  static NodetoNodeNameMap data_node_2_netoutput_;
+  static NodetoNodeMap data_node_2_netoutput_node_;
+  static NodeSet netoutput_nodes_;
+  static NodeSet merged_graph_nodes_;
+  static std::mutex mutex_;
+  // for debug
+  static std::string PrintCheckLog();
+  static std::string GetNodeNameByAnchor(const Anchor *anchor);
+};
+}  // namespace ge
+#endif  // MAIN_TUNING_UTILS_H
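Taken together, part 1 dumps each partitioned subgraph as an executable graph file and part 2 merges the dumps back into one graph. A round trip might look like this (a sketch; the paths are illustrative, and the options map keyed by subgraph index is an assumption about ConvertFileToGraph's expected layout):

    // dump side: after partitioning, persist the subgraphs for offline tuning
    std::vector<ge::ComputeGraphPtr> tuning_graphs = {sub_graph_0, sub_graph_1};  // from the partitioner
    (void)ge::TuningUtils::ConvertGraphToFile(tuning_graphs, {}, true, "/tmp/tuning");
    // recover side: <index, file path> entries pointing at the dumped files
    std::map<int64_t, std::string> files = {{0, "/tmp/tuning/aicore_subgraph_0.txt"},
                                            {1, "/tmp/tuning/aicore_subgraph_1.txt"}};
    ge::Graph merged("merged");
    (void)ge::TuningUtils::ConvertFileToGraph(files, merged);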
diff --git a/inc/graph/utils/graph_utils.h b/inc/graph/utils/graph_utils.h
index 5f627ea4..fdcbe1a9 100644
--- a/inc/graph/utils/graph_utils.h
+++ b/inc/graph/utils/graph_utils.h
@@ -36,8 +36,8 @@
   do { \
     GraphUtils::DumpGEGraph(compute_graph, name); \
     GraphUtils::DumpGEGraphToOnnx(*compute_graph, name); \
+    uint64_t i = 0; \
     for (const auto &sub_graph_func : compute_graph->GetAllSubgraphs()) { \
-      static int8_t i = 0; \
       auto sub_graph_func_name = std::string(name) + std::string("_sub_graph_") + std::to_string(i++); \
       GraphUtils::DumpGEGraph(sub_graph_func, sub_graph_func_name); \
       GraphUtils::DumpGEGraphToOnnx(*sub_graph_func, sub_graph_func_name); \
@@ -203,10 +203,13 @@ class GraphUtils {

   static bool MatchDumpStr(const std::string &suffix);

-  static void DumpGEGraph(const ge::ComputeGraphPtr &graph, const std::string &suffix, bool is_always_dump = false);
+  static void DumpGEGraph(const ge::ComputeGraphPtr &graph, const std::string &suffix, bool is_always_dump = false,
+                          const std::string &user_graph_name = "");

   static bool LoadGEGraph(const char *file, ge::ComputeGraph &compute_graph);

+  static bool LoadGEGraph(const char *file, ge::ComputeGraphPtr &compute_graph);
+
   static void BreakConnect(const std::map<OperatorImplPtr, NodePtr> &all_nodes_infos);

   static void DumpGEGraphToOnnx(const ge::ComputeGraph &compute_graph, const std::string &suffix);
diff --git a/src/common/graph/CMakeLists.txt b/src/common/graph/CMakeLists.txt
index f041e4b6..4f9e1a00 100755
--- a/src/common/graph/CMakeLists.txt
+++ b/src/common/graph/CMakeLists.txt
@@ -24,6 +24,7 @@ file(GLOB_RECURSE PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
         "../../proto/task.proto"
         "../../proto/fwk_adaper.proto"
         "../../proto/op_mapping_info.proto"
+        "../proto/dump_task.proto"
         )

 file(GLOB_RECURSE ONNX_PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
diff --git a/src/common/graph/compute_graph.cc b/src/common/graph/compute_graph.cc
index 52953fb2..e6c306b0 100644
--- a/src/common/graph/compute_graph.cc
+++ b/src/common/graph/compute_graph.cc
@@ -36,6 +36,7 @@ namespace ge {
 namespace {
 const size_t OUTPUT_PARAM_SIZE = 2;
+const std::string alias_name_attr = "_aliasName";
 bool IsUseBFS() {
   string run_mode;
   const int base = 10;
@@ -133,6 +134,14 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY NodePtr ComputeGraph::FindNode(co
     if (node->GetName() == name) {
       return node;
     }
+    std::vector<string> out_alias_name;
+    if (AttrUtils::GetListStr(node->GetOpDesc(), alias_name_attr, out_alias_name)) {
+      for (const auto &alias_name : out_alias_name) {
+        if (alias_name == name) {
+          return node;
+        }
+      }
+    }
   }
   return nullptr;
 }
@@ -258,6 +267,7 @@ NodePtr ComputeGraph::AddNodeFront(NodePtr node) {
     GELOGE(GRAPH_FAILED, "The node ptr or op desc should not be null.");
     return nullptr;
   }
+  node->SetHostNode(is_valid_flag_);
   node->GetOpDesc()->SetId(nodes_.size());
   if (nodes_.size() > 0 && nodes_[0]->GetType() == DATA) {
     (void)nodes_.insert(nodes_.begin() + 1, node);
@@ -284,6 +294,7 @@ NodePtr ComputeGraph::AddNodeAfter(NodePtr node, const NodePtr &pre_node) {
     GELOGE(GRAPH_FAILED, "The node ptr or op desc should not be null.");
     return nullptr;
   }
+  node->SetHostNode(is_valid_flag_);
   node->GetOpDesc()->SetId(nodes_.size());
   auto node_iter = std::find(nodes_.begin(), nodes_.end(), pre_node);
   if (node_iter != nodes_.end()) {
@@ -313,6 +324,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY NodePtr ComputeGraph::AddNode(Nod
     GELOGE(GRAPH_FAILED, "The node ptr should not be null.");
     return nullptr;
   }
+  node->SetHostNode(is_valid_flag_);
   node->GetOpDesc()->SetId((int64_t)GetDirectNodesSize());
   nodes_.push_back(node);
   return node;
@@ -339,6 +351,7 @@ NodePtr ComputeGraph::AddNode(OpDescPtr op, int64_t id) {  // for unserialize.
   NodePtr node = shared_ptr<Node>(new (std::nothrow) Node(op, shared_from_this()));
   GE_IF_BOOL_EXEC(node == nullptr, GELOGE(GRAPH_FAILED, "node_ptr is NULL!!!"); return nullptr);
   GE_IF_BOOL_EXEC(node->Init() != GRAPH_SUCCESS, GELOGE(GRAPH_FAILED, "node init fail."); return nullptr);
+  node->SetHostNode(is_valid_flag_);
   nodes_.push_back(node);
   return node;
 }
@@ -355,7 +368,9 @@ NodePtr ComputeGraph::AddInputNode(NodePtr node) {
   return node;
 }

-NodePtr ComputeGraph::AddOutputNode(NodePtr node) {
+NodePtr ComputeGraph::AddOutputNode(NodePtr node) { return AddOutputNodeByIndex(node, 0); }
+
+NodePtr ComputeGraph::AddOutputNodeByIndex(NodePtr node, int32_t index) {
   if (node == nullptr || node->GetOpDesc() == nullptr) {
     GELOGE(GRAPH_FAILED, "The node ptr or opdesc should not be null.");
     return nullptr;
@@ -365,7 +380,7 @@ NodePtr ComputeGraph::AddOutputNode(NodePtr node) {
   NodePtr result = node;
   // [output_nodes_info_ : should not be null]
   for (const auto &item : output_nodes_info_) {
-    if (item.first->GetName() == node->GetName()) {
+    if (item.first->GetName() == node->GetName() && item.second == index) {
       already_have = true;
       result = item.first;
       break;
@@ -373,7 +388,8 @@ NodePtr ComputeGraph::AddOutputNode(NodePtr node) {
   }

   if (!already_have) {
-    output_nodes_info_.emplace_back(std::make_pair(node, 0));
+    output_nodes_info_.emplace_back(std::make_pair(node, index));
+    GELOGI("Push back node name:%s, index:%d, into output_nodes_info_.", node->GetName().c_str(), index);
   }

   if (std::find(nodes_.begin(), nodes_.end(), node) == nodes_.end()) {
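FindNode() now falls back to the node's "_aliasName" string-list attribute, so a node renamed during optimization can still be looked up under its original name. A minimal sketch (the attribute key is the constant above; node names are illustrative):

    // a pass renames conv1 but records the old name as an alias
    (void)ge::AttrUtils::SetListStr(node->GetOpDesc(), "_aliasName", {"conv1"});
    // later lookups by the original name still resolve to the renamed node
    ge::NodePtr found = compute_graph->FindNode("conv1");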
diff --git a/src/common/graph/debug/ge_op_types.h b/src/common/graph/debug/ge_op_types.h
index f11ef31e..dff87331 100644
--- a/src/common/graph/debug/ge_op_types.h
+++ b/src/common/graph/debug/ge_op_types.h
@@ -32,6 +32,8 @@ GE_REGISTER_OPTYPE(STATELESSWHILE, "StatelessWhile");
 GE_REGISTER_OPTYPE(SQUEEZE, "Squeeze");
 GE_REGISTER_OPTYPE(EXPANDDIMS, "ExpandDims");
 GE_REGISTER_OPTYPE(SWITCH, "Switch");
+GE_REGISTER_OPTYPE(REFSWITCH, "RefSwitch");
+GE_REGISTER_OPTYPE(SWITCHN, "SwitchN");
 GE_REGISTER_OPTYPE(MERGE, "Merge");
 GE_REGISTER_OPTYPE(STREAMMERGE, "StreamMerge");
 GE_REGISTER_OPTYPE(ENTER, "Enter");
@@ -40,6 +42,7 @@ GE_REGISTER_OPTYPE(NEXTITERATION, "NextIteration");
 GE_REGISTER_OPTYPE(REFNEXTITERATION, "RefNextIteration");
 GE_REGISTER_OPTYPE(CONSTANT, "Const");
 GE_REGISTER_OPTYPE(PLACEHOLDER, "PlaceHolder");
+GE_REGISTER_OPTYPE(END, "End");
 GE_REGISTER_OPTYPE(FRAMEWORKOP, "FrameworkOp");
 GE_REGISTER_OPTYPE(GETNEXT, "GetNext");
 GE_REGISTER_OPTYPE(INITDATA, "InitData");
diff --git a/src/common/graph/format_refiner.cc b/src/common/graph/format_refiner.cc
index 9cb76539..4cb41349 100644
--- a/src/common/graph/format_refiner.cc
+++ b/src/common/graph/format_refiner.cc
@@ -43,7 +43,7 @@ namespace ge {
 namespace {
 const std::unordered_set<string> kChangeDimNodes = {PERMUTE, EXPANDDIMS, SQUEEZE};
 const string kIsGraphInferred = "_is_graph_inferred";
-RefRelations reflection_builder;
+thread_local RefRelations reflection_builder;
 }  // namespace

 graphStatus ReflectionProcess(const std::unordered_set<RefCell, RefCellHash> &reflection,
diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc
index fde03a43..708347a7 100644
--- a/src/common/graph/ge_attr_define.cc
+++ b/src/common/graph/ge_attr_define.cc
@@ -967,6 +967,13 @@ const std::string ATTR_NAME_SWITCH_FOR_L2_FUSION = "_enable_l2_fusion";
 const std::string ATTR_NAME_OP_INPUT_L1_FLAG = "_op_input_l1_flag";
 const std::string ATTR_NAME_OP_INPUT_L1_ADDR = "_op_input_l1_addr";
 const std::string ATTR_NAME_OP_INPUT_L1_VALID_SIZE = "_op_input_l1_valid_size";
+const std::string ATTR_NAME_ENGINE_NAME_FOR_LX = "_lxfusion_engine_name";
+const std::string ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX = "_lxfusion_op_kernel_lib_name";
+const std::string ATTR_NAME_NEED_LX_FUSION = "_lx_fusion";
+const std::string ATTR_NAME_OPTIMIZE_GROUP = "_optimize_group";
+const std::string ATTR_NAME_OP_COMPILE_STRATEGY = "_op_compile_strategy";
+const std::string ATTR_NAME_TBE_KERNEL_NAME = "_tbe_kernel_name";
+const std::string ATTR_NAME_TBE_KERNEL_BUFFER = "_tbe_kernel_buffer";

 // Op debug attrs
 const std::string ATTR_OP_DEBUG_FLAG = "_op_debug_flag";
diff --git a/src/common/graph/graph.mk b/src/common/graph/graph.mk
index b007dac8..9e9ffa3a 100644
--- a/src/common/graph/graph.mk
+++ b/src/common/graph/graph.mk
@@ -8,6 +8,7 @@ COMMON_LOCAL_SRC_FILES := \
     ./proto/task.proto \
     ./proto/fwk_adapter.proto \
     ./proto/op_mapping_info.proto \
+    ./proto/dump_task.proto \
     ./anchor.cc \
     ./ge_attr_value.cc \
     ./attr_value.cc \
@@ -29,6 +30,7 @@ COMMON_LOCAL_SRC_FILES := \
     ./ge_tensor.cc \
    ./detail/attributes_holder.cc \
     ./utils/anchor_utils.cc \
+    ./utils/tuning_utils.cc \
     ./utils/graph_utils.cc \
     ./utils/ge_ir_utils.cc \
     ./utils/node_utils.cc \
@@ -51,6 +53,7 @@ COMMON_LOCAL_C_INCLUDES := \
     proto/task.proto \
     proto/fwk_adapter.proto \
     proto/op_mapping_info.proto \
+    proto/dump_task.proto \
     inc \
     inc/external \
     inc/external/graph \
diff --git a/src/common/graph/model_serialize.cc b/src/common/graph/model_serialize.cc
index 673bb31b..16855fc5 100644
--- a/src/common/graph/model_serialize.cc
+++ b/src/common/graph/model_serialize.cc
@@ -195,9 +195,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool ModelSerializeImp::Serialize
     }
   }
   // Outputs
-  for (const auto &output : graph->GetOutputNodes()) {
-    if (output != nullptr) {
-      graph_proto->add_output(output->GetName() + ":0");
+  for (const auto &output : graph->GetGraphOutNodesInfo()) {
+    if (output.first != nullptr) {
+      graph_proto->add_output(output.first->GetName() + ":" + std::to_string(output.second));
+      GELOGI("Add output to graph proto, node name:%s, index:%d", output.first->GetName().c_str(), output.second);
     }
   }
   if (graph->attrs_.GetProtoMsg() != nullptr) {
@@ -440,7 +441,8 @@ bool ModelSerializeImp::HandleNodeNameRef() {
       }
       GE_IF_BOOL_EXEC(item.graph == nullptr, continue);
-      auto ret = item.graph->AddOutputNode(node_it->second);
+      auto ret = item.graph->AddOutputNodeByIndex(node_it->second, item.index);
+      GELOGI("node name:%s, item.index:%d", node_it->second->GetName().c_str(), item.index);
       if (ret == nullptr) {
         GELOGE(GRAPH_FAILED, "AddOutputNode failed.");
         return false;
diff --git a/src/common/graph/op_desc.cc b/src/common/graph/op_desc.cc
index 0b22eb83..706ec9cd 100644
--- a/src/common/graph/op_desc.cc
+++ b/src/common/graph/op_desc.cc
@@ -219,6 +219,10 @@ graphStatus OpDesc::AddInputDesc(const string &name, const ge::GeTensorDesc &inp
     }
     inputs_desc_.push_back(in_desc);
     (void)input_name_idx_.insert(make_pair(name, index));
+    if (find(register_input_name_.begin(), register_input_name_.end(), name) == register_input_name_.end()) {
+      register_input_name_.push_back(name);
+    }
+
     return GRAPH_SUCCESS;
   }
 }
@@ -255,6 +259,38 @@ graphStatus OpDesc::AddInputDescMiddle(const string &name, const unsigned int nu
   return GRAPH_SUCCESS;
 }

+graphStatus OpDesc::AddOutputDescMiddle(const string &name, const unsigned int num, size_t index) {
+  for (unsigned int i = 0; i < num; i++) {
+    string output_name = name + std::to_string(i);
+    GE_CHK_BOOL_RET_STATUS((output_name_idx_.find(output_name) == output_name_idx_.end()), GRAPH_FAILED,
+                           "Add output tensor_desc failed, name[%s] already exists.", output_name.c_str());
+
+    std::shared_ptr<GeTensorDesc> out_desc = ComGraphMakeShared<GeTensorDesc>(GeTensorDesc());
+    if (out_desc == nullptr) {
+      GELOGE(GRAPH_FAILED, "AddOutputDescMiddle failed, malloc shared_ptr failed.");
+      return GRAPH_FAILED;
+    }
+
+    if (index > outputs_desc_.size()) {
+      GELOGE(GRAPH_FAILED, "AddOutputDescMiddle failed, insert index should not be more than outputs size.");
+      return GRAPH_FAILED;
+    }
+
+    (void)outputs_desc_.insert(outputs_desc_.begin() + index + i, out_desc);
+
+    // Update index in output_name_idx
+    for (auto it = output_name_idx_.begin(); it != output_name_idx_.end(); ++it) {
+      if (it->second >= (index + i)) {
+        it->second += 1;
+      }
+    }
+
+    (void)output_name_idx_.insert(make_pair(output_name, i + index));
+  }
+
+  return GRAPH_SUCCESS;
+}
+
 graphStatus OpDesc::AddInputDescForward(const string &name, const unsigned int num) {
   for (unsigned int i = 0; i < num; i++) {
     string input_name = name + std::to_string(i);
@@ -550,6 +586,9 @@ graphStatus OpDesc::AddOutputDesc(const string &name, const ge::GeTensorDesc &ou
   }
   outputs_desc_.push_back(tensor);
   (void)output_name_idx_.insert(make_pair(name, index));
+  if (find(register_output_name_.begin(), register_output_name_.end(), name) == register_output_name_.end()) {
+    register_output_name_.push_back(name);
+  }
   return GRAPH_SUCCESS;
 }
@@ -655,6 +694,16 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr OpDesc::GetI
   return inputs_desc_[it->second];
 }

+graphStatus OpDesc::AddRegisterInputName(const std::string &name) {
+  if (find(register_input_name_.begin(), register_input_name_.end(), name) == register_input_name_.end()) {
+    register_input_name_.push_back(name);
+  }
+
+  return GRAPH_SUCCESS;
+}
+
+vector<string> OpDesc::GetRegisterInputName() const { return register_input_name_; }
+
 graphStatus OpDesc::AddDynamicInputDesc(const string &name, const unsigned int num, bool is_push_back) {
   if (is_push_back) {
     for (unsigned int i = 0; i < num; i++) {
@@ -663,6 +712,10 @@ graphStatus OpDesc::AddDynamicInputDesc(const string &name, const unsigned int n
   } else {
     if (AddInputDescForward(name, num) != GRAPH_SUCCESS) return GRAPH_FAILED;
   }
+  if (AddRegisterInputName(name) != GRAPH_SUCCESS) {
+    return GRAPH_FAILED;
+  }
+
   return GRAPH_SUCCESS;
 }

@@ -673,6 +726,16 @@ graphStatus OpDesc::AddDynamicInputDescByIndex(const string &name, const unsigne
   return GRAPH_SUCCESS;
 }

+graphStatus OpDesc::AddRegisterOutputName(const string &name) {
+  if (find(register_output_name_.begin(), register_output_name_.end(), name) == register_output_name_.end()) {
+    register_output_name_.push_back(name);
+  }
+
+  return GRAPH_SUCCESS;
+}
+
+vector<string> OpDesc::GetRegisterOutputName() const { return register_output_name_; }
+
 graphStatus OpDesc::AddDynamicOutputDesc(const string &name, const unsigned int num, bool is_push_back) {
   if (is_push_back) {
     for (unsigned int i = 0; i < num; i++) {
@@ -681,6 +744,10 @@ graphStatus OpDesc::AddDynamicOutputDesc(const string &name, const unsigned int
   } else {
     if (AddOutputDescForward(name, num) != GRAPH_SUCCESS) return GRAPH_FAILED;
   }
+
+  if (AddRegisterOutputName(name) != GRAPH_SUCCESS) {
+    return GRAPH_FAILED;
+  }
   return GRAPH_SUCCESS;
 }
diff --git a/src/common/graph/opsproto/opsproto_manager.cc b/src/common/graph/opsproto/opsproto_manager.cc
index 4c8c1be5..d482715b 100644
--- a/src/common/graph/opsproto/opsproto_manager.cc
+++ b/src/common/graph/opsproto/opsproto_manager.cc
@@ -31,6 +31,13 @@ OpsProtoManager *OpsProtoManager::Instance() {
 }

 bool OpsProtoManager::Initialize(const std::map<std::string, std::string> &options) {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  if (is_init_) {
+    GELOGI("OpsProtoManager is already initialized.");
+    return true;
+  }
+
   /*lint -e1561*/
   auto proto_iter = options.find("ge.opsProtoLibPath");
   /*lint +e1561*/
@@ -42,10 +49,19 @@ bool OpsProtoManager::Initialize(const std::map<std::string, std::string> &optio

   pluginPath_ = proto_iter->second;
   LoadOpsProtoPluginSo(pluginPath_);

+  is_init_ = true;
+
   return true;
 }

 void OpsProtoManager::Finalize() {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  if (!is_init_) {
+    GELOGI("OpsProtoManager is not initialized.");
+    return;
+  }
+
   for (auto handle : handles_) {
     if (handle != nullptr) {
       if (dlclose(handle) != 0) {
@@ -57,6 +73,8 @@ void OpsProtoManager::Finalize() {
       GELOGW("close opsprotomanager handler failure, handler is nullptr");
     }
   }
+
+  is_init_ = false;
 }

 static std::vector<std::string> Split(const std::string &str, char delim) {
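Initialize()/Finalize() are now mutex-guarded and idempotent, so repeated calls from multiple sessions are safe; usage is unchanged (a sketch; the plugin path value is illustrative):

    std::map<std::string, std::string> options;
    options["ge.opsProtoLibPath"] = "/usr/local/Ascend/opp/op_proto/";  // illustrative path
    ge::OpsProtoManager *manager = ge::OpsProtoManager::Instance();
    (void)manager->Initialize(options);  // a second call now just logs and returns true
    // ... use the loaded op protos ...
    manager->Finalize();  // no-op if never initialized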
diff --git a/src/common/graph/shape_refiner.cc b/src/common/graph/shape_refiner.cc
index 35c109af..a87e3753 100644
--- a/src/common/graph/shape_refiner.cc
+++ b/src/common/graph/shape_refiner.cc
@@ -601,7 +601,7 @@ InferenceContextPtr CreateInferenceContext(const std::unordered_map<NodePtr, In
-std::unordered_map<NodePtr, InferenceContextPtr> context_map;
+thread_local std::unordered_map<NodePtr, InferenceContextPtr> context_map;
 }

 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ShapeRefiner::ClearContextMap() { context_map.clear(); }
@@ -645,6 +645,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ShapeRefiner::InferSh
   graphStatus status = InferShapeAndType(node, op, before_subgraph);
   if (status == GRAPH_PARAM_INVALID || status == GRAPH_SUCCESS) {
     if (is_unknown_graph) {
+      PrintInOutTensorShape(node, "after_infershape when running");
       return GRAPH_SUCCESS;
     }
     auto op_desc = node->GetOpDesc();
diff --git a/src/common/graph/utils/graph_utils.cc b/src/common/graph/utils/graph_utils.cc
index 19c28c63..e2f9f857 100644
--- a/src/common/graph/utils/graph_utils.cc
+++ b/src/common/graph/utils/graph_utils.cc
@@ -29,6 +29,7 @@
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
+#include <atomic>

 #include "./ge_context.h"
 #include "debug/ge_util.h"
@@ -57,6 +58,7 @@ namespace {
 const int32_t kBaseOfIntegerValue = 10;
 #ifdef FMK_SUPPORT_DUMP
 const char *const kDumpGeGraph = "DUMP_GE_GRAPH";
+const int kDumpGraphIndexWidth = 5;
 #endif
 const char *const kDumpGraphLevel = "DUMP_GRAPH_LEVEL";
 const char *const kDumpStrBuild = "Build";
@@ -431,10 +433,15 @@ GraphUtils::InsertNodeAfter(const OutDataAnchorPtr &src, const std::vector<InDa
   OutControlAnchorPtr src_out_ctrl_anchor = src_node->GetOutControlAnchor();
   GE_CHECK_NOTNULL(src_out_ctrl_anchor);
+  bool ctrl_edge_flag = true;
+  std::string type = NodeUtils::GetNodeType(src->GetOwnerNode());
+  if ((type == SWITCH) || (type == REFSWITCH) || (type == SWITCHN)) {
+    ctrl_edge_flag = false;
+  }
+
   for (auto &dst : dsts) {
     GE_CHECK_NOTNULL(dst);
     NodePtr dst_node = dst->GetOwnerNode();
-    GE_CHECK_NOTNULL(dst_node);
     GELOGI("Insert node %s between %s->%s.", insert_node->GetName().c_str(), src_node->GetName().c_str(),
            dst_node->GetName().c_str());
     if (src_node->GetOwnerComputeGraph() != dst_node->GetOwnerComputeGraph()) {
@@ -450,11 +457,12 @@ GraphUtils::InsertNodeAfter(const OutDataAnchorPtr &src, const std::vector<InDa

-  OutControlAnchorPtr new_out_ctrl_anchor = insert_node->GetOutControlAnchor();
-  GE_CHECK_NOTNULL(new_out_ctrl_anchor);
+  if (!ctrl_edge_flag) {
+    continue;
+  }
   for (const InControlAnchorPtr &peer_in_ctrl_anchor : src_out_ctrl_anchor->GetPeerInControlAnchors()) {
     if ((RemoveEdge(src_out_ctrl_anchor, peer_in_ctrl_anchor) != GRAPH_SUCCESS) ||
-        (AddEdge(new_out_ctrl_anchor, peer_in_ctrl_anchor) != GRAPH_SUCCESS)) {
+        (AddEdge(insert_node->GetOutControlAnchor(), peer_in_ctrl_anchor) != GRAPH_SUCCESS)) {
       GELOGE(GRAPH_FAILED, "ReplaceEdge from %s->%s to %s->%s failed.", src_node->GetName().c_str(),
              peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str(), insert_node->GetName().c_str(),
              peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str());
@@ -552,7 +560,8 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::MatchDumpStr(con

 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(const ge::ComputeGraphPtr &graph,
                                                                             const std::string &suffix,
-                                                                            bool is_always_dump) {
+                                                                            bool is_always_dump,
+                                                                            const std::string &user_graph_name) {
 #ifdef FMK_SUPPORT_DUMP
   char *dump_ge_graph = std::getenv(kDumpGeGraph);
   GE_IF_BOOL_EXEC(dump_ge_graph == nullptr && !is_always_dump, return;);
@@ -563,32 +572,33 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(cons
   }

   // file name
-  static int file_idx = 0;
-  const int dump_graph_index_width = 5;
-  file_idx++;
-  GELOGD("Start to dump om txt: %d", file_idx);
+  static std::atomic_long atomic_file_index(0);
+  auto file_index = atomic_file_index.fetch_add(1);
+  GELOGD("Start to dump om txt: %ld", file_index);

-  static int max_dumpfile_num = 0;
-  if (max_dumpfile_num == 0) {
+  thread_local long max_dump_file_num = 0;
+  if (max_dump_file_num == 0) {
     string opt = "0";
     (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_NUM, opt);
-    max_dumpfile_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
+    max_dump_file_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
   }
-  if (max_dumpfile_num != 0 && file_idx > max_dumpfile_num) {
-    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileCnt=%d.", max_dumpfile_num);
+  if (max_dump_file_num != 0 && file_index > max_dump_file_num) {
+    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileCnt=%ld.", max_dump_file_num);
     return;
   }

   std::stringstream stream_file_name;
-  stream_file_name << "ge_proto_" << std::setw(dump_graph_index_width) << std::setfill('0') << file_idx;
+  stream_file_name << "ge_proto_" << std::setw(kDumpGraphIndexWidth) << std::setfill('0') << file_index;
   stream_file_name << "_" << suffix << ".txt";
-  std::string proto_file = stream_file_name.str();
+  std::string proto_file = user_graph_name.empty() ? stream_file_name.str() : user_graph_name;

   // Create buffer
   ge::Model model("", "");
   model.SetGraph(GraphUtils::CreateGraphFromComputeGraph(std::const_pointer_cast<ComputeGraph>(graph)));
   Buffer buffer;
-  model.Save(buffer, true);
+  const int64_t kDumpLevel =
+      (dump_ge_graph != nullptr) ? std::strtol(dump_ge_graph, nullptr, kBaseOfIntegerValue) : ge::OnnxUtils::NO_DUMP;
+  model.Save(buffer, kDumpLevel != ge::OnnxUtils::DUMP_ALL);

   // Write file
   ge::proto::ModelDef ge_proto;
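The dump counters are now per-process atomics and the limits thread-local, but enabling dumps is unchanged: it is still driven by environment variables and GE options (a sketch; values illustrative):

    #include <cstdlib>
    // enable textual graph dumps before running the build
    setenv("DUMP_GE_GRAPH", "2", 1);     // dump level consumed by GraphUtils::DumpGEGraph()
    setenv("DUMP_GRAPH_LEVEL", "1", 1);  // coarse filter consumed by GraphUtils::MatchDumpStr()
    // file-count/size caps come from the GE options OPTION_GE_MAX_DUMP_FILE_NUM / OPTION_GE_MAX_DUMP_FILE_SIZE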
@@ -631,6 +641,35 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::LoadGEGraph(cons
   }
 }

+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::LoadGEGraph(const char *file,
+                                                                            ge::ComputeGraphPtr &compute_graph) {
+  ge::proto::ModelDef model_def;
+  // Get ModelDef object from file generated by DumpGEGraph()
+  if (!ReadProtoFromTextFile(file, &model_def)) {
+    GELOGE(GRAPH_FAILED, "Get ModelDef failed from file");
+    return false;
+  }
+  ge::Model model;
+  // Get Model object from ModelDef by deserializing ModelDef
+  if (model.Load(model_def) == GRAPH_SUCCESS) {
+    GE_CHK_BOOL_EXEC(GraphUtils::GetComputeGraph(model.GetGraph()) != nullptr, return false,
+                     "Get compute graph is nullptr");
+    compute_graph = GraphUtils::GetComputeGraph(model.GetGraph());
+    for (const auto &node : compute_graph->GetDirectNode()) {
+      GE_CHECK_NOTNULL(node);
+      GELOGI("Node %s set owner graph", node->GetName().c_str());
+      if (node->SetOwnerComputeGraph(compute_graph) != GRAPH_SUCCESS) {
+        GELOGE(GRAPH_FAILED, "Node %s set owner graph failed", node->GetName().c_str());
+        return false;
+      }
+    }
+    return true;
+  } else {
+    GELOGE(GRAPH_FAILED, "Get Model failed from ModelDef");
+    return false;
+  }
+}
+
 // Printing protocol messages in text format is useful for debugging and human editing of messages.
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::WriteProtoToTextFile(
     const google::protobuf::Message &proto, const char *real_path) {
@@ -666,16 +705,16 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::WriteProtoToText
     return;
   }
   if (fseek(file, 0L, SEEK_END) == 0) {
-    int64_t fileSize = ftell(file);
-    static int64_t maxDumpFileSize = 0;
-    if (maxDumpFileSize == 0) {
+    long fileSize = ftell(file);
+    thread_local long max_dump_file_size = 0;
+    if (max_dump_file_size == 0) {
       string opt = "0";
       // Can not check return value
       (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_SIZE, opt);
-      maxDumpFileSize = atol(opt.c_str());
+      max_dump_file_size = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
     }
-    if (maxDumpFileSize != 0 && fileSize != -1 && fileSize > maxDumpFileSize) {
-      GELOGW("dump graph file size > maxDumpFileSize, maxDumpFileSize=%ld.", maxDumpFileSize);
+    if (max_dump_file_size != 0 && fileSize != -1 && fileSize > max_dump_file_size) {
+      GELOGW("dump graph file size > maxDumpFileSize, maxDumpFileSize=%ld.", max_dump_file_size);
       GE_IF_BOOL_EXEC(std::remove(real_path) != 0, GELOGW("remove %s failed", real_path));
       GE_CHK_BOOL_EXEC(fclose(file) == 0, return, "Fclose %s failed", real_path);
       return;
@@ -734,25 +773,23 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraphToOnn
   }

   // 2.Set file name
-  static int file_index = 0;
-  file_index++;
-  GELOGD("Start to dump ge onnx file: %d", file_index);
+  static std::atomic_long atomic_file_index(0);
+  auto file_index = atomic_file_index.fetch_add(1);
+  GELOGD("Start to dump ge onnx file: %ld", file_index);

-  static int max_dumpfile_num = 0;
-  if (max_dumpfile_num == 0) {
+  thread_local long max_dump_file_num = 0;
+  if (max_dump_file_num == 0) {
     string opt = "0";
     (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_NUM, opt);
-    max_dumpfile_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
+    max_dump_file_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
   }
-  if (max_dumpfile_num != 0 && file_index > max_dumpfile_num) {
-    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileNum=%d.", max_dumpfile_num);
+  if (max_dump_file_num != 0 && file_index > max_dump_file_num) {
+    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileNum=%ld.", max_dump_file_num);
     return;
   }

-  /// 99999 graphs can be dumped at most at one time
-  /// setw(5) is for formatted sort
   std::stringstream stream_file_name;
-  stream_file_name << "ge_onnx_" << std::setw(5) << std::setfill('0') << file_index;
+  stream_file_name << "ge_onnx_" << std::setw(kDumpGraphIndexWidth) << std::setfill('0') << file_index;
   stream_file_name << "_graph_" << compute_graph.GetGraphID();
   stream_file_name << "_" << suffix << ".pbtxt";
   std::string proto_file = stream_file_name.str();
@@ -1363,6 +1400,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraphPtr GraphUtils::FindR
 /// Make a copy of ComputeGraph.
 /// @param graph: original graph.
 /// @param prefix: node name prefix of new graph.
+/// @param output_nodes: output nodes of new graph.
 /// @return ComputeGraphPtr
 ///
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraphPtr
@@ -1399,6 +1437,14 @@ GraphUtils::CloneGraph(const ComputeGraphPtr &graph, const std::string &prefix,
     }
   }

+  std::string session_graph_id;
+  if (AttrUtils::GetStr(*graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) {
+    bool ret = AttrUtils::SetStr(*new_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id);
+    if (!ret) {
+      GELOGE(GRAPH_FAILED, "Set attr ATTR_NAME_SESSION_GRAPH_ID failed.");
+      return nullptr;
+    }
+  }
   return new_graph;
 }
diff --git a/src/common/graph/utils/op_desc_utils.cc b/src/common/graph/utils/op_desc_utils.cc
index 92883877..e0579581 100644
--- a/src/common/graph/utils/op_desc_utils.cc
+++ b/src/common/graph/utils/op_desc_utils.cc
@@ -479,6 +479,19 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector<GeTensorPtr> OpDescUtils::
     return ret;
   }

+  if (node.GetType() == DATA) {
+    auto parent = NodeUtils::GetParentInput(node);
+    if ((parent != nullptr) && NodeUtils::IsConst(*parent)) {
+      auto weight = MutableWeights(parent->GetOpDesc());
+      if (weight == nullptr) {
+        GELOGI("const op has no weight, op name:%s", parent->GetName().c_str());
+        return ret;
+      }
+      ret.push_back(weight);
+    }
+    return ret;
+  }
+
   // Other operators, get weights from connected constop
   auto input_nodes = GetConstInputs(node);
   for (const auto &input_node : input_nodes) {
@@ -560,11 +573,9 @@ OpDescPtr OpDescUtils::CreateConstOp(const GeTensorPtr &tensor_ptr) {

   const_opdesc->SetType(CONSTANT);

-  static int const_count = 0;
-  const_opdesc->SetName("dynamic_const_" + std::to_string(const_count));
-
+  thread_local int64_t const_count = 0;
+  const_opdesc->SetName("dynamic_const_" + std::to_string(GetTid()) + "_" + std::to_string(const_count));
   GELOGI("add const op: %s", const_opdesc->GetName().c_str());
-  ++const_count;

   (void)const_opdesc->AddOutputDesc(tensor_ptr->GetTensorDesc());
diff --git a/src/common/graph/utils/tuning_utils.cc b/src/common/graph/utils/tuning_utils.cc
new file mode 100644
index 00000000..0f07a197
--- /dev/null
+++ b/src/common/graph/utils/tuning_utils.cc
@@ -0,0 +1,684 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/tuning_utils.h"
+#include "../debug/ge_util.h"
+#include "../debug/ge_op_types.h"
+
+namespace ge {
+const std::string peer_node_name_attr = "_peerNodeName";
+const std::string parent_node_name_attr = "_parentNodeName";
+const std::string alias_name_attr = "_aliasName";
+const std::string parent_node_attr = "parentNode";
+const std::string parent_node_anchor_index_attr = "_parentNodeAnchorIndex";
+const std::string tuning_subgraph_prefix = "/aicore_subgraph_";
+const std::string non_tuning_subgraph_prefix = "/subgraph_";
+const std::set<std::string> kPartitionOpTypes = {PLACEHOLDER, END};
+const std::set<std::string> kExeTypes = {DATA, NETOUTPUT};
+NodeNametoNodeNameMap TuningUtils::data_2_netoutput_;
+NodetoNodeNameMap TuningUtils::data_node_2_netoutput_;
+NodetoNodeMap TuningUtils::data_node_2_netoutput_node_;
+NodeSet TuningUtils::netoutput_nodes_;
+NodeSet TuningUtils::merged_graph_nodes_;
+SubgraphCreateOutNode TuningUtils::create_output_;
+std::mutex TuningUtils::mutex_;
+
+std::string TuningUtils::PrintCheckLog() {
+  std::stringstream ss;
+  ss << "d2n:{";
+  for (const auto &pair : data_2_netoutput_) {
+    ss << "data:" << pair.first << "-"
+       << "netoutput:" << pair.second;
+    ss << " | ";
+  }
+  ss << "}";
+  ss << "netoutputs:{";
+  for (const auto &node : netoutput_nodes_) {
+    ss << "netoutput:" << node->GetName();
+    ss << " | ";
+  }
+  ss << "}";
+  return ss.str();
+}
+
+std::string TuningUtils::GetNodeNameByAnchor(const Anchor *anchor) {
+  if (anchor == nullptr) {
+    GELOGE(GRAPH_FAILED, "Anchor is nullptr");
+    return "Null";
+  }
+  auto node = anchor->GetOwnerNode();
+  return node == nullptr ? "Null" : node->GetName();
+}
"Null" : node->GetName(); +} + +// part 1 +graphStatus TuningUtils::ConvertGraphToFile(std::vector tuning_subgraphs, + std::vector non_tuning_subgraphs, bool exe_flag, + const std::string &path, const std::string &user_path) { + int64_t i = 0; + int64_t j = 0; + std::lock_guard lock(mutex_); + for (auto &subgraph : tuning_subgraphs) { + create_output_.emplace(subgraph, nullptr); + auto help_info = HelpInfo{i, exe_flag, true, path, user_path}; + if (MakeExeGraph(subgraph, help_info) != SUCCESS) { + GELOGE(GRAPH_FAILED, "TUU:subgraph %zu generate exe graph failed", i); + return GRAPH_FAILED; + } + i++; + } + + for (auto &subgraph : non_tuning_subgraphs) { + create_output_.emplace(subgraph, nullptr); + auto help_info = HelpInfo{j, true, false, path, user_path}; + if (MakeExeGraph(subgraph, help_info) != SUCCESS) { + GELOGE(GRAPH_FAILED, "TUU:non tuning_subgraph %zu generate exe graph failed", j); + return GRAPH_FAILED; + } + j++; + } + create_output_.clear(); + return SUCCESS; +} + +// +---------------+ +// | pld pld | +// | \ / | +// | relu relu | +// | \ / | +// | add | +// | | | +// | end | +// +---------------+ +// | +// | +// V +// +---------------+ +// | data data | +// | \ / | +// | relu relu | +// | \ / | +// | add | +// | | | +// | netoutput | +// +---------------+ +graphStatus TuningUtils::MakeExeGraph(ComputeGraphPtr &exe_graph, const HelpInfo &help_info) { + GE_CHECK_NOTNULL(exe_graph); + // if not make exe, just dump and return + if (!help_info.exe_flag) { + DumpGraphToPath(exe_graph, help_info.index, help_info.is_tuning_graph, help_info.path); + GELOGI("TUU:just return, dump original sub_graph[%s]index[%d]", exe_graph->GetName().c_str(), help_info.index); + return SUCCESS; + } + // modify sub graph + for (NodePtr &node : exe_graph->GetDirectNode()) { + // 1.handle pld + if (node->GetType() == PLACEHOLDER) { + if (HandlePld(node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), + exe_graph->GetName().c_str()); + return FAILED; + } + } + // 2.handle end + if (node->GetType() == END) { + if (HandleEnd(node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), + exe_graph->GetName().c_str()); + return FAILED; + } + } + } + graphStatus ret = exe_graph->TopologicalSorting(); + if (ret != SUCCESS) { + GELOGE(ret, "Graph[%s] topological sort failed, ret:%d.", exe_graph->GetName().c_str(), ret); + return ret; + } + // dump subgraphs which modified by us + if (help_info.user_path.empty()) { + DumpGraphToPath(exe_graph, help_info.index, help_info.is_tuning_graph, help_info.path); + } else { + GraphUtils::DumpGEGraph(exe_graph, "", true, help_info.user_path); + } + return SUCCESS; +} + +void TuningUtils::DumpGraphToPath(ComputeGraphPtr &exe_graph, int64_t index, bool is_tuning_graph, std::string path) { + if (!path.empty()) { + if (is_tuning_graph) { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } else { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + non_tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } + } else { + path = "./"; + if (is_tuning_graph) { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } else { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + non_tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } + } +} + +graphStatus TuningUtils::CreateDataNode(NodePtr &node, NodePtr &data_node) { + auto graph = 
+
+graphStatus TuningUtils::CreateDataNode(NodePtr &node, NodePtr &data_node) {
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  auto data_op_desc = ComGraphMakeShared<OpDesc>(node->GetName(), DATA);
+  GE_CHECK_NOTNULL(data_op_desc);
+  auto pld_op_desc = node->GetOpDesc();
+  GE_CHECK_NOTNULL(pld_op_desc);
+  auto output_desc = pld_op_desc->GetOutputDesc(0);  // only one output for pld and data
+  // data inputdesc & outputdesc set as same
+  if (data_op_desc->AddInputDesc(output_desc) != SUCCESS) {
+    GELOGE(FAILED, "TUU:data node %s AddInputDesc failed", data_op_desc->GetName().c_str());
+    return FAILED;
+  }
+  if (data_op_desc->AddOutputDesc(output_desc) != SUCCESS) {
+    GELOGE(FAILED, "TUU:data node %s AddOutputDesc failed", data_op_desc->GetName().c_str());
+    return FAILED;
+  }
+  data_node = graph->AddNode(data_op_desc);
+  GE_CHECK_NOTNULL(data_node);
+  if (data_node->SetOwnerComputeGraph(graph) != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:SetOwnerComputeGraph failed");
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::AddAttrToDataNodeForMergeGraph(const NodePtr &pld, NodePtr &data_node) {
+  auto op_desc = data_node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+
+  auto pld_desc = pld->GetOpDesc();
+  GE_CHECK_NOTNULL(pld_desc);
+  // inherit
+  // a. set `end's input node type` as attr
+  std::string parent_op_type;
+  if (!AttrUtils::GetStr(pld_desc, "parentOpType", parent_op_type)) {
+    GELOGE(FAILED, "TUU:pld %s get parentOpType failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetStr(op_desc, "parentOpType", parent_op_type);
+  // b. set `end's input node name` as attr
+  std::string parent_op_name;
+  if (!AttrUtils::GetStr(pld_desc, parent_node_name_attr, parent_op_name)) {
+    GELOGE(FAILED, "TUU:pld %s get _parentNodeName failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetStr(op_desc, parent_node_name_attr, parent_op_name);
+  // c. set `end's input node's out anchor index` as attr
+  int parent_node_anchor_index;
+  if (!AttrUtils::GetInt(pld_desc, "anchorIndex", parent_node_anchor_index)) {
+    GELOGE(FAILED, "TUU:pld %s get anchorIndex failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetInt(op_desc, parent_node_anchor_index_attr, parent_node_anchor_index);
+  GELOGD("TUU:from node %s(%s) to add attr to node %s(%s) success", pld->GetName().c_str(), pld->GetType().c_str(),
+         data_node->GetName().c_str(), data_node->GetType().c_str());
+  // d. set `end node name` as attr
+  std::string peer_end_name;
+  if (!AttrUtils::GetStr(pld_desc, peer_node_name_attr, peer_end_name)) {
+    GELOGE(FAILED, "TUU:pld %s get _peerNodeName failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetStr(op_desc, peer_node_name_attr, peer_end_name);
+  GELOGD("TUU:from node %s(%s) to add attr to node %s(%s) success", pld->GetName().c_str(), pld->GetType().c_str(),
+         data_node->GetName().c_str(), data_node->GetType().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::ChangePld2Data(NodePtr &node, NodePtr &data_node) {
+  auto type_pld = node->GetType();
+  auto type_data = data_node->GetType();
+  if (type_pld != PLACEHOLDER || type_data != DATA) {
+    GELOGE(FAILED, "TUU:Failed to change node %s from type %s to type %s", node->GetName().c_str(), type_pld.c_str(),
+           type_data.c_str());
+    return FAILED;
+  }
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  std::vector<int> output_map(node->GetAllOutDataAnchorsSize());
+  for (size_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) {
+    output_map[i] = static_cast<int>(i);
+  }
+
+  auto ret = GraphUtils::ReplaceNodeAnchors(data_node, node, {}, output_map);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to replace node %s by node %s error node %u", node->GetName().c_str(),
+           data_node->GetName().c_str(), ret);
+    return FAILED;
+  }
+
+  NodeUtils::UnlinkAll(*node);
+
+  ret = GraphUtils::RemoveNodeWithoutRelink(graph, node);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to remove node %s from graph", node->GetName().c_str());
+    return FAILED;
+  }
+
+  GELOGD("TUU:Remove node %s(%s) by the ChangePld2Data process, replace it with node %s(%s)", node->GetName().c_str(),
+         node->GetType().c_str(), data_node->GetName().c_str(), data_node->GetType().c_str());
+  return ret;
+}
+
+graphStatus TuningUtils::HandlePld(NodePtr &node) {
+  GE_CHECK_NOTNULL(node);
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  NodePtr data_node = nullptr;
+
+  // 1. create data node
+  if (CreateDataNode(node, data_node) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str());
+    return FAILED;
+  }
+  // 2. add necessary info to data_node for recovering the whole graph
+  if (AddAttrToDataNodeForMergeGraph(node, data_node) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str());
+    return FAILED;
+  }
+  // 3. replace the pld node by the data node created before
+  if (ChangePld2Data(node, data_node) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str());
+    return FAILED;
+  }
+  GELOGD("TUU:pld[%s] handle success", node->GetName().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::CreateNetOutput(NodePtr &node, NodePtr &out_node) {
+  GE_CHECK_NOTNULL(node);
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  auto search = create_output_.find(graph);
+  if (search == create_output_.end()) {
+    GELOGE(FAILED, "TUU:node %s's owner sub graph %s not exist in create_output map", node->GetName().c_str(),
+           graph->GetName().c_str());
+    return FAILED;
+  }
+  if (search->second != nullptr) {
+    out_node = search->second;
+    GELOGD("TUU:sub graph %s has created output node, just return", graph->GetName().c_str());
+    return SUCCESS;
+  }
+  auto out_op_desc = ComGraphMakeShared<OpDesc>(node->GetName(), NETOUTPUT);
+  GE_CHECK_NOTNULL(out_op_desc);
+  out_node = graph->AddNode(out_op_desc);
+  GE_CHECK_NOTNULL(out_node);
+  if (out_node->SetOwnerComputeGraph(graph) != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:SetOwnerComputeGraph failed");
+    return FAILED;
+  }
+  create_output_[graph] = out_node;
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::AddAttrToNetOutputForMergeGraph(const NodePtr &end, NodePtr &out_node) {
+  GE_CHECK_NOTNULL(end);
+  GE_CHECK_NOTNULL(out_node);
+  auto op_desc = out_node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+  std::vector<std::string> alias_names = {};
+  (void)AttrUtils::GetListStr(op_desc, alias_name_attr, alias_names);
+  alias_names.push_back(end->GetName());
+  (void)AttrUtils::SetListStr(op_desc, alias_name_attr, alias_names);
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::LinkEnd2NetOutput(NodePtr &end_node, NodePtr &out_node) {
+  GE_CHECK_NOTNULL(end_node);
+  GE_CHECK_NOTNULL(out_node);
+  // check whether the end node's in node is a control node or a normal data node
+  AnchorPtr end_in_anchor = (end_node->GetInDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
+                              ? Anchor::DynamicAnchorCast<Anchor>(end_node->GetInControlAnchor())
+                              : Anchor::DynamicAnchorCast<Anchor>(end_node->GetInDataAnchor(0));
+  auto src_anchor = end_in_anchor->GetFirstPeerAnchor();  // src_anchor should be only 1
+  if (GraphUtils::RemoveEdge(src_anchor, end_in_anchor) != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:remove end input edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s",
+           GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(),
+           GetNodeNameByAnchor(end_in_anchor.get()).c_str(), end_in_anchor->GetIdx(), end_node->GetName().c_str(),
+           end_node->GetOwnerComputeGraph()->GetName().c_str());
+    return FAILED;
+  }
+  // add edge between `end in node` and `out_node`
+  if (src_anchor->IsTypeOf<OutDataAnchor>()) {
+    std::shared_ptr<InDataAnchor> anchor =
+      ComGraphMakeShared<InDataAnchor>(out_node, out_node->GetAllInDataAnchors().size());
+    GE_CHECK_NOTNULL(anchor);
+    out_node->in_data_anchors_.push_back(anchor);
+    if (GraphUtils::AddEdge(src_anchor, anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s",
+             GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(),
+             GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(),
+             end_node->GetOwnerComputeGraph()->GetName().c_str());
+      return FAILED;
+    }
+    auto end_op_desc = end_node->GetOpDesc();
+    GE_CHECK_NOTNULL(end_op_desc);
+    auto out_node_op_desc = out_node->GetOpDesc();
+    GE_CHECK_NOTNULL(out_node_op_desc);
+    // end node always has one input
+    if (out_node_op_desc->AddInputDesc(end_op_desc->GetInputDesc(0)) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:node %s add input desc failed.", out_node_op_desc->GetName().c_str());
+      return FAILED;
+    }
+  } else if (src_anchor->IsTypeOf<OutControlAnchor>()) {
+    auto anchor = out_node->GetInControlAnchor();
+    if (GraphUtils::AddEdge(src_anchor, anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s",
+             GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(),
+             GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(),
+             end_node->GetOwnerComputeGraph()->GetName().c_str());
+      return FAILED;
+    }
+  } else {
+    GELOGE(FAILED, "TUU: node_name:%s, graph_name:%s handled failed", end_node->GetName().c_str(),
+           end_node->GetOwnerComputeGraph()->GetName().c_str());
+    return FAILED;
+  }
+
+  return SUCCESS;
+}
node_name:%s, graph_name:%s", + GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(), + GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(), + end_node->GetOwnerComputeGraph()->GetName().c_str()); + return FAILED; + } + auto end_op_desc = end_node->GetOpDesc(); + GE_CHECK_NOTNULL(end_op_desc); + auto out_node_op_desc = out_node->GetOpDesc(); + GE_CHECK_NOTNULL(out_node_op_desc); + // end node always has one input + if (out_node_op_desc->AddInputDesc(end_op_desc->GetInputDesc(0)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "TUU:node %s add input desc failed.", out_node_op_desc->GetName().c_str()); + return FAILED; + } + } else if (src_anchor->IsTypeOf()) { + auto anchor = out_node->GetInControlAnchor(); + if (GraphUtils::AddEdge(src_anchor, anchor) != GRAPH_SUCCESS) { + GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s", + GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(), + GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(), + end_node->GetOwnerComputeGraph()->GetName().c_str()); + return FAILED; + } + } else { + GELOGE(FAILED, "TUU: node_name:%s, graph_name:%s handled failed", end_node->GetName().c_str(), + end_node->GetOwnerComputeGraph()->GetName().c_str()); + return FAILED; + } + + return SUCCESS; +} + +graphStatus TuningUtils::ChangeEnd2NetOutput(NodePtr &end_node, NodePtr &out_node) { + GE_CHECK_NOTNULL(end_node); + GE_CHECK_NOTNULL(out_node); + auto type_end = end_node->GetType(); + auto type_out = out_node->GetType(); + if (type_end != END || type_out != NETOUTPUT) { + GELOGE(FAILED, "TUU:Failed to change end_node %s from type %s to type %s", end_node->GetName().c_str(), + type_end.c_str(), type_out.c_str()); + return FAILED; + } + // link all `end nodes's in node` to this out_node + if (LinkEnd2NetOutput(end_node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:end_node [%s] LinkEnd2NetOutput failed.", end_node->GetName().c_str()); + return FAILED; + } + // remove `end node` + NodeUtils::UnlinkAll(*end_node); + auto graph = end_node->GetOwnerComputeGraph(); + GE_CHECK_NOTNULL(graph); + if (GraphUtils::RemoveNodeWithoutRelink(graph, end_node) != SUCCESS) { + GELOGE(FAILED, "TUU:end node [%s] RemoveNodeWithoutRelink failed.", end_node->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + +graphStatus TuningUtils::HandleEnd(NodePtr &node) { + GE_CHECK_NOTNULL(node); + auto graph = node->GetOwnerComputeGraph(); + GE_CHECK_NOTNULL(graph); + NodePtr out_node = nullptr; + + // 1. create net_output node , add only one NetOutput node to one subgraph + if (CreateNetOutput(node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str()); + return FAILED; + } + // 2. add necessary info to out_node for recovery whole graph + if (AddAttrToNetOutputForMergeGraph(node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str()); + return FAILED; + } + // 3. replace all end nodes by one output node created before + if (ChangeEnd2NetOutput(node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str()); + return FAILED; + } + GELOGD("TUU:end[%s] handle success", node->GetName().c_str()); + return SUCCESS; +} + +// part 2 +graphStatus TuningUtils::ConvertFileToGraph(const map &options, ge::Graph &graph) { + // 1. 
+
+// part 2
+graphStatus TuningUtils::ConvertFileToGraph(const map<int64_t, string> &options, ge::Graph &graph) {
+  // 1. get all subgraph objects
+  std::vector<ComputeGraphPtr> graphs;
+  // options format like {index:"subgraph_path"}
+  for (const auto &pair : options) {
+    ComputeGraphPtr compute_graph = ComGraphMakeShared<ComputeGraph>(std::to_string(pair.first));
+    if (!ge::GraphUtils::LoadGEGraph(pair.second.c_str(), *compute_graph)) {
+      GELOGE(FAILED, "TUU:load graph from file failed");
+      return FAILED;
+    }
+    graphs.push_back(compute_graph);
+  }
+  // 2. merge graph
+  ComputeGraphPtr merged_graph = ComGraphMakeShared<ComputeGraph>("whole_graph_after_tune");
+  GE_CHECK_NOTNULL(merged_graph);
+  if (MergeAllSubGraph(graphs, merged_graph) != SUCCESS) {
+    GELOGE(FAILED, "TUU:MergeGraph failed");
+    return FAILED;
+  }
+  // 3. set parent graph
+  for (const auto &node : merged_graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(node);
+    if (node->SetOwnerComputeGraph(merged_graph) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:node %s set owner graph failed", node->GetName().c_str());
+      return FAILED;
+    }
+  }
+  graph = GraphUtils::CreateGraphFromComputeGraph(merged_graph);
+  return SUCCESS;
+}
+
+// +----------------------------------+
+// | const        const               |
+// |     \        /                   |
+// |  netoutput(end,end)              |
+// +----------------------------------+
+//                 +
+// +----------------------------------+
+// | data(pld)      data(pld)         |
+// |     \           /                |
+// |    relu       relu               |
+// |       \       /                  |
+// |        \     /                   |
+// |          add                     |
+// |           |                      |
+// |     netoutput(end)               |
+// +----------------------------------+
+//                 +
+// +----------------------------------+
+// | data(pld)                        |
+// |     /                            |
+// | netoutput                        |
+// +----------------------------------+
+//                 |
+//                 |
+//                 V
+// +----------------------------------+
+// | const        const               |
+// |     \        /                   |
+// |    relu    relu                  |
+// |       \    /                     |
+// |        \  /                      |
+// |        add                       |
+// |         |                        |
+// |     netoutput                    |
+// +----------------------------------+
+graphStatus TuningUtils::MergeAllSubGraph(std::vector<ComputeGraphPtr> &subgraphs,
+                                          ComputeGraphPtr &output_merged_compute_graph) {
+  GE_CHECK_NOTNULL(output_merged_compute_graph);
+  // 1. handle all subgraphs
+  for (auto &subgraph : subgraphs) {
+    Status ret_status = MergeSubGraph(subgraph);
+    if (ret_status != SUCCESS) {
+      GELOGE(ret_status, "TUU:subgraph %s merge failed", subgraph->GetName().c_str());
+      return ret_status;
+    }
+  }
+
+  for (const auto &node : merged_graph_nodes_) {
+    (void)output_merged_compute_graph->AddNode(node);
+    GELOGD("TUU:graph %s add node %s success", output_merged_compute_graph->GetName().c_str(), node->GetName().c_str());
+  }
+
+  // 2. remove the data and netoutput nodes added by us
+  if (RemoveDataNetoutputEdge(output_merged_compute_graph) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to merge graph %s", output_merged_compute_graph->GetName().c_str());
+    return FAILED;
+  }
+  graphStatus ret = output_merged_compute_graph->TopologicalSorting();
+  if (ret != SUCCESS) {
+    GELOGE(ret, "Graph[%s] topological sort failed, ret:%d.", output_merged_compute_graph->GetName().c_str(), ret);
+    return ret;
+  }
+  GELOGD("TUU:Print-%s", PrintCheckLog().c_str());
+  GELOGI("TUU:output_merged_compute_graph %s success", output_merged_compute_graph->GetName().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::MergeSubGraph(ComputeGraphPtr &subgraph) {
+  for (auto &node : subgraph->GetDirectNode()) {
+    if (kPartitionOpTypes.count(node->GetType()) > 0) {
+      GELOGE(FAILED, "TUU:subgraph passed in should not contain nodes of end or pld type");
+      return FAILED;
+    }
+    // handle data converted from pld node
+    if (node->GetType() == DATA) {
+      auto op_desc = node->GetOpDesc();
+      GE_CHECK_NOTNULL(op_desc);
+      std::string peer_out_name;
+      bool has_valid_str =
+        (AttrUtils::GetStr(op_desc, peer_node_name_attr, peer_out_name)) && (!peer_out_name.empty());
+      if (has_valid_str) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        data_2_netoutput_.emplace(op_desc->GetName(), peer_out_name);
+        data_node_2_netoutput_.emplace(node, peer_out_name);
+        continue;
+      }
+    }
+    // handle netoutput converted from end node
+    if (node->GetType() == NETOUTPUT) {
+      auto op_desc = node->GetOpDesc();
+      GE_CHECK_NOTNULL(op_desc);
+      std::vector<std::string> out_alias_name;
+      bool has_valid_str =
+        (AttrUtils::GetListStr(op_desc, alias_name_attr, out_alias_name)) && (!out_alias_name.empty());
+      if (has_valid_str) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        netoutput_nodes_.insert(node);
+      }
+    }
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      merged_graph_nodes_.emplace(node);
+    }
+    GELOGD("TUU:subgraph %s add node %s success", subgraph->GetName().c_str(), node->GetName().c_str());
+  }
+  GELOGI("TUU:merge subgraph %s success", subgraph->GetName().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::RemoveDataNetoutputEdge(ComputeGraphPtr &graph) {
+  GE_CHECK_NOTNULL(graph);
+  // 1. traverse
+  for (auto &pair : data_node_2_netoutput_) {
+    auto data_node = pair.first;
+    GE_CHECK_NOTNULL(data_node);
+    auto netoutput_name = pair.second;
+    auto netoutput_node = graph->FindNode(netoutput_name);
+    GE_CHECK_NOTNULL(netoutput_node);
+    data_node_2_netoutput_node_.emplace(data_node, netoutput_node);
+    // 2. get `data out anchor` and `net output in anchor` and `net output in node's out anchor`
+    AnchorPtr data_out_anchor = (data_node->GetOutDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
+                                  ? Anchor::DynamicAnchorCast<Anchor>(data_node->GetOutControlAnchor())
+                                  : Anchor::DynamicAnchorCast<Anchor>(data_node->GetOutDataAnchor(0));
+    AnchorPtr net_output_in_anchor = nullptr;
+    AnchorPtr src_out_anchor = nullptr;
+    if (GetInAndOutAnchorPair(data_node, netoutput_node, net_output_in_anchor, src_out_anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:get out node:%s 's in anchor related with data node:%s failed",
+             netoutput_node->GetName().c_str(), data_node->GetName().c_str());
+      return FAILED;
+    }
+    // 3. relink
+    if (GraphUtils::RemoveEdge(src_out_anchor, net_output_in_anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:remove edge from %s(%d) to %s(%d) failed. node_name:(data:%s;netoutput:%s), graph_name:%s",
+             GetNodeNameByAnchor(src_out_anchor.get()).c_str(), src_out_anchor->GetIdx(),
+             GetNodeNameByAnchor(net_output_in_anchor.get()).c_str(), net_output_in_anchor->GetIdx(),
+             data_node->GetName().c_str(), netoutput_node->GetName().c_str(), graph->GetName().c_str());
+      return FAILED;
+    }
+    GE_CHECK_NOTNULL(data_out_anchor);
+    for (const auto &peer_in_anchor : data_out_anchor->GetPeerAnchors()) {
+      if (GraphUtils::RemoveEdge(data_out_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
+        GELOGE(FAILED, "TUU:remove edge from %s(%d) to %s(%d) failed. node_name:(data:%s;netoutput:%s), graph_name:%s",
+               GetNodeNameByAnchor(data_out_anchor.get()).c_str(), data_out_anchor->GetIdx(),
+               GetNodeNameByAnchor(peer_in_anchor.get()).c_str(), peer_in_anchor->GetIdx(),
+               data_node->GetName().c_str(), netoutput_node->GetName().c_str(), graph->GetName().c_str());
+        return FAILED;
+      }
+      if (GraphUtils::AddEdge(src_out_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
+        GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:(data:%s;netoutput:%s), graph_name:%s",
+               GetNodeNameByAnchor(src_out_anchor.get()).c_str(), src_out_anchor->GetIdx(),
+               GetNodeNameByAnchor(peer_in_anchor.get()).c_str(), peer_in_anchor->GetIdx(),
+               data_node->GetName().c_str(), netoutput_node->GetName().c_str(), graph->GetName().c_str());
+        return FAILED;
+      }
+    }
+  }
+  // 4. remove the out nodes added by us
+  for (auto &node : netoutput_nodes_) {
+    NodeUtils::UnlinkAll(*node);
+    if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:Failed to remove node %s from graph", node->GetName().c_str());
+      return FAILED;
+    }
+    GELOGD("TUU:Remove node %s by the RemoveDataNetoutputEdge process success", node->GetName().c_str());
+  }
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::GetInAndOutAnchorPair(NodePtr &data_node, NodePtr &out_node, AnchorPtr &dest_in_anchor,
+                                               AnchorPtr &src_out_anchor) {
+  // 1. get `data parent node name`, i.e. `netoutput input node name`
+  std::string netoutput_input_name;
+  auto op_desc = data_node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+  if (!AttrUtils::GetStr(op_desc, parent_node_name_attr, netoutput_input_name)) {
+    GELOGE(FAILED, "TUU:Failed to get parent node attr from node %s", op_desc->GetName().c_str());
+    return FAILED;
+  }
+  // 2. find index
+  int parent_node_anchor_index;
+  if (!AttrUtils::GetInt(op_desc, parent_node_anchor_index_attr, parent_node_anchor_index)) {
+    GELOGE(FAILED, "TUU:Failed to get parent node anchor index attr from node %s", op_desc->GetName().c_str());
+    return FAILED;
+  }
+  // 3. find the in data or ctrl anchor by steps 1 & 2
+  for (auto &in_anchor : out_node->GetAllInAnchors()) {
+    GE_CHECK_NOTNULL(in_anchor);
+    for (auto &src_anchor : in_anchor->GetPeerAnchors()) {  // get all peer anchors for ctrl
+      GE_CHECK_NOTNULL(src_anchor);
+      auto src_node = src_anchor->GetOwnerNode();
+      GE_CHECK_NOTNULL(src_node);
+      if (src_node->GetName() == netoutput_input_name && src_anchor->GetIdx() == parent_node_anchor_index) {
+        dest_in_anchor = in_anchor;
+        src_out_anchor = src_anchor;
+        GELOGD("TUU:get out node:%s 's in anchor(%d) src_node:%s 's out anchor(%d) related with data node:%s",
+               out_node->GetName().c_str(), dest_in_anchor->GetIdx(), netoutput_input_name.c_str(),
+               parent_node_anchor_index, data_node->GetName().c_str());
+        break;
+      }
+    }
+  }
+  GE_CHECK_NOTNULL(dest_in_anchor);
+  GE_CHECK_NOTNULL(src_out_anchor);
+  return SUCCESS;
+}
+
+}  // namespace ge
\ No newline at end of file
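Taken together, part 1 and part 2 form a dump/merge round trip: ConvertGraphToFile rewrites each partitioned subgraph's PLACEHOLDER/END boundary nodes into executable DATA/NETOUTPUT nodes and dumps one file per subgraph, and ConvertFileToGraph later loads those files and stitches them back into a single graph. A minimal driver sketch, assuming the subgraph vectors come from the graph partitioner; this is hypothetical illustration code, not part of the patch, and the dump paths are invented:

    // Hypothetical driver for the TuningUtils round trip (illustration only).
    void TuningRoundTripSketch(std::vector<ge::ComputeGraphPtr> &tuning_subgraphs,
                               std::vector<ge::ComputeGraphPtr> &non_tuning_subgraphs) {
      // Part 1: rewrite pld/end into data/netoutput and dump one file per subgraph.
      (void)ge::TuningUtils::ConvertGraphToFile(tuning_subgraphs, non_tuning_subgraphs,
                                                true /*exe_flag*/, "./tune_dump", "");
      // Part 2: after tuning, load the dumped subgraphs and merge them back together.
      std::map<int64_t, std::string> options{{0, "./tune_dump/aicore_subgraph_0.txt"}};
      ge::Graph merged("merged_after_tune");
      (void)ge::TuningUtils::ConvertFileToGraph(options, merged);
    }
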
diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt
index 922502e6..18c433cb 100755
--- a/src/ge/CMakeLists.txt
+++ b/src/ge/CMakeLists.txt
@@ -31,6 +31,7 @@ file(GLOB PROTO_HEADER_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../proto/ge_ir.proto"
     "../proto/fwk_adapter.proto"
     "../proto/op_mapping_info.proto"
+    "../proto/dump_task.proto"
 )
 ge_protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST})
 ge_protobuf_generate(ge PROTO_CLIENT_SRCS PROTO_CLIENT_HDRS ${PROTO_CLIENT_LIST})
@@ -39,6 +40,7 @@ ge_protobuf_generate(ge PROTO_HEADER_SRCS PROTO_HEADER_HDRS ${PROTO_HEADER_LIST}
 include_directories(${CMAKE_CURRENT_LIST_DIR})
 include_directories(${GE_SOURCE_DIR})
 include_directories(${GE_SOURCE_DIR}/src)
+include_directories(${GE_SOURCE_DIR}/src/ge/analyzer)
 include_directories(${GE_SOURCE_DIR}/inc)
 include_directories(${GE_SOURCE_DIR}/inc/common/util)
 include_directories(${GE_SOURCE_DIR}/inc/external)
@@ -55,6 +57,7 @@ include_directories(${CMAKE_BINARY_DIR}/proto/ge)
 ######### libge_runner.so #############
 # need to remove dependencies on pb files later
 file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
+    "analyzer/analyzer.cc"
     "client/ge_api.cc"
     "common/dump/dump_manager.cc"
     "common/dump/dump_properties.cc"
@@ -105,12 +108,12 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "graph/manager/graph_caching_allocator.cc"
     "graph/manager/graph_var_manager.cc"
     "graph/manager/model_manager/event_manager.cc"
+    "graph/manager/rdma_pool_allocator.cc"
     "graph/manager/trans_var_data_utils.cc"
     "graph/manager/util/debug.cc"
     "graph/manager/util/hcom_util.cc"
     "graph/manager/util/rt_context_util.cc"
     "graph/manager/util/variable_accelerate_ctrl.cc"
-    "graph/manager/model_manager/event_manager.cc"
     "graph/manager/util/debug.cc"
     "graph/manager/util/hcom_util.cc"
     "graph/manager/util/rt_context_util.cc"
@@ -228,6 +231,7 @@ target_link_libraries(ge_runner
 ######### libge_compiler.so #############
 # need to remove dependencies on pb files later
 file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
+    "analyzer/analyzer.cc"
     "common/dump/dump_properties.cc"
     "common/dump/dump_manager.cc"
     "common/dump/dump_op.cc"
@@ -276,6 +280,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "graph/manager/trans_var_data_utils.cc"
     "graph/manager/graph_var_manager.cc"
     "graph/manager/model_manager/event_manager.cc"
+    "graph/manager/rdma_pool_allocator.cc"
     "graph/manager/util/debug.cc"
     "graph/manager/util/rt_context_util.cc"
     "graph/manager/util/variable_accelerate_ctrl.cc"
"graph/manager/rdma_pool_allocator.cc" "graph/manager/util/debug.cc" "graph/manager/util/rt_context_util.cc" "graph/manager/util/variable_accelerate_ctrl.cc" diff --git a/src/ge/analyzer/analyzer.cc b/src/ge/analyzer/analyzer.cc new file mode 100644 index 00000000..1c944971 --- /dev/null +++ b/src/ge/analyzer/analyzer.cc @@ -0,0 +1,304 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "analyzer.h" + +#include +#include +#include + +#include "framework/common/debug/ge_log.h" +#include "framework/common/util.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/type_utils.h" + +namespace ge { +using json = nlohmann::json; +using Status = ge::Status; +using ComputeGraph = ge::ComputeGraph; +using namespace analyzer; + +namespace { +constexpr int kFileAuthority = 0640; +constexpr int kJsonDumpLevel = 4; + +const std::string kFilePath = "./"; +const std::string kAnalyzeFile = "ge_check_op.json"; + +const std::string kUnknownShape = "unknownshape"; +const std::string kUnsupport = "unsupport"; + +const std::string kSessionId = "session_id"; +const std::string kGraphId = "graph_id"; +const std::string kOpInfo = "op_info"; +const std::string kErrorType = "error_type"; +const std::string kOpName = "name"; +const std::string kOpType = "type"; +const std::string kReason = "reason"; +const std::string kInput = "input"; +const std::string kOutput = "output"; +const std::string kShape = "shape"; +const std::string kDataType = "data_type"; +const std::string kLayout = "layout"; +const std::string kResult = "result"; +const std::string kOp = "op"; + +std::map errors_map{{PARSER, "paser_error"}, + {INFER_SHAPE, "infer_shape_error"}, + {CHECKSUPPORT, "check_support_error"}, + {GRAPH_OPTIMIZE, "graph_optimize_error"}, + {GRAPH_PARTION, "graph_partion_error"}, + {GRAPH_BUILDER, "graph_builder_error"}}; +} // namespace + +Analyzer *Analyzer::GetInstance() { + static Analyzer instance; + return &instance; +} + +Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) { + GELOGD("Start to build map. 
+
+ge::Status Analyzer::Initialize() {
+  ClearHistoryFile();
+  return CreateAnalyzerFile();
+}
+
+void Analyzer::Finalize() {
+  GELOGD("Analyzer start to finalize!");
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  for (auto &session_resource : graph_infos_) {
+    session_resource.second.clear();
+  }
+  graph_infos_.clear();
+
+  std::lock_guard<std::mutex> lk(file_mutex_);
+  if (json_file_.is_open()) {
+    json_file_.close();
+  }
+}
+
+void Analyzer::DestroySessionJsonObject(uint64_t session_id) {
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto iter = graph_infos_.find(session_id);
+  if (iter == graph_infos_.end()) {
+    GELOGW("can not find the stored object by session_id[%lu]. Do nothing", session_id);
+  } else {
+    graph_infos_.erase(iter);
+  }
+}
+
+void Analyzer::DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id) {
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto iter = graph_infos_.find(session_id);
+  if (iter == graph_infos_.end()) {
+    GELOGW("can not find the stored object by session_id[%lu]. Do nothing", session_id);
+  } else {
+    auto iter1 = (iter->second).find(graph_id);
+    if (iter1 == (iter->second).end()) {
+      GELOGW("can not find the graph json object by session_id[%lu] and graph_id[%lu]. Do nothing", session_id,
+             graph_id);
+      return;
+    }
+    (iter->second).erase(iter1);
+  }
+}
+
+std::shared_ptr<GraphInfo> Analyzer::GetJsonObject(uint64_t session_id, uint64_t graph_id) {
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto iter = graph_infos_.find(session_id);
+  if (iter == graph_infos_.end()) {
+    GELOGE(PARAM_INVALID, "session_id:%lu does not exist!", session_id);
+    return nullptr;
+  } else {
+    auto iter1 = (iter->second).find(graph_id);
+    if (iter1 == (iter->second).end()) {
+      GELOGE(PARAM_INVALID, "graph_id:%lu does not exist!", graph_id);
+      return nullptr;
+    }
+    GELOGI("GetJsonObject Success!session_id:%lu graph_id:%lu", session_id, graph_id);
+    return iter1->second;
+  }
+}
+
+void Analyzer::ClearHistoryFile() {
+  GELOGD("Analyzer start to clear history file!");
+
+  // Remove history files
+  int res = remove(json_file_name_.c_str());
+  GELOGD("remove file %s, result:%d", json_file_name_.c_str(), res);
+}
+
+ge::Status Analyzer::CreateAnalyzerFile() {
+  GELOGD("start to create analyzer file!");
+  // Create the analyzer file (truncate it if it already exists).
+  string real_path = RealPath(kFilePath.c_str());
+  if (real_path.empty()) {
+    GELOGE(FAILED, "File path is invalid.");
+    return FAILED;
+  }
+  string file = real_path + "/" + kAnalyzeFile;
+  GELOGD("Start to create analyzer file[%s]", file.c_str());
+  int fd = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority);
+  if (fd < 0) {
+    GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", file.c_str());
+    return INTERNAL_ERROR;
+  }
+  if (close(fd) != 0) {
+    GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", file.c_str());
+    return INTERNAL_ERROR;
+  }
+  json_file_name_ = file;
+
+  GELOGD("success to create analyzer file[%s]!", json_file_name_.c_str());
+  return SUCCESS;
+}
+
+ge::Status Analyzer::SaveAnalyzerDataToFile() {
+  GELOGD("start to save analyze file!");
+  std::lock_guard<std::mutex> lg(file_mutex_);
+  json_file_.open(json_file_name_, std::ios::out);
+  if (!json_file_.is_open()) {
+    GELOGE(FAILED, "analyzer file does not exist[%s]", json_file_name_.c_str());
+    return PARAM_INVALID;
+  }
+
+  std::lock_guard<std::recursive_mutex> lk(mutex_);
+  for (auto &ele : graph_infos_) {
+    for (auto &ele2 : ele.second) {
+      json jsn;
+      GraphInfoToJson(jsn, *(ele2.second));
+      json_file_ << jsn.dump(kJsonDumpLevel) << std::endl;
+    }
+  }
+
+  json_file_.close();
+  return SUCCESS;
+}
+
+ge::Status Analyzer::DoAnalyze(DataInfo &data_info) {
+  GELOGD("start to do analyzer!");
+
+  auto pnode = data_info.node_ptr;
+  GE_CHECK_NOTNULL(pnode);
+  auto desc = pnode->GetOpDesc();
+  GE_CHECK_NOTNULL(desc);
+  // buffer the analyzed data
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto graph_info = GetJsonObject(data_info.session_id, data_info.graph_id);
+  GE_CHECK_NOTNULL(graph_info);
+  auto status = SaveOpInfo(desc, data_info, graph_info);
+  if (status != SUCCESS) {
+    GELOGE(status, "save op info failed!");
+    return FAILED;
+  }
+  // save data to file
+  return SaveAnalyzerDataToFile();
+}
+
+ge::Status Analyzer::SaveOpInfo(ge::OpDescPtr desc, DataInfo &data_info,
+                                std::shared_ptr<GraphInfo> graph_info) {
+  auto iter = errors_map.find(data_info.analyze_type);
+  if (iter == errors_map.end()) {
+    return PARAM_INVALID;
+  }
+  OpInfo op_info;
+  op_info.error_type = iter->second;
+  op_info.op_name = desc->GetName();
+  op_info.op_type = desc->GetType();
+  op_info.reason = data_info.reason;
+
+  for (const auto &ptr : desc->GetAllInputsDescPtr()) {
+    TensorInfo tensor_info;
+    tensor_info.shape = ptr->GetShape().GetDims();
+    tensor_info.d_type = ge::TypeUtils::DataTypeToSerialString(ptr->GetDataType());
+    tensor_info.layout = ge::TypeUtils::FormatToSerialString(ptr->GetFormat());
+    op_info.input_info.emplace_back(tensor_info);
+  }
+  for (const auto &ptr : desc->GetAllOutputsDescPtr()) {
+    TensorInfo tensor_info;
+    tensor_info.shape = ptr->GetShape().GetDims();
+    tensor_info.d_type = ge::TypeUtils::DataTypeToSerialString(ptr->GetDataType());
+    tensor_info.layout = ge::TypeUtils::FormatToSerialString(ptr->GetFormat());
+    op_info.output_info.emplace_back(tensor_info);
+  }
+  graph_info->op_info.emplace_back(op_info);
+
+  return SUCCESS;
+}
+
+void Analyzer::TensorInfoToJson(json &j, const TensorInfo &tensor_info) {
+  j[kShape] = tensor_info.shape;
+  j[kDataType] = tensor_info.d_type;
+  j[kLayout] = tensor_info.layout;
+}
+
+void Analyzer::OpInfoToJson(json &j, const OpInfo &op_info) {
+  j[kErrorType] = op_info.error_type;
+  j[kOpName] = op_info.op_name;
+  j[kOpType] = op_info.op_type;
+  j[kReason] = op_info.reason;
+  for (size_t i = 0; i < op_info.input_info.size(); i++) {
+    json json_tensor_info;
+    TensorInfoToJson(json_tensor_info, op_info.input_info.at(i));
+    j[kInput + std::to_string(i)] = json_tensor_info;
+  }
+  for (size_t i = 0; i < op_info.output_info.size(); i++) {
+    json json_tensor_info;
+    TensorInfoToJson(json_tensor_info, op_info.output_info.at(i));
+    j[kOutput + std::to_string(i)] = json_tensor_info;
+  }
+}
+
+void Analyzer::GraphInfoToJson(json &j, const GraphInfo &graph_info) {
+  GELOGD("start to buffer graph info!");
+  j[kSessionId] = graph_info.session_id;
+  j[kGraphId] = graph_info.graph_id;
+  std::vector<json> json_op_infos;
+  for (size_t i = 0; i < graph_info.op_info.size(); i++) {
+    json json_op_info;
+    OpInfoToJson(json_op_info, graph_info.op_info.at(i));
+    json_op_infos.emplace_back(json_op_info);
+  }
+  j[kOp] = json_op_infos;
+}
+}  // namespace ge
diff --git a/src/ge/analyzer/analyzer.h b/src/ge/analyzer/analyzer.h
new file mode 100644
index 00000000..4ac8b391
--- /dev/null
+++ b/src/ge/analyzer/analyzer.h
@@ -0,0 +1,188 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DOMI_ANALYZER_ANALYZER_H_
+#define DOMI_ANALYZER_ANALYZER_H_
+
+#include "nlohmann/json.hpp"
+
+#include <cstdlib>
+#include <fstream>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "external/ge/ge_api_types.h"
+#include "graph/compute_graph.h"
+#include "graph/node.h"
+
+namespace ge {
+namespace analyzer {
+enum AnalyzeType {
+  PARSER = 0,
+  INFER_SHAPE = 1,
+  CHECKSUPPORT = 2,
+  GRAPH_OPTIMIZE = 3,
+  GRAPH_PARTION = 4,
+  GRAPH_BUILDER = 5,
+};
+
+struct TensorInfo {
+  std::vector<int64_t> shape;
+  std::string d_type;
+  std::string layout;
+};
+
+struct OpInfo {
+  std::string error_type;
+  std::string op_name;
+  std::string op_type;
+  std::vector<TensorInfo> input_info;
+  std::vector<TensorInfo> output_info;
+  std::string reason;
+};
+
+struct GraphInfo {
+  uint64_t session_id = 0;
+  uint64_t graph_id = 0;
+  std::vector<OpInfo> op_info;
+};
+
+struct DataInfo {
+  DataInfo() = default;
+  ~DataInfo() = default;
+
+  DataInfo(uint64_t sess, uint64_t graph, AnalyzeType type, ge::NodePtr node, std::string error_info) {
+    session_id = sess;
+    graph_id = graph;
+    analyze_type = type;
+    node_ptr = node;
+    reason = error_info;
+  }
+  uint64_t session_id;
+  uint64_t graph_id;
+  AnalyzeType analyze_type;
+  ge::NodePtr node_ptr{nullptr};
+  std::string reason;
+};
+}  // namespace analyzer
+
+class Analyzer {
+ public:
+  /**
+   * @ingroup ge
+   * @brief: get analyzer instance.
+   * @param [in]: None
+   * @return: Analyzer instance ptr
+   */
+  static Analyzer *GetInstance();
+
+  /**
+   * @ingroup ge
+   * @brief: check whether the env var ENABLE_NETWORK_ANALYSIS_DEBUG is set.
+   * When the env is enabled, the adaptor keeps the sink geop graph even if it fails.
+   * @param [in]: None
+   * @return: true: env enabled  false: env disabled
+   */
+  bool IsEnableNetAnalyzeDebug() { return std::getenv("ENABLE_NETWORK_ANALYSIS_DEBUG") != nullptr; }
+
+  /**
+   * @ingroup ge
+   * @brief: build the buffered json object for the given session id and graph id.
+   * @param [in]: session id & graph id
+   * @return: 0: success  other: failed
+   */
+  ge::Status BuildJsonObject(uint64_t session_id, uint64_t graph_id);
+
+  /**
+   * @ingroup ge
+   * @brief: get the buffered json object by session id and graph id.
+   * @param [in]: session id & graph id
+   * @return: nullptr if failed
+   */
+  std::shared_ptr<analyzer::GraphInfo> GetJsonObject(uint64_t session_id, uint64_t graph_id);
+
+  /**
+   * @ingroup ge
+   * @brief: analyzer global init method.
+   * @param [in]: None
+   * @return: Status
+   */
+  ge::Status Initialize();
+
+  /**
+   * @ingroup ge
+   * @brief: destruct method. Release all resources used by the analyzer.
+   * @param [in]: None
+   * @return: None
+   */
+  void Finalize();
+
+  /**
+   * @ingroup ge
+   * @brief: destruct method. Only release resources of the given session id.
+   * @param [in]: session id
+   * @return: None
+   */
+  void DestroySessionJsonObject(uint64_t session_id);
+
+  /**
+   * @ingroup ge
+   * @brief: destruct method. Only release resources of the given session id and graph id.
+   * @param [in]: session id & graph id
+   * @return: None
+   */
+  void DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id);
+
+  /**
+   * @ingroup ge
+   * @brief: main process method. Buffer analyzed data and output it to the json file
+   * @param [in]: DataInfo Object
+   * @return: 0: SUCCESS  other: FAILED
+   */
+  ge::Status DoAnalyze(analyzer::DataInfo &data_info);
+
+  Analyzer(const Analyzer &) = delete;
+  Analyzer &operator=(const Analyzer &) = delete;
+  Analyzer(Analyzer &&) = delete;
+  Analyzer &operator=(Analyzer &&) = delete;
+
+ private:
+  void TensorInfoToJson(nlohmann::json &j, const analyzer::TensorInfo &tensor_info);
+  void OpInfoToJson(nlohmann::json &j, const analyzer::OpInfo &op_info);
+  void GraphInfoToJson(nlohmann::json &j, const analyzer::GraphInfo &graph_info);
+
+  ge::Status SaveAnalyzerDataToFile();
+  ge::Status SaveOpInfo(ge::OpDescPtr desc, analyzer::DataInfo &data_info,
+                        std::shared_ptr<analyzer::GraphInfo> graph_info);
+
+  void ClearHistoryFile();
+  ge::Status CreateAnalyzerFile();
+
+  Analyzer() = default;
+  ~Analyzer() = default;
+
+ private:
+  std::map<uint64_t, std::map<uint64_t, std::shared_ptr<analyzer::GraphInfo>>> graph_infos_;
+  std::recursive_mutex mutex_;  // protect graph_infos_
+  std::mutex file_mutex_;       // protect json_file_
+  std::ofstream json_file_;
+  std::string json_file_name_;
+};
+}  // namespace ge
+#endif  // DOMI_ANALYZER_ANALYZER_H_
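The analyzer is a process-wide singleton keyed by session and graph id: a caller builds the json object once, then reports failing ops through DoAnalyze, which rewrites ./ge_check_op.json on every call. A usage sketch with a hypothetical caller and an invented reason string (illustration only, not part of this patch; the commented json shape follows the *ToJson helpers above):

    // Hypothetical call site: record a check-support failure for `node`.
    ge::Status ReportUnsupportedOp(uint64_t session_id, uint64_t graph_id, const ge::NodePtr &node) {
      auto *analyzer = ge::Analyzer::GetInstance();
      if (analyzer->BuildJsonObject(session_id, graph_id) != ge::SUCCESS) {
        return ge::FAILED;
      }
      ge::analyzer::DataInfo info(session_id, graph_id, ge::analyzer::CHECKSUPPORT, node,
                                  "op is not supported by any engine");
      // DoAnalyze buffers the op info and rewrites ge_check_op.json, roughly:
      //   {"session_id":..., "graph_id":..., "op":[{"error_type":"check_support_error",
      //     "name":..., "type":..., "reason":..., "input0":{"shape":...,"data_type":...,"layout":...}}]}
      return analyzer->DoAnalyze(info);
    }
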
diff --git a/src/ge/client/ge_api.cc b/src/ge/client/ge_api.cc
index 9eb15ee4..0458a508 100644
--- a/src/ge/client/ge_api.cc
+++ b/src/ge/client/ge_api.cc
@@ -32,7 +32,6 @@
 #include "register/op_registry.h"
 #include "common/ge/tbe_plugin_manager.h"
 
-using domi::GetContext;
 using domi::OpRegistry;
 using std::map;
 using std::string;
diff --git a/src/ge/common/CMakeLists.txt b/src/ge/common/CMakeLists.txt
index adcdb1bc..58ba9bac 100755
--- a/src/ge/common/CMakeLists.txt
+++ b/src/ge/common/CMakeLists.txt
@@ -25,6 +25,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../model/ge_model.cc"
     "auth/file_saver.cc"
     "context/ctx.cc"
+    "cust_aicpu_kernel_store.cc"
     "debug/memory_dumper.cc"
     "fmk_error_codes.cc"
     "formats/format_transfers/datatype_transfer.cc"
@@ -52,6 +53,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "ge_format_util.cc"
     "helper/model_helper.cc"
    "helper/om_file_helper.cc"
+    "kernel_store.cc"
     "math/fp16_math.cc"
     "model_parser/base.cc"
     "model_saver.cc"
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_BASE64_H_ +#define GE_COMMON_BASE64_H_ + +#include +#include + +#include "debug/ge_log.h" +#include "ge_error_codes.h" + +namespace ge { +namespace { +const char *kBase64Chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; +const char kEqualSymbol = '='; +const size_t kBase64CharsNum = 64; +const size_t kThreeByteOneGroup = 3; +const size_t kFourByteOneGroup = 4; +} // namespace + +namespace base64 { +static inline bool IsBase64Char(const char &c) { return (isalnum(c) || (c == '+') || (c == '/')); } + +static std::string EncodeToBase64(const std::string &raw_data) { + size_t encode_length = raw_data.size() / kThreeByteOneGroup * kFourByteOneGroup; + encode_length += raw_data.size() % kThreeByteOneGroup == 0 ? 0 : kFourByteOneGroup; + size_t raw_data_index = 0; + size_t encode_data_index = 0; + std::string encode_data; + encode_data.resize(encode_length); + + for (; raw_data_index + kThreeByteOneGroup <= raw_data.size(); raw_data_index += kThreeByteOneGroup) { + auto char_1 = static_cast(raw_data[raw_data_index]); + auto char_2 = static_cast(raw_data[raw_data_index + 1]); + auto char_3 = static_cast(raw_data[raw_data_index + 2]); + encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u]; + encode_data[encode_data_index++] = kBase64Chars[((char_1 << 4u) & 0x30) | (char_2 >> 4u)]; + encode_data[encode_data_index++] = kBase64Chars[((char_2 << 2u) & 0x3c) | (char_3 >> 6u)]; + encode_data[encode_data_index++] = kBase64Chars[char_3 & 0x3f]; + } + + if (raw_data_index < raw_data.size()) { + auto tail = raw_data.size() - raw_data_index; + auto char_1 = static_cast(raw_data[raw_data_index]); + if (tail == 1) { + encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u]; + encode_data[encode_data_index++] = kBase64Chars[(char_1 << 4u) & 0x30]; + encode_data[encode_data_index++] = kEqualSymbol; + encode_data[encode_data_index++] = kEqualSymbol; + } else { + auto char_2 = static_cast(raw_data[raw_data_index + 1]); + encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u]; + encode_data[encode_data_index++] = kBase64Chars[((char_1 << 4u) & 0x30) | (char_2 >> 4u)]; + encode_data[encode_data_index++] = kBase64Chars[(char_2 << 2u) & 0x3c]; + encode_data[encode_data_index++] = kEqualSymbol; + } + } + return encode_data; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" +static Status DecodeFromBase64(const std::string &base64_data, std::string &decode_data) { + if (base64_data.size() % kFourByteOneGroup != 0) { + GELOGE(PARAM_INVALID, "base64 data size must can be divided by 4, but given data size is %zu", base64_data.size()); + return PARAM_INVALID; + } + decode_data.clear(); + size_t base64_data_len = base64_data.size(); + uint8_t byte_4[kFourByteOneGroup]; + auto FindCharInBase64Chars = [&](const char &raw_char) -> uint8_t { + auto char_pos = std::find(kBase64Chars, kBase64Chars + kBase64CharsNum, raw_char); + return static_cast(std::distance(kBase64Chars, char_pos)) & 0xff; + }; + + for 
diff --git a/src/ge/common/cust_aicpu_kernel_store.cc b/src/ge/common/cust_aicpu_kernel_store.cc
new file mode 100644
index 00000000..46eb484b
--- /dev/null
+++ b/src/ge/common/cust_aicpu_kernel_store.cc
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common/cust_aicpu_kernel_store.h"
+
+namespace ge {
+
+CustAICPUKernelStore::CustAICPUKernelStore() {}
+
+void CustAICPUKernelStore::AddCustAICPUKernel(const CustAICPUKernelPtr &kernel) { AddKernel(kernel); }
+
+void CustAICPUKernelStore::LoadCustAICPUKernelBinToOpDesc(const std::shared_ptr<OpDesc> &op_desc) const {
+  GELOGI("LoadCustAICPUKernelBinToOpDesc in");
+  if (op_desc != nullptr) {
+    auto kernel_bin = FindKernel(op_desc->GetName());
+    if (kernel_bin != nullptr) {
+      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(ge::OP_EXTATTR_CUSTAICPU_KERNEL, kernel_bin),
+                      GELOGW("LoadCustAICPUKernelBinToOpDesc: SetExtAttr for kernel_bin failed");)
+      GELOGI("Load cust aicpu kernel:%s, %zu", kernel_bin->GetName().c_str(), kernel_bin->GetBinDataSize());
+    }
+  }
+  GELOGI("LoadCustAICPUKernelBinToOpDesc success");
+}
+}  // namespace ge
diff --git a/src/ge/common/cust_aicpu_kernel_store.h b/src/ge/common/cust_aicpu_kernel_store.h
new file mode 100644
index 00000000..6dff0435
--- /dev/null
+++ b/src/ge/common/cust_aicpu_kernel_store.h
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_COMMON_CUST_AICPU_KERNEL_STORE_H_
+#define GE_COMMON_CUST_AICPU_KERNEL_STORE_H_
+
+#include "common/kernel_store.h"
+
+namespace ge {
+
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY CustAICPUKernelStore : public KernelStore {
+ public:
+  CustAICPUKernelStore();
+  ~CustAICPUKernelStore() {}
+
+  void AddCustAICPUKernel(const CustAICPUKernelPtr &kernel);
+
+  void LoadCustAICPUKernelBinToOpDesc(const std::shared_ptr<OpDesc> &op_desc) const;
+};
+}  // namespace ge
+
+#endif  // GE_COMMON_CUST_AICPU_KERNEL_STORE_H_
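The store mirrors the existing TBE kernel store: kernels registered at build time are serialized into an om partition and re-attached to the op at load time. A save/restore sketch, assuming Build() and Load() are public members of the KernelStore base class added later in this patch (hypothetical flow; names are invented for illustration):

    // Hypothetical save/restore flow for CustAICPUKernelStore.
    void CustKernelStoreSketch(const ge::CustAICPUKernelPtr &kernel, const ge::OpDescPtr &op_desc) {
      ge::CustAICPUKernelStore save_store;
      save_store.AddCustAICPUKernel(kernel);  // registered during op compilation
      (void)save_store.Build();               // pack kernels into the flat buffer
      // save_store.Data()/DataSize() travel as the CUST_AICPU_KERNELS om partition.
      ge::CustAICPUKernelStore load_store;
      (void)load_store.Load(save_store.Data(), save_store.DataSize());
      load_store.LoadCustAICPUKernelBinToOpDesc(op_desc);  // sets OP_EXTATTR_CUSTAICPU_KERNEL
    }
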
diff --git a/src/ge/common/debug/memory_dumper.cc b/src/ge/common/debug/memory_dumper.cc
index 56724be8..1a7d9db8 100644
--- a/src/ge/common/debug/memory_dumper.cc
+++ b/src/ge/common/debug/memory_dumper.cc
@@ -157,7 +157,7 @@ int MemoryDumper::OpenFile(const char *filename) {
 
   // Using the O_EXCL, if the file already exists, return failed to avoid privilege escalation vulnerability.
   mode_t mode = S_IRUSR | S_IWUSR;
-  int32_t fd = mmOpen2(real_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode);
+  int32_t fd = mmOpen2(real_path.c_str(), O_RDWR | O_CREAT | O_APPEND, mode);
   if (fd == EN_ERROR || fd == EN_INVALID_PARAM) {
     GELOGE(kInvalidFd, "open file failed. errno = %d, %s", fd, strerror(errno));
     return kInvalidFd;
diff --git a/src/ge/common/formats/format_transfers/datatype_transfer.cc b/src/ge/common/formats/format_transfers/datatype_transfer.cc
index 08c6889f..a603b2f4 100644
--- a/src/ge/common/formats/format_transfers/datatype_transfer.cc
+++ b/src/ge/common/formats/format_transfers/datatype_transfer.cc
@@ -44,6 +44,9 @@ enum DataTypeTransMode {
   kTransferWithDatatypeInt8ToFloat,
   kTransferWithDatatypeInt8ToInt32,
   kTransferWithDatatypeInt64ToInt32,
+  kTransferWithDatatypeInt32ToInt64,
+  kTransferWithDatatypeInt32ToDouble,
+  kTransferWithDatatypeDoubleToInt32,
 };
 
 std::map<std::pair<DataType, DataType>, DataTypeTransMode> trans_mode_map{
@@ -59,7 +62,11 @@ std::map<std::pair<DataType, DataType>, DataTypeTransMode> trans_mode_map{
   {std::pair<DataType, DataType>(DT_UINT8, DT_INT32), kTransferWithDatatypeUint8ToInt32},
   {std::pair<DataType, DataType>(DT_INT8, DT_FLOAT), kTransferWithDatatypeInt8ToFloat},
   {std::pair<DataType, DataType>(DT_INT8, DT_INT32), kTransferWithDatatypeInt8ToInt32},
-  {std::pair<DataType, DataType>(DT_INT64, DT_INT32), kTransferWithDatatypeInt64ToInt32}};
+  {std::pair<DataType, DataType>(DT_INT64, DT_INT32), kTransferWithDatatypeInt64ToInt32},
+  {std::pair<DataType, DataType>(DT_INT32, DT_INT64), kTransferWithDatatypeInt32ToInt64},
+  {std::pair<DataType, DataType>(DT_INT32, DT_DOUBLE), kTransferWithDatatypeInt32ToDouble},
+  {std::pair<DataType, DataType>(DT_DOUBLE, DT_INT32), kTransferWithDatatypeDoubleToInt32},
+};
 
 template <typename SrcT, typename DstT>
 Status TransDataSrc2Dst(const CastArgs &args, uint8_t *dst, const size_t data_size) {
@@ -82,38 +89,30 @@ Status TransDataSrc2Fp16(const CastArgs &args, uint8_t *dst, const size_t data_size) {
 }
 
 Status CastKernel(const CastArgs &args, uint8_t *dst, const size_t data_size, const DataTypeTransMode trans_mode) {
-  switch (trans_mode) {
-    case kTransferWithDatatypeFloatToFloat16:
-      return TransDataSrc2Fp16<float>(args, dst, data_size);
-    case kTransferWithDatatypeFloatToInt32:
-      return TransDataSrc2Dst<float, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeFloat16ToFloat:
-      return TransDataSrc2Dst<fp16_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeFloat16ToInt32:
-      return TransDataSrc2Dst<fp16_t, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToFloat:
-      return TransDataSrc2Dst<int32_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToFloat16:
-      return TransDataSrc2Fp16<int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToUint8:
-      return TransDataSrc2Dst<int32_t, uint8_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToInt8:
-      return TransDataSrc2Dst<int32_t, int8_t>(args, dst, data_size);
-    case kTransferWithDatatypeUint8ToFloat:
-      return TransDataSrc2Dst<uint8_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeUint8ToInt32:
-      return TransDataSrc2Dst<uint8_t, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt8ToFloat:
-      return TransDataSrc2Dst<int8_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeInt8ToInt32:
-      return TransDataSrc2Dst<int8_t, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt64ToInt32:
-      return TransDataSrc2Dst<int64_t, int32_t>(args, dst, data_size);
-    default:
-      GELOGE(PARAM_INVALID, "Trans data type from %s to %s is not supported.",
-             TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
-             TypeUtils::DataTypeToSerialString(args.dst_data_type).c_str());
-      return UNSUPPORTED;
+  static std::map<DataTypeTransMode, std::function<Status(const CastArgs &, uint8_t *, const size_t)>>
+    transfer_handle = {
+      {kTransferWithDatatypeFloatToFloat16, TransDataSrc2Fp16<float>},
+      {kTransferWithDatatypeFloatToInt32, TransDataSrc2Dst<float, int32_t>},
+      {kTransferWithDatatypeFloat16ToFloat, TransDataSrc2Dst<fp16_t, float>},
+      {kTransferWithDatatypeFloat16ToInt32, TransDataSrc2Dst<fp16_t, int32_t>},
+      {kTransferWithDatatypeInt32ToFloat, TransDataSrc2Dst<int32_t, float>},
+      {kTransferWithDatatypeInt32ToFloat16, TransDataSrc2Fp16<int32_t>},
+      {kTransferWithDatatypeInt32ToUint8, TransDataSrc2Dst<int32_t, uint8_t>},
+      {kTransferWithDatatypeInt32ToInt8, TransDataSrc2Dst<int32_t, int8_t>},
+      {kTransferWithDatatypeUint8ToFloat, TransDataSrc2Dst<uint8_t, float>},
+      {kTransferWithDatatypeUint8ToInt32, TransDataSrc2Dst<uint8_t, int32_t>},
+      {kTransferWithDatatypeInt8ToFloat, TransDataSrc2Dst<int8_t, float>},
+      {kTransferWithDatatypeInt8ToInt32, TransDataSrc2Dst<int8_t, int32_t>},
+      {kTransferWithDatatypeInt64ToInt32, TransDataSrc2Dst<int64_t, int32_t>},
+      {kTransferWithDatatypeInt32ToInt64, TransDataSrc2Dst<int32_t, int64_t>},
+      {kTransferWithDatatypeInt32ToDouble, TransDataSrc2Dst<int32_t, double>},
+      {kTransferWithDatatypeDoubleToInt32, TransDataSrc2Dst<double, int32_t>},
+    };
+  auto it = transfer_handle.find(trans_mode);
+  if (it == transfer_handle.end()) {
+    return UNSUPPORTED;
+  } else {
+    return (it->second)(args, dst, data_size);
   }
 }
 }  // namespace
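The rewrite above replaces a long switch with a static handler table, so adding a conversion pair (such as the new int32/double modes) becomes a one-line change. The shape of that pattern, reduced to a self-contained toy example (illustration only, not GE code):

    #include <functional>
    #include <iostream>
    #include <map>

    enum class Mode { kAddOne, kDouble };

    int Handle(Mode mode, int value) {
      // One entry per mode; unknown modes fall through to an error value,
      // mirroring the UNSUPPORTED branch in CastKernel above.
      static const std::map<Mode, std::function<int(int)>> handlers = {
        {Mode::kAddOne, [](int v) { return v + 1; }},
        {Mode::kDouble, [](int v) { return v * 2; }},
      };
      auto it = handlers.find(mode);
      return it == handlers.end() ? -1 : it->second(value);
    }

    int main() { std::cout << Handle(Mode::kDouble, 21) << '\n'; }  // prints 42
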
diff --git a/src/ge/common/ge_common.mk b/src/ge/common/ge_common.mk
index e913c8f5..7632b46d 100644
--- a/src/ge/common/ge_common.mk
+++ b/src/ge/common/ge_common.mk
@@ -36,7 +36,9 @@ GE_COMMON_LOCAL_SRC_FILES := \
     properties_manager.cc \
     types.cc\
     model_parser/base.cc \
+    kernel_store.cc \
     tbe_kernel_store.cc \
+    cust_aicpu_kernel_store.cc \
     op/attr_value_util.cc \
     op/ge_op_utils.cc \
     thread_pool.cc \
diff --git a/src/ge/common/helper/model_cache_helper.cc b/src/ge/common/helper/model_cache_helper.cc
index e9b1de83..d3b4dde5 100644
--- a/src/ge/common/helper/model_cache_helper.cc
+++ b/src/ge/common/helper/model_cache_helper.cc
@@ -310,7 +310,7 @@ Status ModelCacheHelper::GetNodesNeedRecompile(ComputeGraphPtr &graph, vector
     auto kernel_lib_name = op_desc->GetOpKernelLibName();
     if (kernel_lib_name.empty()) {
       // reset op kernel lib
-      (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc);
+      (void)instance->DNNEngineManagerObj().GetDNNEngineName(node);
       kernel_lib_name = op_desc->GetOpKernelLibName();
       if (kernel_lib_name.empty()) {
         GELOGW("Get node:%s, type:%s kernel lib failed.", node->GetName().c_str(), op_desc->GetType().c_str());
diff --git a/src/ge/common/helper/model_helper.cc b/src/ge/common/helper/model_helper.cc
index 19614566..d860f7ba 100644
--- a/src/ge/common/helper/model_helper.cc
+++ b/src/ge/common/helper/model_helper.cc
@@ -41,6 +41,7 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_fil
                                        const uint8_t *data, size_t size) {
   if (size < 1 || size > UINT32_MAX) {
     GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size);
+    ErrorManager::GetInstance().ATCReportErrMessage("E19022");
     return PARAM_INVALID;
   }
   if (data == nullptr) {
@@ -101,16 +102,22 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod
   TBEKernelStore tbe_kernel_store = ge_model->GetTBEKernelStore();
   GELOGI("TBE_KERNELS size is %zu", tbe_kernel_store.DataSize());
   if (tbe_kernel_store.DataSize() > 0) {
-    if (SaveModelPartition(om_file_save_helper, ModelPartitionType::TBE_KERNELS, tbe_kernel_store.Data(),
-                           tbe_kernel_store.DataSize()) != SUCCESS) {
-      GELOGE(PARAM_INVALID, "Add tbe kernel partition failed");
-      return PARAM_INVALID;
-    }
+    GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper, ModelPartitionType::TBE_KERNELS, tbe_kernel_store.Data(),
+                                         tbe_kernel_store.DataSize()),
+                      "Add tbe kernel partition failed");
   }
   // no need to check value, DATA->NetOutput
   (void)tbe_kernel_store.Load(tbe_kernel_store.Data(), tbe_kernel_store.DataSize());
 
+  CustAICPUKernelStore cust_aicpu_kernel_store = ge_model->GetCustAICPUKernelStore();
+  GELOGI("cust aicpu kernels size is %zu", cust_aicpu_kernel_store.DataSize());
+  if (cust_aicpu_kernel_store.DataSize() > 0) {
+    GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper, ModelPartitionType::CUST_AICPU_KERNELS,
+                                         cust_aicpu_kernel_store.Data(), cust_aicpu_kernel_store.DataSize()),
+                      "Add cust aicpu kernel partition failed");
+  }
+
   std::shared_ptr<ModelTaskDef> model_task_def = ge_model->GetModelTaskDefPtr();
   if (model_task_def == nullptr) {
     GELOGE(MEMALLOC_FAILED, "Create model task def ptr failed");
@@ -308,6 +315,10 @@ Status ModelHelper::GenerateGeModel(OmFileLoadHelper &om_load_helper) {
   if (ret != SUCCESS) {
     return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED;
   }
+  ret = LoadCustAICPUKernelStore(om_load_helper);
+  if (ret != SUCCESS) {
+    return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED;
+  }
   return SUCCESS;
 }
 
@@ -384,6 +395,22 @@ Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper) {
   return SUCCESS;
 }
 
+Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper) {
+  // Load cust aicpu kernels
+  ModelPartition partition_kernel_def;
+  CustAICPUKernelStore kernel_store;
+  if (om_load_helper.GetModelPartition(ModelPartitionType::CUST_AICPU_KERNELS, partition_kernel_def) == SUCCESS) {
+    GELOGI("Kernels partition size:%u", partition_kernel_def.size);
+    if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) {
+      GELOGI("Load cust aicpu kernels success");
+    } else {
+      GELOGW("Load cust aicpu kernels failed");
+    }
+  }
+  model_->SetCustAICPUKernelStore(kernel_store);
+  return SUCCESS;
+}
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeModelPtr ModelHelper::GetGeModel() {
   if (model_ != nullptr) {
     return model_;
diff --git a/src/ge/common/helper/om_file_helper.cc b/src/ge/common/helper/om_file_helper.cc
index f25e2af3..ca506731 100644
--- a/src/ge/common/helper/om_file_helper.cc
+++ b/src/ge/common/helper/om_file_helper.cc
@@ -27,6 +27,9 @@
 
 using std::string;
 
+namespace {
+const int32_t kOptionalNum = 2;
+}
 namespace ge {
 // For Load
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::Init(const ge::ModelData &model) {
@@ -67,7 +70,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetMod
   }
 
   if (!found) {
-    if (type != ModelPartitionType::TBE_KERNELS && type != ModelPartitionType::WEIGHTS_DATA) {
+    if (type != ModelPartitionType::TBE_KERNELS && type != ModelPartitionType::WEIGHTS_DATA &&
+        type != ModelPartitionType::CUST_AICPU_KERNELS) {
       GELOGE(FAILED, "GetModelPartition:type:%d is not in partition_datas!", static_cast<int>(type));
       return FAILED;
     }
@@ -114,7 +118,7 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint
   // Davinci model partition include graph-info weight-info task-info tbe-kernel :
   // Original model partition include graph-info
   if ((partition_table->num != PARTITION_SIZE) && (partition_table->num != (PARTITION_SIZE - 1)) &&
-      (partition_table->num != 1)) {
+      (partition_table->num != (PARTITION_SIZE - kOptionalNum)) && (partition_table->num != 1)) {
     GELOGE(GE_EXEC_MODEL_PARTITION_NUM_INVALID, "Invalid partition_table->num:%u", partition_table->num);
     return GE_EXEC_MODEL_PARTITION_NUM_INVALID;
   }
partition_datas!", static_cast(type)); return FAILED; } @@ -114,7 +118,7 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint // Davinici model partition include graph-info weight-info task-info tbe-kernel : // Original model partition include graph-info if ((partition_table->num != PARTITION_SIZE) && (partition_table->num != (PARTITION_SIZE - 1)) && - (partition_table->num != 1)) { + (partition_table->num != (PARTITION_SIZE - kOptionalNum)) && (partition_table->num != 1)) { GELOGE(GE_EXEC_MODEL_PARTITION_NUM_INVALID, "Invalid partition_table->num:%u", partition_table->num); return GE_EXEC_MODEL_PARTITION_NUM_INVALID; } diff --git a/src/ge/common/kernel_store.cc b/src/ge/common/kernel_store.cc new file mode 100644 index 00000000..e465d184 --- /dev/null +++ b/src/ge/common/kernel_store.cc @@ -0,0 +1,118 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common/kernel_store.h" + +namespace ge { + +void KernelStore::AddKernel(const KernelBinPtr &kernel) { + if (kernel != nullptr) { + kernels_[kernel->GetName()] = kernel; + } +} + +bool KernelStore::Build() { + buffer_.clear(); + size_t total_len = 0; + for (const auto &item : kernels_) { + auto kernel = item.second; + total_len += sizeof(KernelStoreItemHead); + total_len += kernel->GetName().length(); + total_len += kernel->GetBinDataSize(); + } + + try { + buffer_.resize(total_len); + } catch (std::bad_alloc &e) { + GELOGE(ge::MEMALLOC_FAILED, "All build memory failed, memory size %zu", total_len); + return false; + } + + uint8_t *next_buffer = buffer_.data(); + size_t remain_len = total_len; + errno_t mem_ret; + for (const auto &item : kernels_) { + auto kernel = item.second; + KernelStoreItemHead kernel_head{}; + kernel_head.magic = kKernelItemMagic; + kernel_head.name_len = static_cast(kernel->GetName().length()); + kernel_head.bin_len = static_cast(kernel->GetBinDataSize()); + + GELOGI("get kernel bin name %s, addr %p, size %u", kernel->GetName().c_str(), kernel->GetBinData(), + kernel->GetBinDataSize()); + mem_ret = memcpy_s(next_buffer, remain_len, &kernel_head, sizeof(kernel_head)); + GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); + next_buffer += sizeof(kernel_head); + + mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head), kernel->GetName().data(), kernel_head.name_len); + GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); + next_buffer += kernel_head.name_len; + + mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head) - kernel_head.name_len, kernel->GetBinData(), + kernel_head.bin_len); + GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); + + next_buffer += kernel_head.bin_len; + remain_len = remain_len - sizeof(kernel_head) - kernel_head.name_len - kernel_head.bin_len; + } + kernels_.clear(); + return true; +} + +const uint8_t *KernelStore::Data() const { return buffer_.data(); } + +size_t KernelStore::DataSize() const { return buffer_.size(); } + +bool KernelStore::Load(const 
uint8_t *data, const size_t &len) { + if (data == nullptr || len == 0) { + return false; + } + size_t buffer_len = len; + while (buffer_len > sizeof(KernelStoreItemHead)) { + const char *next_buffer = reinterpret_cast(data) + (len - buffer_len); + + const auto *kernel_head = reinterpret_cast(next_buffer); + if (buffer_len < kernel_head->name_len + kernel_head->bin_len + sizeof(KernelStoreItemHead)) { + GELOGW("Invalid kernel block remain buffer len %zu, name len %u, bin len %u", buffer_len, kernel_head->name_len, + kernel_head->bin_len); + break; + } + + next_buffer += sizeof(KernelStoreItemHead); + std::string name(next_buffer, kernel_head->name_len); + + next_buffer += kernel_head->name_len; + GELOGI("Load kernel from om:%s,%u,%u", name.c_str(), kernel_head->name_len, kernel_head->bin_len); + std::vector kernel_bin(next_buffer, next_buffer + kernel_head->bin_len); + KernelBinPtr teb_kernel_ptr = ge::MakeShared(name, std::move(kernel_bin)); + if (teb_kernel_ptr != nullptr) { + kernels_.emplace(name, teb_kernel_ptr); + } + buffer_len -= sizeof(KernelStoreItemHead) + kernel_head->name_len + kernel_head->bin_len; + } + + return true; +} + +KernelBinPtr KernelStore::FindKernel(const std::string &name) const { + auto it = kernels_.find(name); + if (it != kernels_.end()) { + return it->second; + } + return nullptr; +} + +} // namespace ge diff --git a/src/ge/common/kernel_store.h b/src/ge/common/kernel_store.h new file mode 100644 index 00000000..d73f26c5 --- /dev/null +++ b/src/ge/common/kernel_store.h @@ -0,0 +1,70 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_COMMON_KERNEL_STORE_H_ +#define GE_COMMON_KERNEL_STORE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "common/ge/ge_util.h" +#include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" +#include "framework/common/fmk_types.h" +#include "graph/op_desc.h" +#include "graph/op_kernel_bin.h" + +namespace ge { +using KernelBin = ge::OpKernelBin; +using KernelBinPtr = std::shared_ptr; +using CustAICPUKernel = ge::OpKernelBin; +using CustAICPUKernelPtr = std::shared_ptr; +using TBEKernel = ge::OpKernelBin; +using TBEKernelPtr = std::shared_ptr; + +const uint32_t kKernelItemMagic = 0x5d776efd; + +struct KernelStoreItemHead { + uint32_t magic; + uint32_t name_len; + uint32_t bin_len; +}; + +class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY KernelStore { + public: + KernelStore() = default; + virtual ~KernelStore() = default; + virtual bool Build(); + + virtual bool Load(const uint8_t *data, const size_t &len); + + virtual const uint8_t *Data() const; + virtual size_t DataSize() const; + virtual void AddKernel(const KernelBinPtr &kernel); + virtual KernelBinPtr FindKernel(const std::string &name) const; + + private: + std::unordered_map kernels_; + std::vector buffer_; +}; +} // namespace ge + +#endif // GE_COMMON_KERNEL_STORE_H_ diff --git a/src/ge/common/math/math_util.h b/src/ge/common/math/math_util.h index 86c62209..e5a53d16 100644 --- a/src/ge/common/math/math_util.h +++ b/src/ge/common/math/math_util.h @@ -612,295 +612,268 @@ inline Status CheckInt32DivOverflow(int32_t a, int32_t b) { return SUCCESS; } -#define FMK_INT_ADDCHECK(a, b) \ - if (ge::CheckIntAddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ - } - -#define FMK_INT8_ADDCHECK(a, b) \ - if (ge::CheckInt8AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int8 %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ - } - -#define FMK_INT16_ADDCHECK(a, b) \ - if (ge::CheckInt16AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int16 %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ +#define FMK_INT_ADDCHECK(a, b) \ + if (ge::CheckIntAddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT32_ADDCHECK(a, b) \ - if (ge::CheckInt32AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int32 %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT8_ADDCHECK(a, b) \ + if (ge::CheckInt8AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int8 %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT64_ADDCHECK(a, b) \ - if (ge::CheckInt64AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int64 %ld and %ld addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT16_ADDCHECK(a, b) \ + if (ge::CheckInt16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int16 %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT8_ADDCHECK(a, b) \ - if (ge::CheckUint8AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT8 %u and 
%u addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT32_ADDCHECK(a, b) \ + if (ge::CheckInt32AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int32 %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT16_ADDCHECK(a, b) \ - if (ge::CheckUint16AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT16 %u and %u addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT64_ADDCHECK(a, b) \ + if (ge::CheckInt64AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int64 %ld and %ld addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT32_ADDCHECK(a, b) \ - if (ge::CheckUint32AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT32 %u and %u addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT8_ADDCHECK(a, b) \ + if (ge::CheckUint8AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint8 %u and %u addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT64_ADDCHECK(a, b) \ - if (ge::CheckUint64AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_ADDCHECK(a, b) \ + if (ge::CheckUint16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("UINT16 %u and %u addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FP16_ADDCHECK(a, b) \ - if (ge::CheckFp16AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "fp16 %f and %f addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT32_ADDCHECK(a, b) \ + if (ge::CheckUint32AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint32 %u and %u addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FLOAT_ADDCHECK(a, b) \ - if (ge::CheckFloatAddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "float %f and %f addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_ADDCHECK(a, b) \ + if (ge::CheckUint64AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint64 %lu and %lu addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_DOUBLE_ADDCHECK(a, b) \ - if (ge::CheckDoubleAddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "double %lf and %lf addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FP16_ADDCHECK(a, b) \ + if (ge::CheckFp16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Fp16 %f and %f addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT_SUBCHECK(a, b) \ - if (ge::CheckIntSubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FLOAT_ADDCHECK(a, b) \ + if (ge::CheckFloatAddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Float %f and %f addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define 
FMK_INT8_SUBCHECK(a, b) \ - if (ge::CheckInt8SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT8 %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_DOUBLE_ADDCHECK(a, b) \ + if (ge::CheckDoubleAddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Double %lf and %lf addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT16_SUBCHECK(a, b) \ - if (ge::CheckInt16SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT16 %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ +#define FMK_INT_SUBCHECK(a, b) \ + if (ge::CheckIntSubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT32_SUBCHECK(a, b) \ - if (ge::CheckInt32SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT32 %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT8_SUBCHECK(a, b) \ + if (ge::CheckInt8SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int8 %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT64_SUBCHECK(a, b) \ - if (ge::CheckInt64SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT64 %ld and %ld subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT16_SUBCHECK(a, b) \ + if (ge::CheckInt16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int16 %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT8_SUBCHECK(a, b) \ - if (ge::CheckUint8SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT8 %u and %u subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT32_SUBCHECK(a, b) \ + if (ge::CheckInt32SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int32 %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT16_SUBCHECK(a, b) \ - if (ge::CheckUint16SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT16 %u and %u subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT64_SUBCHECK(a, b) \ + if (ge::CheckInt64SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int64 %ld and %ld subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT32_SUBCHECK(a, b) \ - if (ge::CheckUint32SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT32 %u and %u subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT8_SUBCHECK(a, b) \ + if (ge::CheckUint8SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint8 %u and %u subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT64_SUBCHECK(a, b) \ - if (ge::CheckUint64SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_SUBCHECK(a, b) \ + if (ge::CheckUint16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint16 
%u and %u subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FP16_SUBCHECK(a, b) \ - if (ge::CheckFp16SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "fp16 %f and %f subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT32_SUBCHECK(a, b) \ + if (ge::CheckUint32SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint32 %u and %u subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FLOAT_SUBCHECK(a, b) \ - if (ge::CheckFloatSubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "float %f and %f subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_SUBCHECK(a, b) \ + if (ge::CheckUint64SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint64 %lu and %lu subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_DOUBLE_SUBCHECK(a, b) \ - if (ge::CheckDoubleSubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "double %lf and %lf subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FP16_SUBCHECK(a, b) \ + if (ge::CheckFp16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Fp16 %f and %f subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT_MULCHECK(a, b) \ - if (ge::CheckIntMulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FLOAT_SUBCHECK(a, b) \ + if (ge::CheckFloatSubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Float %f and %f subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT8_MULCHECK(a, b) \ - if (ge::CheckInt8MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT8 %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_DOUBLE_SUBCHECK(a, b) \ + if (ge::CheckDoubleSubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Double %lf and %lf subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT16_MULCHECK(a, b) \ - if (ge::CheckInt16MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT16 %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ +#define FMK_INT_MULCHECK(a, b) \ + if (ge::CheckIntMulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int %d and %d multiplication can result in overflow!", static_cast(a), static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT32_MULCHECK(a, b) \ - if (ge::CheckInt32MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT32 %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT8_MULCHECK(a, b) \ + if (ge::CheckInt8MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int8 %d and %d multiplication can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT64_MULCHECK(a, b) \ - if (ge::Int64MulCheckOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT64 %ld and %ld multiplication can result in overflow!", static_cast(a), \ - 
static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT16_MULCHECK(a, b) \ + if (ge::CheckInt16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int16 %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT8_MULCHECK(a, b) \ - if (ge::CheckUint8MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT8 %u and %u multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT32_MULCHECK(a, b) \ + if (ge::CheckInt32MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int32 %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT16_MULCHECK(a, b) \ - if (ge::CheckUint16MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT16 %u and %u multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT64_MULCHECK(a, b) \ + if (ge::Int64MulCheckOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int64 %ld and %ld multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT32_MULCHECK(a, b) \ - if (ge::CheckUint32MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT32 %u and %u multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT8_MULCHECK(a, b) \ + if (ge::CheckUint8MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint8 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT64_MULCHECK(a, b) \ - if (ge::CheckUint64MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_MULCHECK(a, b) \ + if (ge::CheckUint16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint16 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FP16_MULCHECK(a, b) \ - if (ge::CheckFp16MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "fp16 %f and %f multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT32_MULCHECK(a, b) \ + if (ge::CheckUint32MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint32 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FLOAT_MULCHECK(a, b) \ - if (ge::CheckFloatMulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "float %f and %f multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_MULCHECK(a, b) \ + if (ge::CheckUint64MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint64 %lu and %lu multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_DOUBLE_MULCHECK(a, b) \ - if (ge::CheckDoubleMulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "double %lf and %lf multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FP16_MULCHECK(a, b) \ + if (ge::CheckFp16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Fp16 %f and %f multiplication can result in overflow!", static_cast(a), 
static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_INT_DIVCHECK(a, b) \
-  if (CheckIntDivOverflow((a), (b)) != SUCCESS) { \
-    GELOGE(INTERNAL_ERROR, "INT %d and %d division can result in overflow!", static_cast(a), \
-           static_cast(b)); \
-    return INTERNAL_ERROR; \
+#define FMK_FLOAT_MULCHECK(a, b) \
+  if (ge::CheckFloatMulOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Float %f and %f multiplication can result in overflow!", static_cast(a), static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_INT32_DIVCHECK(a, b) \
-  if (CheckInt32DivOverflow((a), (b)) != SUCCESS) { \
-    GELOGE(INTERNAL_ERROR, "INT32 %d and %d division can result in overflow!", static_cast(a), \
-           static_cast(b)); \
-    return INTERNAL_ERROR; \
+#define FMK_DOUBLE_MULCHECK(a, b) \
+  if (ge::CheckDoubleMulOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Double %lf and %lf multiplication can result in overflow!", static_cast(a), \
+           static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_INT64_UINT32_MULCHECK(a, b) \
-  if (ge::CheckInt64Uint32MulOverflow((a), (b)) != SUCCESS) { \
-    GELOGE(INTERNAL_ERROR, "INT64 %ld and UINT32 %u multiplication can result in overflow!", static_cast(a), \
-           static_cast(b)); \
-    return INTERNAL_ERROR; \
+#define FMK_INT_DIVCHECK(a, b) \
+  if (CheckIntDivOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Int %d and %d division can result in overflow!", static_cast(a), static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_FP16_ZEROCHECK(a) \
-  if (fabs(a) < DBL_EPSILON) { \
-    GELOGE(INTERNAL_ERROR, "fp16 %f can not be zero !", a); \
-    return INTERNAL_ERROR; \
+#define FMK_INT32_DIVCHECK(a, b) \
+  if (CheckInt32DivOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Int32 %d and %d division can result in overflow!", static_cast(a), static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_FLOAT_ZEROCHECK(a) \
-  if (fabs(a) < FLT_EPSILON) { \
-    GELOGE(INTERNAL_ERROR, "float %f can not be zero !", a); \
-    return INTERNAL_ERROR; \
+#define FMK_INT64_UINT32_MULCHECK(a, b) \
+  if (ge::CheckInt64Uint32MulOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Int64 %ld and UINT32 %u multiplication can result in overflow!", static_cast(a), \
+           static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_DOUBLE_ZEROCHECK(a) \
-  if (fabs(a) < DBL_EPSILON) { \
-    GELOGE(INTERNAL_ERROR, "double %lf can not be zero !", a); \
+#define FMK_FP16_ZEROCHECK(a) \
+  if (fabs(a) < DBL_EPSILON || a < 0) { \
+    GELOGW("Fp16 %f can not be less than or equal to zero!", a); \
     return INTERNAL_ERROR; \
   }
+
+#define FMK_FLOAT_ZEROCHECK(a) \
+  if (fabs(a) < FLT_EPSILON || a < 0) { \
+    GELOGW("Float %f can not be less than or equal to zero!", a); \
+    return INTERNAL_ERROR; \
+  }
+
+#define FMK_DOUBLE_ZEROCHECK(a) \
+  if (fabs(a) < DBL_EPSILON || a < 0) { \
+    GELOGW("Double %lf can not be less than or equal to zero!
", a); \ + return INTERNAL_ERROR; \ + } } // namespace ge #endif // GE_COMMON_MATH_MATH_UTIL_H_ diff --git a/src/ge/common/tbe_kernel_store.cc b/src/ge/common/tbe_kernel_store.cc index 10ed51a6..9acead2d 100644 --- a/src/ge/common/tbe_kernel_store.cc +++ b/src/ge/common/tbe_kernel_store.cc @@ -16,126 +16,19 @@ #include "common/tbe_kernel_store.h" -#include -#include - -#include "common/ge/ge_util.h" -#include "framework/common/debug/ge_log.h" -#include "framework/common/debug/log.h" - namespace ge { -const uint32_t kKernelItemMagic = 0x5d776efd; - -struct KernelStoreItemHead { - uint32_t magic; - uint32_t name_len; - uint32_t bin_len; -}; TBEKernelStore::TBEKernelStore() {} -void TBEKernelStore::AddTBEKernel(const TBEKernelPtr &kernel) { - if (kernel != nullptr) { - kernels_[kernel->GetName()] = kernel; - } -} - -bool TBEKernelStore::Build() { - buffer_.clear(); - size_t total_len = 0; - for (const auto &item : kernels_) { - auto kernel = item.second; - total_len += sizeof(KernelStoreItemHead); - total_len += kernel->GetName().length(); - total_len += kernel->GetBinDataSize(); - } - - try { - buffer_.resize(total_len); - } catch (std::bad_alloc &e) { - GELOGE(ge::MEMALLOC_FAILED, "All build memory failed, memory size %zu", total_len); - return false; - } - - uint8_t *next_buffer = buffer_.data(); - size_t remain_len = total_len; - errno_t mem_ret; - for (const auto &item : kernels_) { - auto kernel = item.second; - KernelStoreItemHead kernel_head{}; - kernel_head.magic = kKernelItemMagic; - kernel_head.name_len = static_cast(kernel->GetName().length()); - kernel_head.bin_len = static_cast(kernel->GetBinDataSize()); - - mem_ret = memcpy_s(next_buffer, remain_len, &kernel_head, sizeof(kernel_head)); - GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); - next_buffer += sizeof(kernel_head); - - mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head), kernel->GetName().data(), kernel_head.name_len); - GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); - next_buffer += kernel_head.name_len; - - mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head) - kernel_head.name_len, kernel->GetBinData(), - kernel_head.bin_len); - GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); - - next_buffer += kernel_head.bin_len; - remain_len = remain_len - sizeof(kernel_head) - kernel_head.name_len - kernel_head.bin_len; - } - kernels_.clear(); - return true; -} - -const uint8_t *TBEKernelStore::Data() const { return buffer_.data(); } - -size_t TBEKernelStore::DataSize() const { return buffer_.size(); } - -bool TBEKernelStore::Load(const uint8_t *data, const size_t &len) { - if (data == nullptr || len == 0) { - return false; - } - size_t buffer_len = len; - while (buffer_len > sizeof(KernelStoreItemHead)) { - const char *next_buffer = reinterpret_cast(data) + (len - buffer_len); - - const auto *kernel_head = reinterpret_cast(next_buffer); - if (buffer_len < kernel_head->name_len + kernel_head->bin_len + sizeof(KernelStoreItemHead)) { - GELOGW("Invalid kernel block remain buffer len %zu, name len %u, bin len %u", buffer_len, kernel_head->name_len, - kernel_head->bin_len); - break; - } - - next_buffer += sizeof(KernelStoreItemHead); - std::string name(next_buffer, kernel_head->name_len); - - next_buffer += kernel_head->name_len; - GELOGI("Load kernel from om:%s,%u,%u", name.c_str(), kernel_head->name_len, kernel_head->bin_len); - std::vector kernel_bin(next_buffer, next_buffer + kernel_head->bin_len); - TBEKernelPtr teb_kernel_ptr = ge::MakeShared(name, std::move(kernel_bin)); - if 
(teb_kernel_ptr != nullptr) {
-      kernels_.emplace(name, teb_kernel_ptr);
-    }
-    buffer_len -= sizeof(KernelStoreItemHead) + kernel_head->name_len + kernel_head->bin_len;
-  }
-
-  return true;
-}
-
-TBEKernelPtr TBEKernelStore::FindTBEKernel(const std::string &name) const {
-  auto it = kernels_.find(name);
-  if (it != kernels_.end()) {
-    return it->second;
-  }
-  return nullptr;
-}
+void TBEKernelStore::AddTBEKernel(const TBEKernelPtr &kernel) { AddKernel(kernel); }

 void TBEKernelStore::LoadTBEKernelBinToOpDesc(const std::shared_ptr &op_desc) const {
   if (op_desc != nullptr) {
-    auto tbe_kernel = FindTBEKernel(op_desc->GetName());
-    if (tbe_kernel != nullptr) {
-      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, tbe_kernel),
-                      GELOGW("LoadTBEKernelBinToOpDesc: SetExtAttr for tbe_kernel failed");)
-      GELOGI("Load tbe kernel:%s, %zu", tbe_kernel->GetName().c_str(), tbe_kernel->GetBinDataSize());
+    auto kernel_bin = FindKernel(op_desc->GetName());
+    if (kernel_bin != nullptr) {
+      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, kernel_bin),
+                      GELOGW("LoadTBEKernelBinToOpDesc: SetExtAttr for kernel_bin failed");)
+      GELOGI("Load tbe kernel:%s, %zu", kernel_bin->GetName().c_str(), kernel_bin->GetBinDataSize());
     }
   }
 }
diff --git a/src/ge/common/tbe_kernel_store.h b/src/ge/common/tbe_kernel_store.h
index 51d69af2..ab1ab9b4 100644
--- a/src/ge/common/tbe_kernel_store.h
+++ b/src/ge/common/tbe_kernel_store.h
@@ -17,38 +17,17 @@
 #ifndef GE_COMMON_TBE_KERNEL_STORE_H_
 #define GE_COMMON_TBE_KERNEL_STORE_H_

-#include
-#include
-#include
-#include
-#include
-
-#include "framework/common/fmk_types.h"
-#include "graph/op_desc.h"
-#include "graph/op_kernel_bin.h"
+#include "common/kernel_store.h"

 namespace ge {
-using TBEKernel = ge::OpKernelBin;
-using TBEKernelPtr = std::shared_ptr;
-class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY TBEKernelStore {
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY TBEKernelStore : public KernelStore {
  public:
   TBEKernelStore();
-  ~TBEKernelStore() = default;
+  ~TBEKernelStore() {}
   void AddTBEKernel(const TBEKernelPtr &kernel);
-  bool Build();
-
-  bool Load(const uint8_t *data, const size_t &len);
-  TBEKernelPtr FindTBEKernel(const std::string &name) const;
   void LoadTBEKernelBinToOpDesc(const std::shared_ptr &op_desc) const;
-
-  const uint8_t *Data() const;
-  size_t DataSize() const;
-
- private:
-  std::unordered_map kernels_;
-  std::vector buffer_;
 };
 } // namespace ge
diff --git a/src/ge/engine_manager/dnnengine_manager.cc b/src/ge/engine_manager/dnnengine_manager.cc
index fe3c1bc8..3389e1b9 100644
--- a/src/ge/engine_manager/dnnengine_manager.cc
+++ b/src/ge/engine_manager/dnnengine_manager.cc
@@ -26,7 +26,10 @@
 #include "common/ge/ge_util.h"
 #include "common/util/error_manager/error_manager.h"
 #include "framework/common/debug/ge_log.h"
+#include "analyzer/analyzer.h"
 #include "graph/ge_context.h"
+#include "graph/utils/graph_utils.h"
+#include "graph/utils/node_utils.h"
 #include "init/gelib.h"

 namespace {
@@ -164,11 +167,22 @@ bool DNNEngineManager::IsEngineRegistered(const std::string &name) {
   return false;
 }

-void DNNEngineManager::InitPerformanceStaistic() { checksupport_cost_.clear(); }
+void DNNEngineManager::InitPerformanceStaistic() {
+  std::lock_guard lock(mutex_);
+  checksupport_cost_.clear();
+}
+
+const map &DNNEngineManager::GetCheckSupportCost() const {
+  std::lock_guard lock(mutex_);
+  return checksupport_cost_;
+}

-const map &DNNEngineManager::GetCheckSupportCost() const { return checksupport_cost_; }
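The hunk resumes below with GetDNNEngineName reworked to take a NodePtr, so every caller now hands over the node itself rather than its OpDesc. A minimal call-site sketch (the pattern mirrors the updated call sites in model_cache_helper.cc and graph_builder.cc elsewhere in this patch; error handling trimmed):

```cpp
#include <string>
#include "graph/node.h"
#include "init/gelib.h"

// Sketch only: how a caller migrates to the node-based overload added below.
std::string AssignEngine(const ge::NodePtr &node) {
  auto instance = ge::GELib::GetInstance();
  if ((instance == nullptr) || !instance->InitFlag()) {
    return "";  // GE not initialized; real call sites log and bail out
  }
  // Passing the node (not just its OpDesc) lets the manager walk to the owner
  // compute graph and report unsupported ops to the analyzer.
  return instance->DNNEngineManagerObj().GetDNNEngineName(node);
}
```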
+std::string DNNEngineManager::GetDNNEngineName(const ge::NodePtr &node_ptr) { + std::lock_guard lock(mutex_); -std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { + GE_IF_BOOL_EXEC(node_ptr == nullptr, GELOGE(GE_CLI_GE_NOT_INITIALIZED, "DNNEngineManager: node_ptr is nullptr"); + return ""); + auto op_desc = node_ptr->GetOpDesc(); GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(GE_CLI_GE_NOT_INITIALIZED, "DNNEngineManager: op_desc is nullptr"); return ""); // Use the OpsKernelManager in GELib to get the opInfos for this opCode @@ -190,6 +204,7 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { std::string exclude_core_Type = (ge_core_type == kVectorCore) ? kAIcoreEngine : kVectorEngine; GELOGD("engine type will exclude: %s", exclude_core_Type.c_str()); + auto root_graph = ge::GraphUtils::FindRootGraph(node_ptr->GetOwnerComputeGraph()); std::map unsupported_reasons; for (const auto &it : op_infos) { if (it.engine == exclude_core_Type) { @@ -206,6 +221,9 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { checksupport_cost_[kernel_name] += GetCurrentTimestap() - start_time; op_desc->SetOpEngineName(it.engine); op_desc->SetOpKernelLibName(kernel_name); + // set attrs for taking information when load txt to graph object + (void)AttrUtils::SetStr(op_desc, ATTR_NAME_ENGINE_NAME_FOR_LX, it.engine); + (void)AttrUtils::SetStr(op_desc, ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX, kernel_name); GELOGD("DNNEngineManager:Set OpKernelLibName %s and engine name %s to op_desc %s", kernel_name.c_str(), it.engine.c_str(), op_desc->GetName().c_str()); return it.engine; @@ -219,6 +237,9 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { "The custom operator registered by the user does not support the logic function delivered by this " "network. 
Check support failed, kernel_name is %s, op type is %s, op name is %s",
                kernel_name.c_str(), op_desc->GetType().c_str(), op_desc->GetName().c_str());
+        std::string error_info =
+          "The custom operator registered by the user does not support the logic function "
+          "delivered by this network";
         return "";
       }
       unsupported_reasons.emplace(kernel_name, unsupported_reason);
@@ -235,12 +256,22 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) {
                 kernel_name.c_str(), op_desc->GetType().c_str(), op_desc->GetName().c_str());
       }
     }
+
+  // concat unsupported reasons for analyzer data selection
+  string reason;
   for (const auto &it : unsupported_reasons) {
+    reason += it.first + ":" + it.second + ";";
     ErrorManager::GetInstance().ATCReportErrMessage("E13002", {"optype", "opskernel", "reason"},
                                                     {op_desc->GetType(), it.first, it.second});
     GELOGE(GE_GRAPH_ASSIGN_ENGINE_FAILED, "GetDNNEngineName:Op type %s of ops kernel %s is unsupported, reason:%s",
            op_desc->GetType().c_str(), it.first.c_str(), it.second.c_str());
   }
+
+  analyzer::DataInfo analyze_info{root_graph->GetSessionID(), root_graph->GetGraphID(), analyzer::CHECKSUPPORT,
+                                  node_ptr, reason};
+  // do not change original process
+  (void)Analyzer::GetInstance()->DoAnalyze(analyze_info);
+
   ErrorManager::GetInstance().ATCReportErrMessage("E13003", {"opname", "optype"},
                                                   {op_desc->GetName(), op_desc->GetType()});
   GELOGE(GE_GRAPH_ASSIGN_ENGINE_FAILED, "Can't find any supported ops kernel and engine of %s, type is %s",
diff --git a/src/ge/engine_manager/dnnengine_manager.h b/src/ge/engine_manager/dnnengine_manager.h
index 6d5b02f9..c3ae5b95 100644
--- a/src/ge/engine_manager/dnnengine_manager.h
+++ b/src/ge/engine_manager/dnnengine_manager.h
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include

 #include "nlohmann/json.hpp"

@@ -29,6 +30,7 @@
 #include "common/opskernel/ops_kernel_info_types.h"
 #include "engine/dnnengine.h"
 #include "graph/op_desc.h"
+#include "graph/node.h"

 using JsonHandle = void *;
 namespace ge {
@@ -61,7 +63,7 @@ class DNNEngineManager {
   std::shared_ptr GetEngine(const std::string &name) const;
   bool IsEngineRegistered(const std::string &name);
   // If can't find appropriate engine name, return "", report error
-  string GetDNNEngineName(const OpDescPtr &op_desc);
+  string GetDNNEngineName(const ge::NodePtr &node_ptr);
   const map &GetSchedulers() const;
   const map &GetCheckSupportCost() const;
   void InitPerformanceStaistic();
@@ -83,6 +85,7 @@ class DNNEngineManager {
   std::map schedulers_;
   std::map checksupport_cost_;
   bool init_flag_;
+  mutable std::mutex mutex_;
 };
 } // namespace ge
diff --git a/src/ge/executor/CMakeLists.txt b/src/ge/executor/CMakeLists.txt
index 17508711..f3956e31 100755
--- a/src/ge/executor/CMakeLists.txt
+++ b/src/ge/executor/CMakeLists.txt
@@ -22,6 +22,7 @@ file(GLOB PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
         "../../proto/insert_op.proto"
         "../../proto/op_mapping_info.proto"
         "../../proto/ge_ir.proto"
+        "../proto/dump_task.proto"
     )

 file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
@@ -68,6 +69,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../graph/manager/graph_manager_utils.cc"
     "../graph/manager/graph_mem_allocator.cc"
     "../graph/manager/graph_var_manager.cc"
+    "../graph/manager/rdma_pool_allocator.cc"
     "../graph/manager/trans_var_data_utils.cc"
     "../graph/manager/util/debug.cc"
     "../hybrid/hybrid_davinci_model_stub.cc"
diff --git a/src/ge/executor/ge_executor.cc b/src/ge/executor/ge_executor.cc
index 0d334042..b4e9df35 100644
--- a/src/ge/executor/ge_executor.cc
+++ b/src/ge/executor/ge_executor.cc
@@ -344,47 +344,19 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u
     return FAILED;
   }

-  Status ret = GraphExecutor::SetDynamicSize(model_id, dynamic_dims, static_cast(DYNAMIC_DIMS));
+  vector cur_dynamic_dims;
+  Status ret = GetCurDynamicDims(model_id, dynamic_dims, cur_dynamic_dims);
   if (ret != SUCCESS) {
-    GELOGE(FAILED, "Set dynamic size failed");
+    GELOGE(FAILED, "Set cur gear dynamic dims failed");
     return FAILED;
   }
-  vector cur_dynamic_dims;
-  std::vector input_desc;
-  std::vector output_desc;
-  ret = GetModelDescInfo(model_id, input_desc, output_desc);
-  if (ret != ge::SUCCESS) {
-    GELOGE(FAILED, "GetModelDescInfo failed.");
-    return FAILED;
-  }
-  vector user_designate_shape_order;
-  vector all_data_dims;
-  ret = GetUserDesignateShapeOrder(model_id, user_designate_shape_order);
-  if (ret != ge::SUCCESS) {
-    GELOGE(FAILED, "GetUserDesignateShapeOrder failed.");
-    return FAILED;
-  }
-  for (auto &data_name : user_designate_shape_order) {
-    for (size_t j = 0; j < input_desc.size(); ++j) {
-      if (input_desc.at(j).GetName() == data_name) {
-        for (auto dim : input_desc.at(j).GetShape().GetDims()) {
-          all_data_dims.push_back(dim);
-        }
-        break;
-      }
-    }
-  }
-  if (dynamic_dims.size() != all_data_dims.size()) {
-    GELOGE(FAILED, "Dynamic input size [%lu] is not equal with all data dims size [%lu]!", dynamic_dims.size(),
-           all_data_dims.size());
+  ret = GraphExecutor::SetDynamicSize(model_id, cur_dynamic_dims, static_cast(DYNAMIC_DIMS));
+  if (ret != SUCCESS) {
+    GELOGE(FAILED, "Set dynamic size failed");
     return FAILED;
   }
-  for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
-    if (all_data_dims[i] < 0) {
-      cur_dynamic_dims.push_back(dynamic_dims[i]);
-    }
-  }
+
   size_t dynamic_dim_num = cur_dynamic_dims.size();
   uint64_t dynamic_input_size = static_cast(dynamic_dim_num * sizeof(uint64_t));
   if (length < dynamic_input_size) {
@@ -403,58 +375,43 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u
   return SUCCESS;
 }

-Status GeExecutor::GetCurDynamicDims(uint32_t model_id, const vector &combined_dims,
+Status GeExecutor::GetCurDynamicDims(uint32_t model_id, const vector &dynamic_dims,
                                      vector &cur_dynamic_dims) {
-  vector> combined_batch;
-  if (GraphExecutor::GetCombinedDynamicDims(model_id, combined_batch) != SUCCESS) {
-    GELOGE(FAILED, "Get combined dynamic dims info failed.");
-    return FAILED;
-  }
-  if (combined_batch.empty()) {
-    GELOGE(FAILED, "Combined dynamic dims is empty.");
+  cur_dynamic_dims.clear();
+  vector input_desc;
+  vector output_desc;
+  auto ret = GetModelDescInfo(model_id, input_desc, output_desc);
+  if (ret != ge::SUCCESS) {
+    GELOGE(FAILED, "GetModelDescInfo failed.");
     return FAILED;
   }
-
-  if (combined_dims.size() != combined_batch[0].size()) {
-    GELOGE(FAILED, "Input dynamic dims's dimension size[%zu] is different from model[%zu].", combined_dims.size(),
-           combined_batch[0].size());
+  vector user_designate_shape_order;
+  vector all_data_dims;
+  ret = GetUserDesignateShapeOrder(model_id, user_designate_shape_order);
+  if (ret != ge::SUCCESS) {
+    GELOGE(FAILED, "GetUserDesignateShapeOrder failed.");
     return FAILED;
   }
-  bool matched = false;
-  size_t idx = 0;
-  for (size_t i = 0; i < combined_batch.size(); i++) {
-    bool is_match = true;
-    for (size_t j = 0; j < combined_dims.size(); j++) {
-      if (combined_dims[j] != static_cast(combined_batch[i][j])) {
-        is_match = false;
+  for (auto &data_name : user_designate_shape_order) {
+    for (auto &desc : input_desc) {
+      if (desc.GetName() == data_name) {
+        for (auto dim : desc.GetShape().GetDims()) {
+          all_data_dims.push_back(dim);
+        }
         break;
       }
     }
-    if (is_match) {
-      idx = i;
-      matched = true;
-      break;
-    }
-  }
-
-  if (!matched) {
-    GELOGE(FAILED, "Input dynamic dims can not match model.");
-    return FAILED;
   }
-
-  // batch_info save the dynamic info of combined_dims
-  vector> batch_info;
-  int32_t dynamic_type = static_cast(FIXED);
-  if (GraphExecutor::GetDynamicBatchInfo(model_id, batch_info, dynamic_type) != SUCCESS) {
-    GELOGE(FAILED, "Get dynamic input info failed.");
+  if (dynamic_dims.size() != all_data_dims.size()) {
+    GELOGE(FAILED, "Dynamic input size [%lu] is not equal to all data dims size [%lu]!", dynamic_dims.size(),
+           all_data_dims.size());
     return FAILED;
   }
-
-  cur_dynamic_dims.clear();
-  for (size_t i = 0; i < batch_info[idx].size(); i++) {
-    cur_dynamic_dims.emplace_back(static_cast(batch_info[idx][i]));
+  for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
+    if (all_data_dims[i] < 0) {
+      cur_dynamic_dims.push_back(dynamic_dims[i]);
+    }
   }
-
   return SUCCESS;
 }
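A worked example of the filtering rule above, as a standalone sketch (shape values hypothetical; element types assumed to match the executor's vectors): the dims of all data inputs, concatenated in user-designated order, are compared position by position against the user's full dim list, and only the positions the model marks dynamic (dim < 0) are kept as the current gear.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Dims of all data inputs concatenated in user-designated order; -1 marks a dynamic dim.
  std::vector<int64_t> all_data_dims = {-1, 3, -1, 224};
  // User-supplied values, one per entry of all_data_dims (sizes must match, as checked above).
  std::vector<uint64_t> dynamic_dims = {8, 3, 640, 224};

  std::vector<uint64_t> cur_dynamic_dims;
  for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
    if (all_data_dims[i] < 0) {
      cur_dynamic_dims.push_back(dynamic_dims[i]);
    }
  }
  // Only the dynamic positions survive; {8, 640} is what SetDynamicSize receives.
  assert((cur_dynamic_dims == std::vector<uint64_t>{8, 640}));
  return 0;
}
```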
@@ -924,13 +881,6 @@ Status GeExecutor::ExecModel(uint32_t model_id, void *stream, const ge::RunModel
     GELOGE(ret, "Get dynamic input info failed.");
     return ret;
   }
-  if (dynamic_type == static_cast(DYNAMIC_DIMS)) {
-    ret = GraphExecutor::GetCombinedDynamicDims(model_id, batch_info);
-    if (ret != SUCCESS) {
-      GELOGE(FAILED, "Get dynamic input info failed.");
-      return FAILED;
-    }
-  }
   if (!batch_info.empty()) {
     SetDynamicInputDataFlag(run_input_data, batch_info, input_data);
   }
diff --git a/src/ge/executor/module.mk b/src/ge/executor/module.mk
index 878341b6..6b2de8f2 100644
--- a/src/ge/executor/module.mk
+++ b/src/ge/executor/module.mk
@@ -13,6 +13,7 @@ local_ge_executor_src_files := \
     ../omm/csa_interact.cc \
     ../graph/manager/graph_manager_utils.cc \
    ../graph/manager/graph_var_manager.cc \
+    ../graph/manager/rdma_pool_allocator.cc \
     ../graph/manager/graph_mem_allocator.cc \
     ../graph/manager/graph_caching_allocator.cc \
     ../graph/manager/trans_var_data_utils.cc \
@@ -63,6 +64,7 @@ local_ge_executor_src_files := \
 local_ge_executor_c_include := \
     proto/insert_op.proto \
     proto/op_mapping_info.proto \
+    proto/dump_task.proto \
     proto/ge_ir.proto \
     proto/task.proto \
     proto/om.proto \
diff --git a/src/ge/ge_inference.mk b/src/ge/ge_inference.mk
index 0cc0d6fb..3b9e17ea 100644
--- a/src/ge/ge_inference.mk
+++ b/src/ge/ge_inference.mk
@@ -59,6 +59,7 @@ GRAPH_MANAGER_LOCAL_SRC_FILES := \
     generator/ge_generator.cc \
     generator/generator_api.cc \
     graph/manager/graph_var_manager.cc \
+    graph/manager/rdma_pool_allocator.cc \
     graph/manager/graph_mem_allocator.cc \
     graph/manager/graph_caching_allocator.cc \

@@ -66,6 +67,9 @@ BUILER_SRC_FILES := \
     ir_build/ge_ir_build.cc \
     ir_build/atc_ir_common.cc \

+ANALYZER_SRC_FILES:= \
+    analyzer/analyzer.cc \
+
 OMG_HOST_SRC_FILES := \
     model/ge_model.cc \
     model/ge_root_model.cc \
@@ -103,6 +107,7 @@ OMG_HOST_SRC_FILES := \
     graph/passes/mark_graph_unknown_status_pass.cc \
     graph/common/omg_util.cc \
     graph/common/bcast.cc \
+    graph/common/local_context.cc \
     graph/passes/dimension_compute_pass.cc \
     graph/passes/dimension_adjust_pass.cc \
     graph/passes/get_original_format_pass.cc \
@@ -260,6 +265,7 @@ COMMON_LOCAL_C_INCLUDES := \
     proto/ge_ir.proto \
     proto/fwk_adapter.proto \
     proto/op_mapping_info.proto \
+    proto/dump_task.proto \
     proto/tensorflow/attr_value.proto \
     proto/tensorflow/function.proto \
     proto/tensorflow/graph.proto \
@@ -284,6 +290,9 @@ COMMON_LOCAL_C_INCLUDES := \
     third_party/protobuf/include \
     third_party/opencv/include \

+ANALYZER_LOCAL_INCLUDES := \ + $(TOPDIR)framework/domi/analyzer \ + NEW_OMG_HOST_SRC_FILES := \ graph/preprocess/insert_op/util_insert_aipp_op.cc \ graph/preprocess/insert_op/ge_aipp_op.cc \ @@ -348,6 +357,7 @@ LOCAL_CFLAGS += -g -O0 endif LOCAL_C_INCLUDES := $(COMMON_LOCAL_C_INCLUDES) +LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES) LOCAL_SRC_FILES := $(COMMON_LOCAL_SRC_FILES) LOCAL_SRC_FILES += $(GRAPH_MANAGER_LOCAL_SRC_FILES) @@ -355,6 +365,7 @@ LOCAL_SRC_FILES += $(OMG_HOST_SRC_FILES) LOCAL_SRC_FILES += $(OME_HOST_SRC_FILES) LOCAL_SRC_FILES += $(NEW_OME_DEVICE_SRC_FILES) LOCAL_SRC_FILES += $(BUILER_SRC_FILES) +LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES) LOCAL_STATIC_LIBRARIES := libge_memory \ @@ -414,9 +425,11 @@ LOCAL_SRC_FILES += $(GRAPH_MANAGER_LOCAL_SRC_FILES) LOCAL_SRC_FILES += $(OMG_DEVICE_SRC_FILES) LOCAL_SRC_FILES += $(OME_DEVICE_SRC_FILES) LOCAL_SRC_FILES += $(BUILER_SRC_FILES) +LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES) LOCAL_C_INCLUDES := $(DEVICE_LOCAL_C_INCLUDES) +LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES) LOCAL_STATIC_LIBRARIES := libge_memory \ diff --git a/src/ge/ge_local_engine/engine/host_cpu_engine.cc b/src/ge/ge_local_engine/engine/host_cpu_engine.cc index fd1b20d3..eb7741c0 100644 --- a/src/ge/ge_local_engine/engine/host_cpu_engine.cc +++ b/src/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -19,10 +19,48 @@ #include "graph/common/omg_util.h" #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_adapter.h" -#include "mmpa/mmpa_api.h" #include "register/op_kernel_registry.h" +#include "register/host_cpu_context.h" #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" +#include "graph/utils/type_utils.h" +#include "common/fp16_t.h" + +namespace { +#define CREATE_OUTPUT_CASE(DTYPE, TYPE) \ + case (DTYPE): { \ + GeTensorPtr ge_tensor = nullptr; \ + if (need_create_flag) { \ + int64_t data_num = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \ + std::unique_ptr buf(new (std::nothrow) TYPE[data_num]()); \ + if (buf == nullptr) { \ + GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \ + static_cast(sizeof(TYPE) * data_num)); \ + return MEMALLOC_FAILED; \ + } \ + ge_tensor = MakeShared(out_desc); \ + GE_CHECK_NOTNULL(ge_tensor); \ + GELOGI("node:%s allocate output %zu, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \ + ge_tensor->SetData(reinterpret_cast(buf.get()), data_num * sizeof(TYPE)); \ + ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \ + ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \ + outputs.emplace_back(ge_tensor); \ + } else { \ + ge_tensor = outputs[i]; \ + GE_CHECK_NOTNULL(ge_tensor); \ + GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \ + reinterpret_cast(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \ + } \ + auto tensor = TensorAdapter::AsTensor(*ge_tensor); \ + auto tensor_name = op_desc->GetOutputNameByIndex(i); \ + GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \ + op_desc->GetName().c_str(), i); \ + GELOGD("Successfully inserted output tensor. 
node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \
+           op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \
+    named_outputs.emplace(tensor_name, tensor); \
+    break; \
+  }
+}  // namespace

 namespace ge {
 namespace {
@@ -105,17 +143,32 @@ Status HostCpuEngine::PrepareInputs(const ge::ConstOpDescPtr &op_desc, const vec

 Status HostCpuEngine::PrepareOutputs(const ge::ConstOpDescPtr &op_desc, vector &outputs,
                                      map &named_outputs) {
+  if (!outputs.empty() && (outputs.size() != op_desc->GetOutputsSize())) {
+    GELOGW("Size of outputs does not match, size of outputs = %zu, expected output_num = %zu.", outputs.size(),
+           op_desc->GetOutputsSize());
+    outputs.clear();
+  }
+  bool need_create_flag = (outputs.size() != op_desc->GetOutputsSize());
   for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) {
-    auto ge_tensor = MakeShared(op_desc->GetOutputDesc(i));
-    GE_CHECK_NOTNULL(ge_tensor);
-    outputs.emplace_back(ge_tensor);
-    auto tensor = TensorAdapter::AsTensor(*ge_tensor);
-    auto tensor_name = op_desc->GetOutputNameByIndex(i);
-    GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu",
-                               op_desc->GetName().c_str(), i);
-    GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s", op_desc->GetName().c_str(),
-           i, tensor_name.c_str());
-    named_outputs.emplace(tensor_name, tensor);
+    const auto &out_desc = op_desc->GetOutputDesc(i);
+    switch (out_desc.GetDataType()) {
+      CREATE_OUTPUT_CASE(DT_BOOL, bool)
+      CREATE_OUTPUT_CASE(DT_INT8, int8_t)
+      CREATE_OUTPUT_CASE(DT_INT16, int16_t)
+      CREATE_OUTPUT_CASE(DT_INT32, int32_t)
+      CREATE_OUTPUT_CASE(DT_INT64, int64_t)
+      CREATE_OUTPUT_CASE(DT_UINT8, uint8_t)
+      CREATE_OUTPUT_CASE(DT_UINT16, uint16_t)
+      CREATE_OUTPUT_CASE(DT_UINT32, uint32_t)
+      CREATE_OUTPUT_CASE(DT_UINT64, uint64_t)
+      CREATE_OUTPUT_CASE(DT_FLOAT16, fp16_t)
+      CREATE_OUTPUT_CASE(DT_FLOAT, float)
+      CREATE_OUTPUT_CASE(DT_DOUBLE, double)
+      default:
+        GELOGE(PARAM_INVALID, "Data type %s is not supported.",
+               TypeUtils::DataTypeToSerialString(out_desc.GetDataType()).c_str());
+        return PARAM_INVALID;
+    }
   }

   return SUCCESS;
@@ -146,6 +199,7 @@ Status HostCpuEngine::Run(NodePtr &node, const vector &inputs,
   std::map named_inputs;
   std::vector tmp_outputs;
+  tmp_outputs.swap(outputs);
   std::map named_outputs;
   auto op_desc = node->GetOpDesc();
   GE_CHK_STATUS_RET_NOLOG(PrepareInputs(op_desc, inputs, named_inputs));
@@ -233,6 +287,15 @@ Status HostCpuEngine::LoadLib(const std::string &lib_path) {
     return INTERNAL_ERROR;
   }

+  auto initialize = (Status(*)(const HostCpuContext &))dlsym(handle, "Initialize");
+  if (initialize != nullptr) {
+    GELOGI("Invoke function Initialize in lib: %s", lib_path.c_str());
+    if (initialize(HostCpuContext()) != SUCCESS) {
+      GELOGW("Failed to invoke function Initialize in lib: %s", lib_path.c_str());
+    }
+  }
+
+  GELOGI("Lib: %s has been opened", lib_path.c_str());
   lib_handles_.emplace_back(handle);
   return SUCCESS;
 }
@@ -247,4 +310,4 @@ Status HostCpuEngine::GetRealPath(std::string &path) {
   path = real_path;
   return SUCCESS;
 }
-}  // namespace ge
\ No newline at end of file
+}  // namespace ge
diff --git a/src/ge/ge_runner.mk b/src/ge/ge_runner.mk
index 66e2be5a..b4d27b1b 100644
--- a/src/ge/ge_runner.mk
+++ b/src/ge/ge_runner.mk
@@ -42,6 +42,7 @@ LIBGE_LOCAL_SRC_FILES := \
     graph/build/stream_graph_optimizer.cc \
     graph/build/task_generator.cc \
     graph/common/bcast.cc \
+    graph/common/local_context.cc \
     graph/common/omg_util.cc \
     graph/common/transop_util.cc \
     graph/execute/graph_execute.cc \
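(Aside on the host_cpu_engine.cc hunk above: the dlsym'd "Initialize" entry point is optional, and its failure is deliberately non-fatal. A library that wants the hook would export something like the sketch below; the exact namespace and typedefs are assumptions inferred from the includes in the patch, and the context argument is treated as opaque.)

```cpp
// Library-side sketch of the optional hook; dlsym(handle, "Initialize") requires C linkage.
#include "ge/ge_api_error_codes.h"
#include "register/host_cpu_context.h"

extern "C" ge::Status Initialize(const ge::HostCpuContext &context) {
  // One-time setup for the custom host-CPU op library. Returning a non-SUCCESS
  // status only logs a warning on the engine side; library loading still continues.
  return ge::SUCCESS;
}
```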
@@ -88,6 +89,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/manager/graph_mem_allocator.cc \ graph/manager/graph_caching_allocator.cc \ graph/manager/graph_var_manager.cc \ + graph/manager/rdma_pool_allocator.cc \ graph/manager/model_manager/event_manager.cc \ graph/manager/trans_var_data_utils.cc \ graph/manager/util/debug.cc \ @@ -289,6 +291,7 @@ LIBGE_LOCAL_SRC_FILES := \ hybrid/node_executor/task_context.cc \ hybrid/hybrid_davinci_model.cc \ executor/ge_executor.cc \ + analyzer/analyzer.cc \ LIBCLIENT_LOCAL_SRC_FILES := \ proto/ge_api.proto \ @@ -308,11 +311,13 @@ RUNNER_LOCAL_C_INCLUDES := \ $(TOPDIR)inc/runtime \ $(TOPDIR)libc_sec/include \ $(TOPDIR)ops/built-in/op_proto/inc \ + $(TOPDIR)framework/domi/analyzer \ proto/fwk_adapter.proto \ proto/ge_ir.proto \ proto/insert_op.proto \ proto/om.proto \ proto/op_mapping_info.proto \ + proto/dump_task.proto \ proto/task.proto \ proto/tensorflow/attr_value.proto \ proto/tensorflow/function.proto \ diff --git a/src/ge/ge_runtime/task/aicpu_task.cc b/src/ge/ge_runtime/task/aicpu_task.cc index 15324919..5b3d8e82 100644 --- a/src/ge/ge_runtime/task/aicpu_task.cc +++ b/src/ge/ge_runtime/task/aicpu_task.cc @@ -75,7 +75,8 @@ bool AicpuTask::Distribute() { return false; } - flag = rtMemcpy(ext_info_, ext_size, reinterpret_cast(ext_info.data()), ext_size, RT_MEMCPY_HOST_TO_DEVICE); + flag = rtMemcpy(ext_info_, ext_size, const_cast(reinterpret_cast(ext_info.data())), ext_size, + RT_MEMCPY_HOST_TO_DEVICE); if (flag != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api(rtMemCpy) failed, ret: 0x%X.", flag); return false; diff --git a/src/ge/generator/ge_generator.cc b/src/ge/generator/ge_generator.cc index 0d4fac3f..edd7a155 100644 --- a/src/ge/generator/ge_generator.cc +++ b/src/ge/generator/ge_generator.cc @@ -15,6 +15,9 @@ */ #include "generator/ge_generator.h" + +#include + #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" #include "common/helper/model_helper.h" @@ -212,6 +215,9 @@ static void GetOpsProtoPath(string &opsproto_path) { class GeGenerator::Impl { public: + Impl(OmgContext &omg_context) : omg_context_(omg_context), graph_manager_(omg_context) {} + ~Impl() = default; + Status BuildModel(const Graph &graph, const vector &inputs, GeRootModelPtr &ge_models); Status SaveModel(const string &file_name_prefix, GeModelPtr &models, ModelBufferData &model); @@ -221,10 +227,14 @@ class GeGenerator::Impl { Status GenerateInfershapeGraph(const Graph &graph); + OmgContext &omg_context_; GraphManager graph_manager_; SaveParam save_param_; bool is_offline_ = true; bool is_singleop_unregistered_ = false; + std::string build_mode_; + std::string build_step_; + static std::mutex mutex_; private: static std::string Trim(const std::string &str); @@ -234,8 +244,10 @@ class GeGenerator::Impl { bool SetOppVersionInfo(AttrHolder &obj); }; -Status GeGenerator::Initialize(const map &options) { - impl_ = ge::MakeShared(); +Status GeGenerator::Initialize(const map &options) { return Initialize(options, domi::GetContext()); } + +Status GeGenerator::Initialize(const map &options, OmgContext &omg_context) { + impl_ = ge::MakeShared(omg_context); if (impl_ == nullptr) { GELOGE(MEMALLOC_FAILED, "Make shared failed"); return MEMALLOC_FAILED; @@ -273,6 +285,17 @@ Status GeGenerator::Initialize(const map &options) { if (iter != options.end()) { impl_->save_param_.pri_key_file = iter->second; } + + // get build mode + iter = options.find(BUILD_MODE); + if (iter != options.end()) { + impl_->build_mode_ = iter->second; + } + // get build step + iter = 
options.find(BUILD_STEP);
+  if (iter != options.end()) {
+    impl_->build_step_ = iter->second;
+  }
   return SUCCESS;
 }

@@ -312,6 +335,8 @@ Status GeGenerator::GenerateInfershapeGraph(const Graph &graph) {
   return SUCCESS;
 }

+std::mutex GeGenerator::Impl::mutex_;
+
 // Remove the space and tab before and after the string
 std::string GeGenerator::Impl::Trim(const std::string &str) {
   if (str.empty()) {
@@ -436,8 +461,7 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
   auto rt = rtCtxGetCurrent(&ctx);
   if (rt != RT_ERROR_NONE) {
     GELOGW("Current ctx is null.");
-  } else {
-    ge::RtContextUtil::GetInstance().SetNormalModeContext(ctx);
+    ctx = nullptr;
   }

   GeRootModelPtr ge_root_model = nullptr;
@@ -451,6 +475,17 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
     }
     return ret;
   }
+
+  /// BUILD_MODE_TUNING with BUILD_STEP_BEFORE_UB_MATCH does not need to save the model;
+  /// BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER does not need to save the model;
+  /// BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER_SUB does not need to save the model.
+  if ((impl_->build_mode_ == BUILD_MODE_TUNING) &&
+      (impl_->build_step_ == BUILD_STEP_BEFORE_UB_MATCH || impl_->build_step_ == BUILD_STEP_AFTER_BUILDER ||
+       impl_->build_step_ == BUILD_STEP_AFTER_BUILDER_SUB)) {
+    GELOGI("Build mode:%s with step:%s does not need to save the model.", impl_->build_mode_.c_str(), impl_->build_step_.c_str());
+    return SUCCESS;
+  }
+
   GE_CHECK_NOTNULL(ge_root_model);
   GE_CHECK_NOTNULL(ge_root_model->GetRootGraph());
   ModelHelper model_helper;
@@ -474,8 +509,8 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
     return ret;
   }

-  if (RtContextUtil::GetInstance().GetNormalModeContext() != nullptr) {
-    (void)rtCtxSetCurrent(RtContextUtil::GetInstance().GetNormalModeContext());
+  if (ctx != nullptr) {
+    (void)rtCtxSetCurrent(ctx);
   }

   GELOGI("GenerateOfflineModel success.");
@@ -495,7 +530,8 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in
     return PARAM_INVALID;
   }

-  domi::GetContext().is_dynamic_input = ContainsDynamicInpus(*op_desc);
+  OmgContext &omg_context = (impl_ == nullptr) ?
domi::GetContext() : impl_->omg_context_; + omg_context.is_dynamic_input = ContainsDynamicInpus(*op_desc); if (op_desc->HasAttr(ATTR_NAME_UNREGST_OPPATH)) { impl_->is_singleop_unregistered_ = true; @@ -633,35 +669,32 @@ Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, GeModelPtr & Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector &inputs, GeRootModelPtr &ge_root_model) { - static GraphId id = 0; + static std::atomic atomic_graph_id(0); + auto graph_id = atomic_graph_id.fetch_add(1); const std::map options; - Status ret = graph_manager_.AddGraph(id, graph, options); + Status ret = graph_manager_.AddGraph(graph_id, graph, options); if (ret != SUCCESS) { - GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph fail, graph id: %u", id); + GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph fail, graph id: %u", graph_id); (void)graph_manager_.Finalize(); return GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED; } GELOGI("Model inputs size is %zu", inputs.size()); graph_manager_.SetOptionsRunGraphFlag(false); - struct timeval tv; - if (gettimeofday(&tv, nullptr) != 0) { - GELOGE(INTERNAL_ERROR, "get the time of day failed."); - return INTERNAL_ERROR; - } - uint64_t session_id = static_cast(tv.tv_sec * 1000000 + tv.tv_usec); // 1000000us + + static std::atomic atomic_session_id(0); + auto session_id = atomic_session_id.fetch_add(1); if (is_singleop_unregistered_) { - ret = graph_manager_.BuildGraphForUnregisteredOp(id, inputs, ge_root_model, session_id); + ret = graph_manager_.BuildGraphForUnregisteredOp(graph_id, inputs, ge_root_model, session_id); } else { - ret = graph_manager_.BuildGraph(id, inputs, ge_root_model, session_id); + ret = graph_manager_.BuildGraph(graph_id, inputs, ge_root_model, session_id); } if (ret != SUCCESS) { - GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager build graph fail, graph id: %u", id); + GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager build graph fail, graph id: %u", graph_id); VarManagerPool::Instance().RemoveVarManager(session_id); return GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED; } - id += 1; VarManagerPool::Instance().RemoveVarManager(session_id); @@ -669,21 +702,21 @@ Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector } Status GeGenerator::Impl::GenerateInfershapeGraph(const Graph &graph) { - static GraphId id = 0; + static std::atomic atomic_graph_id(0); + auto graph_id = atomic_graph_id.fetch_add(1); const std::map options; - Status ret = graph_manager_.AddGraph(id, graph, options); + Status ret = graph_manager_.AddGraph(graph_id, graph, options); if (ret != SUCCESS) { - GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph failed, graph id: %u", id); + GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph failed, graph id: %u", graph_id); (void)graph_manager_.Finalize(); return GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED; } - ret = graph_manager_.GenerateInfershapeGraph(id); + ret = graph_manager_.GenerateInfershapeGraph(graph_id); if (ret != SUCCESS) { GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager generate graph failed"); return GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED; } - id += 1; return SUCCESS; } diff --git a/src/ge/graph/build/graph_builder.cc b/src/ge/graph/build/graph_builder.cc index ac83d4ec..27d0b13f 100644 --- a/src/ge/graph/build/graph_builder.cc +++ b/src/ge/graph/build/graph_builder.cc @@ -63,7 +63,7 @@ Status 
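// Note on the GeGenerator hunks above: the shared "static GraphId id" counter
// and the gettimeofday()-derived session id are replaced by std::atomic
// counters, so concurrent generator calls obtain unique ids without a lock
// (fetch_add returns the previous value). A minimal sketch of the pattern,
// with hypothetical names and element types rather than the real GE
// declarations:

#include <atomic>
#include <cstdint>

uint32_t NextGraphId() {
  static std::atomic<uint32_t> next_graph_id(0);
  return next_graph_id.fetch_add(1);  // atomically take the current value
}

uint64_t NextSessionId() {
  static std::atomic<uint64_t> next_session_id(0);
  return next_session_id.fetch_add(1);
}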
GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { std::string kernel_lib_name = node_ptr->GetOpDesc()->GetOpKernelLibName(); if (kernel_lib_name.empty()) { // reset op kernel lib - (void)instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node_ptr->GetOpDesc()); + (void)instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node_ptr); kernel_lib_name = node_ptr->GetOpDesc()->GetOpKernelLibName(); if (kernel_lib_name.empty()) { GELOGE(INTERNAL_ERROR, "Get node:%s(%s) kernel lib failed.", node_ptr->GetName().c_str(), @@ -84,6 +84,7 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { GELOGE(ret, "Calculate op running param failed, node name is %s", node_ptr->GetName().c_str()); return ret; } + GE_CHK_STATUS_RET(AddOutputMemTypeForNode(node_ptr)); } else { GELOGE(GE_GRAPH_PARAM_NULLPTR, "Get op %s ops kernel info store failed", node_ptr->GetName().c_str()); return INTERNAL_ERROR; @@ -497,4 +498,24 @@ Status GraphBuilder::SecondPartition(ge::ComputeGraphPtr &comp_graph, vectorGetOpDesc(), ATTR_INPUT_MEMORY_TYPE, mem_type)) { + GELOGD("[%s] has attr input_memory_type %ld", node->GetName().c_str(), mem_type); + for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { + const auto &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + const auto &src_node = peer_out_anchor->GetOwnerNode(); + const auto &src_op = src_node->GetOpDesc(); + GE_IF_BOOL_EXEC(src_op == nullptr, continue); + if (!AttrUtils::SetInt(src_op, ATTR_OUTPUT_MEMORY_TYPE, mem_type)) { + GELOGE(INTERNAL_ERROR, "Set out_memory_type attr failed."); + return INTERNAL_ERROR; + } + return SUCCESS; + } + } + return SUCCESS; +} } // namespace ge diff --git a/src/ge/graph/build/graph_builder.h b/src/ge/graph/build/graph_builder.h index dd229bc6..a70a5464 100644 --- a/src/ge/graph/build/graph_builder.h +++ b/src/ge/graph/build/graph_builder.h @@ -67,6 +67,7 @@ class GraphBuilder { GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID); Status BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID); + Status AddOutputMemTypeForNode(const NodePtr &node); Status BuildForHostCpuGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID); int build_mode_; diff --git a/src/ge/graph/build/memory/binary_block_mem_assigner.h b/src/ge/graph/build/memory/binary_block_mem_assigner.h index 678a8adf..de6cae0d 100644 --- a/src/ge/graph/build/memory/binary_block_mem_assigner.h +++ b/src/ge/graph/build/memory/binary_block_mem_assigner.h @@ -24,7 +24,9 @@ namespace ge { class BinaryBlockMemAssigner : public BlockMemAssigner { public: - explicit BinaryBlockMemAssigner(ge::ComputeGraphPtr compute_graph) : BlockMemAssigner(std::move(compute_graph)) {} + BinaryBlockMemAssigner(ComputeGraphPtr compute_graph, const std::map &anchor_to_symbol, + const std::map> &symbol_to_anchors) + : BlockMemAssigner(std::move(compute_graph), anchor_to_symbol, symbol_to_anchors) {} BinaryBlockMemAssigner(const BinaryBlockMemAssigner &) = delete; diff --git a/src/ge/graph/build/memory/block_mem_assigner.cc b/src/ge/graph/build/memory/block_mem_assigner.cc index 3d956230..53b5b71c 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.cc +++ b/src/ge/graph/build/memory/block_mem_assigner.cc @@ -32,10 +32,12 @@ #include "graph/debug/ge_attr_define.h" +#include "graph/common/local_context.h" #include "graph/optimize/common/params.h" #include "omg/omg_inner_types.h" 
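// The assigner constructors above and below now receive the ref-mapping
// tables (anchor_to_symbol / symbol_to_anchors) instead of recomputing them:
// HybridMemAssigner::Assign() builds them once via GetRefMapping and shares
// them with both block assigners by const reference. A reduced sketch of that
// ownership pattern (Assigner and the aliases are illustrative stand-ins):

#include <list>
#include <map>
#include <string>

using AnchorToSymbol = std::map<std::string, std::string>;
using SymbolToAnchors = std::map<std::string, std::list<std::string>>;

class Assigner {
 public:
  Assigner(const AnchorToSymbol &anchor_to_symbol, const SymbolToAnchors &symbol_to_anchors)
      : anchor_to_symbol_(anchor_to_symbol), symbol_to_anchors_(symbol_to_anchors) {}

 private:
  const AnchorToSymbol &anchor_to_symbol_;    // owner must outlive the assigner
  const SymbolToAnchors &symbol_to_anchors_;
};

void AssignOnce() {
  AnchorToSymbol anchor_to_symbol;    // built exactly once by the owner
  SymbolToAnchors symbol_to_anchors;
  Assigner binary(anchor_to_symbol, symbol_to_anchors);
  Assigner max(anchor_to_symbol, symbol_to_anchors);  // both share the tables
}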
#include "runtime/mem.h" +using std::list; using std::map; using std::pair; using std::set; @@ -402,8 +404,13 @@ string MemoryBlock::String() { return ss.str(); } -BlockMemAssigner::BlockMemAssigner(ge::ComputeGraphPtr compute_graph) - : mem_offset_(0), compute_graph_(std::move(compute_graph)), life_time_(0) {} +BlockMemAssigner::BlockMemAssigner(ComputeGraphPtr compute_graph, const map &anchor_to_symbol, + const map> &symbol_to_anchors) + : mem_offset_(0), + compute_graph_(std::move(compute_graph)), + symbol_to_anchors_(symbol_to_anchors), + anchor_to_symbol_(anchor_to_symbol), + life_time_(0) {} BlockMemAssigner::~BlockMemAssigner() { for (MemoryBlock *memory_block : memory_blocks_) { @@ -412,11 +419,6 @@ BlockMemAssigner::~BlockMemAssigner() { } void BlockMemAssigner::GetOutAndWorkSpaceMem(vector &all_memory_size) { - if (GraphUtils::GetRefMapping(compute_graph_, symbol_to_anchors_, anchor_to_symbol_) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Get ref-mapping for graph %s failed.", compute_graph_->GetName().c_str()); - return; - } - vector temp; for (const NodePtr &n : compute_graph_->GetAllNodes()) { auto node_op_desc = n->GetOpDesc(); @@ -692,13 +694,16 @@ bool BlockMemAssigner::IsPostReuse(const MemoryBlock *mem_block) const { /// @ingroup GE /// @brief check if symbol of cur node_index_io has block /// @param [in] node_index_io +/// @param [out] symbol /// @return bool /// -bool BlockMemAssigner::IsSymbolExist(const NodeIndexIO &node_index_io) { +bool BlockMemAssigner::IsSymbolExist(const NodeIndexIO &node_index_io, string &symbol) { auto iter = anchor_to_symbol_.find(node_index_io.ToString()); if (iter == anchor_to_symbol_.end()) { return false; } + + symbol = iter->second; return symbol_blocks_.find(iter->second) != symbol_blocks_.end(); } @@ -883,8 +888,8 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetNoAlignSize(*node_op_desc, index, no_align_size) != SUCCESS, return nullptr, "Get no align size failed"); - if (IsSymbolExist(node_index_io)) { - const std::string &symbol = anchor_to_symbol_[node_index_io.ToString()]; + std::string symbol; + if (IsSymbolExist(node_index_io, symbol)) { block = symbol_blocks_[symbol]; block->AddNodeTypeIndex({n, kOutput, index, true}, size, no_align_size); block->ref_count_++; @@ -949,8 +954,8 @@ bool IsOutputBlock(const ge::InDataAnchorPtr &in_data_anchor) { GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, GELOGE(FAILED, "Peer out anchor is nullptr."); return false); auto src = peer_out_anchor->GetOwnerNode(); int32_t index = peer_out_anchor->GetIdx(); - auto iter = domi::GetContext().out_nodes_map.find(src->GetName()); - if (iter != domi::GetContext().out_nodes_map.end()) { + auto iter = GetLocalOmgContext().out_nodes_map.find(src->GetName()); + if (iter != GetLocalOmgContext().out_nodes_map.end()) { for (auto id : iter->second) { if (index == id) { return true; diff --git a/src/ge/graph/build/memory/block_mem_assigner.h b/src/ge/graph/build/memory/block_mem_assigner.h index eedc7bec..7e37fe8e 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.h +++ b/src/ge/graph/build/memory/block_mem_assigner.h @@ -159,7 +159,8 @@ class MemoryBlock { class BlockMemAssigner : public MemAssigner { public: - explicit BlockMemAssigner(ge::ComputeGraphPtr compute_graph); + BlockMemAssigner(ComputeGraphPtr compute_graph, const std::map &anchor_to_symbol, + const std::map> &symbol_to_anchors); BlockMemAssigner(const BlockMemAssigner &) = delete; @@ -241,9 +242,10 @@ class BlockMemAssigner : public 
MemAssigner { /// @ingroup GE /// @brief check if symbol of cur node_index_io has block /// @param [in] node_index_io + /// @param [out] symbol /// @return bool /// - bool IsSymbolExist(const NodeIndexIO &node_index_io); + bool IsSymbolExist(const NodeIndexIO &node_index_io, std::string &symbol); /// /// @ingroup GE @@ -261,8 +263,8 @@ class BlockMemAssigner : public MemAssigner { std::vector zero_memory_list_; // ref mapping - std::map> symbol_to_anchors_; - std::map anchor_to_symbol_; + const std::map> &symbol_to_anchors_; + const std::map &anchor_to_symbol_; std::map pre_reuse_flag_; std::map post_reuse_flag_; std::map symbol_size_; diff --git a/src/ge/graph/build/memory/graph_mem_assigner.cc b/src/ge/graph/build/memory/graph_mem_assigner.cc index affa82c8..c9a6b8a2 100644 --- a/src/ge/graph/build/memory/graph_mem_assigner.cc +++ b/src/ge/graph/build/memory/graph_mem_assigner.cc @@ -18,6 +18,7 @@ #include #include #include "common/math/math_util.h" +#include "common/util/error_manager/error_manager.h" #include "framework/common/debug/ge_log.h" #include "graph/build/memory/hybrid_mem_assigner.h" #include "graph/build/memory/var_mem_assign_util.h" @@ -226,6 +227,7 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offse if (mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) { GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", mem_offset, VarManager::Instance(session_id)->GetGraphMemoryMaxSize()); + ErrorManager::GetInstance().ATCReportErrMessage("E19022"); return ge::FAILED; } return SUCCESS; diff --git a/src/ge/graph/build/memory/hybrid_mem_assigner.cc b/src/ge/graph/build/memory/hybrid_mem_assigner.cc index 925d742a..a75487de 100644 --- a/src/ge/graph/build/memory/hybrid_mem_assigner.cc +++ b/src/ge/graph/build/memory/hybrid_mem_assigner.cc @@ -41,10 +41,17 @@ Status HybridMemAssigner::AssignMemory(std::unique_ptr &block_ } Status HybridMemAssigner::Assign() { - std::unique_ptr binary_assigner(new (std::nothrow) BinaryBlockMemAssigner(compute_graph_)); + if (GraphUtils::GetRefMapping(compute_graph_, symbol_to_anchors_, anchor_to_symbol_) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Get ref-mapping for graph %s failed.", compute_graph_->GetName().c_str()); + return FAILED; + } + + std::unique_ptr binary_assigner( + new (std::nothrow) BinaryBlockMemAssigner(compute_graph_, anchor_to_symbol_, symbol_to_anchors_)); GE_CHECK_NOTNULL(binary_assigner); - std::unique_ptr max_assigner(new (std::nothrow) MaxBlockMemAssigner(compute_graph_)); + std::unique_ptr max_assigner( + new (std::nothrow) MaxBlockMemAssigner(compute_graph_, anchor_to_symbol_, symbol_to_anchors_)); GE_CHECK_NOTNULL(max_assigner); size_t bin_mem_size = 0; diff --git a/src/ge/graph/build/memory/hybrid_mem_assigner.h b/src/ge/graph/build/memory/hybrid_mem_assigner.h index db3741d4..fba70a59 100644 --- a/src/ge/graph/build/memory/hybrid_mem_assigner.h +++ b/src/ge/graph/build/memory/hybrid_mem_assigner.h @@ -54,6 +54,9 @@ class HybridMemAssigner : public MemAssigner { ge::ComputeGraphPtr compute_graph_; BlockMemAssignerPtr priority_assigner_; + + std::map anchor_to_symbol_; + std::map> symbol_to_anchors_; }; } // namespace ge #endif // GE_GRAPH_BUILD_MEMORY_HYBRID_MEM_ASSIGNER_H_ diff --git a/src/ge/graph/build/memory/max_block_mem_assigner.h b/src/ge/graph/build/memory/max_block_mem_assigner.h index cb46880a..f5626ebf 100644 --- a/src/ge/graph/build/memory/max_block_mem_assigner.h +++ b/src/ge/graph/build/memory/max_block_mem_assigner.h @@ -23,7 
+23,9 @@ namespace ge { class MaxBlockMemAssigner : public BlockMemAssigner { public: - explicit MaxBlockMemAssigner(ge::ComputeGraphPtr compute_graph) : BlockMemAssigner(std::move(compute_graph)) {} + MaxBlockMemAssigner(ComputeGraphPtr compute_graph, const std::map &anchor_to_symbol, + const std::map> &symbol_to_anchors) + : BlockMemAssigner(std::move(compute_graph), anchor_to_symbol, symbol_to_anchors) {} MaxBlockMemAssigner(const MaxBlockMemAssigner &) = delete; diff --git a/src/ge/graph/build/model_builder.cc b/src/ge/graph/build/model_builder.cc index 9a314d80..9a37478d 100644 --- a/src/ge/graph/build/model_builder.cc +++ b/src/ge/graph/build/model_builder.cc @@ -28,6 +28,7 @@ #include "graph/build/stream_allocator.h" #include "graph/common/omg_util.h" #include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_attr_value.h" #include "graph/ge_context.h" @@ -244,7 +245,7 @@ Status ModelBuilder::SetInputOutputDesc() { } // if user set input node format ND, the expected node for data and netoutput format is ND in // final graph. - if ((domi::GetContext().format == domi::DOMI_TENSOR_ND) && (!node_op_desc->HasAttr("_is_single_op")) && + if ((GetLocalOmgContext().format == domi::DOMI_TENSOR_ND) && (!node_op_desc->HasAttr("_is_single_op")) && ((node_op_desc->GetType() == DATA_TYPE) || (node_op_desc->GetType() == NETOUTPUT))) { GELOGI("The node [%s] format should be set ND.", node_op_desc->GetName().c_str()); auto inputDescsPtr = node_op_desc->GetAllInputsDescPtr(); @@ -406,7 +407,7 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) { GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_ZERO_COPY_MEMORY_SIZE, zero_copy_mem_size_), GELOGE(FAILED, "SetInt of ATTR_MODEL_ZERO_COPY_MEMORY_SIZE failed."); return FAILED); - GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, ATTR_MODEL_OUT_NODES_NAME, domi::GetContext().net_out_nodes), + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, ATTR_MODEL_OUT_NODES_NAME, GetLocalOmgContext().net_out_nodes), GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed."); return FAILED); GELOGI("For model, max_mem_offset_: %zu, zero_copy_mem_size_: %zu", max_mem_offset_, zero_copy_mem_size_); @@ -571,26 +572,59 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) { // Add weight ge_model.SetWeight(weight_buffer_); - // Add TBE Kernels - std::set name_set; + // Add TBE Kernels and custom aicpu op bin + std::set tbe_name_set; + std::set aicpu_name_set; for (const ge::NodePtr &n : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) { auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); TBEKernelPtr tbe_kernel = node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); + if (tbe_kernel == nullptr) { + std::string kernel_name; + GeAttrValue::BYTES kernel_buffer; + (void)AttrUtils::GetStr(node_op_desc, ATTR_NAME_TBE_KERNEL_NAME, kernel_name); + (void)AttrUtils::GetBytes(node_op_desc, ATTR_NAME_TBE_KERNEL_BUFFER, kernel_buffer); + if (!kernel_name.empty() && (kernel_buffer.GetSize() > 0)) { + GE_CHECK_NOTNULL(kernel_buffer.GetData()); + std::vector data(kernel_buffer.GetData(), kernel_buffer.GetData() + kernel_buffer.GetSize()); + tbe_kernel = std::make_shared(kernel_name, std::move(data)); + } + } GE_IF_BOOL_EXEC(tbe_kernel == nullptr, continue); - if (name_set.count(tbe_kernel->GetName()) > 0) { + if (tbe_name_set.count(tbe_kernel->GetName()) > 0) { GELOGE(FAILED, "tbe_kernel 
name %s can't be the same", tbe_kernel->GetName().c_str()); return FAILED; } - name_set.insert(tbe_kernel->GetName()); + tbe_name_set.insert(tbe_kernel->GetName()); tbe_kernel_store_.AddTBEKernel(tbe_kernel); - GELOGD("Add tbe kernel bin %s", tbe_kernel->GetName().c_str()); + GELOGI("Add tbe kernel bin %s", tbe_kernel->GetName().c_str()); + } + + for (const ge::NodePtr &n : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) { + auto node_op_desc = n->GetOpDesc(); + GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); + CustAICPUKernelPtr cust_aicpu_kernel = + node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); + GE_IF_BOOL_EXEC(cust_aicpu_kernel == nullptr, continue); + if (aicpu_name_set.count(cust_aicpu_kernel->GetName()) > 0) { + GELOGE(FAILED, "aicpu_kernel name %s can't be the same", cust_aicpu_kernel->GetName().c_str()); + return FAILED; + } + aicpu_name_set.insert(cust_aicpu_kernel->GetName()); + cust_aicpu_kernel_store_.AddCustAICPUKernel(cust_aicpu_kernel); + GELOGI("Add cust aicpu kernel bin %s", cust_aicpu_kernel->GetName().c_str()); } + if (!tbe_kernel_store_.Build()) { GELOGE(FAILED, "TBE Kernels store build failed!"); return FAILED; } + if (!cust_aicpu_kernel_store_.Build()) { + GELOGE(FAILED, "custom AICPU kernels store build failed!"); + return FAILED; + } ge_model.SetTBEKernelStore(tbe_kernel_store_); + ge_model.SetCustAICPUKernelStore(cust_aicpu_kernel_store_); // Add task GeAttrValue::BYTES task_def_bytes; @@ -744,7 +778,7 @@ Status ModelBuilder::CompileSingleOp() { string kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { // Reset op kernel lib - (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc); + (void)instance->DNNEngineManagerObj().GetDNNEngineName(node); kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { GELOGE(ge::INTERNAL_ERROR, "Get node:%s(%s) kernel lib failed.", node->GetName().c_str(), diff --git a/src/ge/graph/build/model_builder.h b/src/ge/graph/build/model_builder.h index 86b34c6d..e54d6695 100644 --- a/src/ge/graph/build/model_builder.h +++ b/src/ge/graph/build/model_builder.h @@ -25,6 +25,7 @@ #include #include "common/op/ge_op_utils.h" #include "common/tbe_kernel_store.h" +#include "common/cust_aicpu_kernel_store.h" #include "common/types.h" #include "common/util.h" #include "graph/compute_graph.h" @@ -108,6 +109,7 @@ class ModelBuilder { size_t zero_copy_mem_size_; TBEKernelStore tbe_kernel_store_; + CustAICPUKernelStore cust_aicpu_kernel_store_; uint8_t platform_type_; bool is_loop_graph_; diff --git a/src/ge/graph/build/stream_allocator.cc b/src/ge/graph/build/stream_allocator.cc index b7643e47..bcfea1d8 100644 --- a/src/ge/graph/build/stream_allocator.cc +++ b/src/ge/graph/build/stream_allocator.cc @@ -15,8 +15,8 @@ */ #include "graph/build/stream_allocator.h" -#include #include +#include #include "common/ge/ge_util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/fmk_error_codes.h" @@ -1062,12 +1062,12 @@ Status StreamAllocator::SetActiveStreamsForLoop() { GELOGI("there are %zu next iterator target streams has streamswitch node.", streams_skip_iterator_event.size()); for (auto iter : stream_id_to_last_node) { if (streams_skip_iterator_event.find(iter.first) != streams_skip_iterator_event.end()) { - GELOGI("skip stream %ld which has streamswitch node when add event to next iterator active node", + GELOGI("Skip stream %ld which has streamswitch node when adding event to next iterator active node", iter.first); 
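// The SaveDataToModel hunks above register TBE and custom AICPU kernel
// binaries through name-keyed std::sets, so a duplicate kernel name is
// rejected before it reaches the store. That dedup pattern, reduced to a
// sketch (Kernel stands in for the real TBEKernel/CustAICPUKernel types):

#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>

struct Kernel {
  std::string name;
};

bool AddKernelsUnique(const std::vector<std::shared_ptr<Kernel>> &kernels) {
  std::set<std::string> seen;
  for (const auto &kernel : kernels) {
    if (kernel == nullptr) {
      continue;  // node without an attached kernel: nothing to add
    }
    if (seen.count(kernel->name) > 0) {
      std::cerr << "kernel name " << kernel->name << " can't be the same\n";
      return false;  // mirrors the FAILED path above
    }
    seen.insert(kernel->name);
  }
  return true;
}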
continue; } if (iter.second->GetOwnerComputeGraph()->GetParentGraph() != nullptr) { - GELOGI("skip stream %ld which last node in subgraph when add event to next iterator active node", + GELOGI("Skip stream %ld which is last node in subgraph when adding event to next iterator active node", iter.first); continue; } @@ -1264,15 +1264,6 @@ void StreamAllocator::DumpEvents() { } Status StreamAllocator::GetMaxStreamAndTask(bool huge_stream, uint32_t &max_stream_count, uint32_t &max_task_count) { - const char *buffer_optimize_on = std::getenv("BUFFER_OPTIMIZE_ON"); - if (buffer_optimize_on != nullptr) { - rtError_t ret = rtSetPlatformType(PLATFORM_MINI_V1); - if (ret != RT_ERROR_NONE) { - GELOGE(FAILED, "Get max stream and task count by rts failed."); - return FAILED; - } - } - uint32_t stream_type = RT_NORMAL_STREAM; if (huge_stream) { stream_type = RT_HUGE_STREAM; diff --git a/src/ge/graph/build/stream_graph_optimizer.cc b/src/ge/graph/build/stream_graph_optimizer.cc index a3e8044d..49ecc674 100644 --- a/src/ge/graph/build/stream_graph_optimizer.cc +++ b/src/ge/graph/build/stream_graph_optimizer.cc @@ -102,12 +102,9 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com continue; } - const char *buffer_optimize_on = std::getenv("BUFFER_OPTIMIZE_ON"); - if (buffer_optimize_on == nullptr) { - if (!IsSameStreamId(subgraph)) { - GELOGI("There are more than one stream in subgraph %s", subgraph->GetName().c_str()); - continue; - } + if (!IsSameStreamId(subgraph)) { + GELOGI("There are more than one stream in subgraph %s", subgraph->GetName().c_str()); + continue; } OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); diff --git a/src/ge/graph/build/task_generator.cc b/src/ge/graph/build/task_generator.cc index 91f70f2a..cf6b7a0d 100644 --- a/src/ge/graph/build/task_generator.cc +++ b/src/ge/graph/build/task_generator.cc @@ -31,6 +31,8 @@ #include "graph/utils/type_utils.h" #include "graph/common/ge_call_wrapper.h" #include "init/gelib.h" +#include "graph/ge_local_context.h" +#include "ge/ge_api_types.h" using domi::LogTimeStampDef; using domi::ModelTaskDef; @@ -527,7 +529,7 @@ Status TaskGenerator::MarkNodeAndSetIndex(ComputeGraphPtr &graph) { // Reset op kernel lib name if (op_desc->GetOpKernelLibName().empty()) { - (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(op_desc); + (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(node); } all_stream_ops[op_desc->GetStreamId()].emplace_back(op_desc); @@ -762,24 +764,26 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin return SUCCESS; } -Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, - vector &all_reduce_nodes) const { - GELOGI("Start FindProfilingTaskIndex."); - GE_CHECK_NOTNULL(graph); - const char *profiling_mode = std::getenv(kProfilingMode); - bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn(); - if (!is_profiling) { +Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, + vector &all_reduce_nodes, std::string &fp_point_str, + std::string &bp_point_str) const { + if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_FPPONIT_OPTIONS, fp_point_str) == SUCCESS && + ge::GetContext().GetOption(OPTION_EXEC_PROFILING_BPPONIT_OPTIONS, bp_point_str) == SUCCESS && + !fp_point_str.empty() && !bp_point_str.empty()) { return SUCCESS; } + Status ret = SUCCESS; const char *fp_point = std::getenv(kProfilingFpPoint); - Status ret; if 
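// GetFpBpIndex (introduced above and continued below) resolves the profiling
// FP/BP points in three layers: the graph options
// (OPTION_EXEC_PROFILING_FPPONIT_OPTIONS / ..._BPPONIT_OPTIONS) win, then the
// kProfilingFpPoint / kProfilingBpPoint environment variables, and only then
// the AutoFind*OpIndex fallback, whose failure now returns FAILED instead of
// silently succeeding. A sketch of that precedence chain, where the option
// store and auto-detection helpers are hypothetical stubs:

#include <cstdlib>
#include <string>

bool GetOption(const std::string &key, std::string &value) {
  (void)key;
  (void)value;
  return false;  // stub: pretend no graph option was configured
}

std::string AutoFind() { return "auto_detected_op"; }  // stub fallback

std::string ResolvePoint(const std::string &option_key, const char *env_key) {
  std::string value;
  if (GetOption(option_key, value) && !value.empty()) {
    return value;  // 1) explicit graph option wins
  }
  const char *env = std::getenv(env_key);
  if (env != nullptr) {
    return std::string(env);  // 2) environment variable
  }
  return AutoFind();  // 3) fall back to auto-detection
}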
(fp_point == nullptr) { ret = AutoFindFpOpIndex(graph, profiling_point); if (ret != SUCCESS) { GELOGW("First forward profiling op_index not set and FindFpOpIndex failed."); - return SUCCESS; + return FAILED; } + } else { + fp_point_str = string(fp_point); + GELOGI("Get fp_point_str from env %s", fp_point_str.c_str()); } const char *bp_point = std::getenv(kProfilingBpPoint); @@ -787,20 +791,47 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi ret = AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes); if (ret != SUCCESS) { GELOGW("Last backward profiling op_index not set and FindBpOpIndex failed."); - return SUCCESS; + return FAILED; } + } else { + bp_point_str = string(bp_point); + GELOGI("Get bp_point_str from env %s", bp_point_str.c_str()); } - if (fp_point != nullptr) { - string fp_point_str = string(fp_point); + return SUCCESS; +} + +Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, + vector &all_reduce_nodes) const { + GELOGI("Start FindProfilingTaskIndex."); + GE_CHECK_NOTNULL(graph); + const char *profiling_mode = std::getenv(kProfilingMode); + bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn(); + if (!is_profiling) { + GELOGW("Profiling is not open."); + return SUCCESS; + } + + GELOGI("Start get FP/BP index."); + std::string fp_point_str; + std::string bp_point_str; + Status ret = GetFpBpIndex(graph, profiling_point, all_reduce_nodes, fp_point_str, bp_point_str); + if (ret != SUCCESS) { + GELOGW("Get FP_POINT BP_POINT failed."); + return SUCCESS; + } + + GELOGI("fp_point_str:%s, bp_point_str:%s.", fp_point_str.c_str(), bp_point_str.c_str()); + + if (!fp_point_str.empty()) { ret = FindFpOfEnv(graph, fp_point_str, profiling_point); if (ret != SUCCESS) { GELOGW("First backward profiling op name set but FindFpOfEnv failed."); return SUCCESS; } } - if (bp_point != nullptr) { - string bp_point_str = string(bp_point); + + if (!bp_point_str.empty()) { ret = FindBpOfEnv(graph, bp_point_str, profiling_point, all_reduce_nodes); if (ret != SUCCESS) { GELOGW("Last backward profiling op name set but FindBpOfEnv failed."); diff --git a/src/ge/graph/build/task_generator.h b/src/ge/graph/build/task_generator.h index b2ca4470..6bd3ab03 100644 --- a/src/ge/graph/build/task_generator.h +++ b/src/ge/graph/build/task_generator.h @@ -118,6 +118,9 @@ class TaskGenerator { Status FindBpOfEnv(const ComputeGraphPtr &graph, const std::string &bp_point_str, ProfilingPoint &profiling_point, vector &all_reduce_nodes) const; + Status GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, vector &all_reduce_nodes, + std::string &fp_point_str, std::string &bp_point_str) const; + Status FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, std::vector &all_reduce_nodes) const; Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, diff --git a/src/ge/graph/common/local_context.cc b/src/ge/graph/common/local_context.cc new file mode 100644 index 00000000..43d3bc7c --- /dev/null +++ b/src/ge/graph/common/local_context.cc @@ -0,0 +1,38 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/common/local_context.h" + +#include "common/ge_inner_error_codes.h" +#include "common/debug/ge_log.h" +#include "omg/omg_inner_types.h" + +namespace ge { +namespace { +thread_local OmgContext *omg_context = nullptr; +} + +void SetLocalOmgContext(OmgContext &context) { omg_context = &context; } + +OmgContext &GetLocalOmgContext() { + if (omg_context != nullptr) { + return *omg_context; + } else { + GELOGW("omg_context is nullptr."); + return domi::GetContext(); + } +} +} // namespace ge diff --git a/src/ge/graph/common/local_context.h b/src/ge/graph/common/local_context.h new file mode 100644 index 00000000..1cdd2ca1 --- /dev/null +++ b/src/ge/graph/common/local_context.h @@ -0,0 +1,26 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_COMMON_LOCAL_CONTEXT_H_ +#define GE_GRAPH_COMMON_LOCAL_CONTEXT_H_ + +#include "omg/omg_inner_types.h" + +namespace ge { +void SetLocalOmgContext(OmgContext &context); +OmgContext &GetLocalOmgContext(); +} // namespace ge +#endif // GE_GRAPH_COMMON_LOCAL_CONTEXT_H_ diff --git a/src/ge/graph/load/graph_loader.cc b/src/ge/graph/load/graph_loader.cc index d181f3a5..c173d67a 100644 --- a/src/ge/graph/load/graph_loader.cc +++ b/src/ge/graph/load/graph_loader.cc @@ -121,70 +121,50 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority, ModelData &model_data) { Status ret; - try { - if (!CheckInputPathValid(path)) { - GELOGE(GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); - return GE_EXEC_MODEL_PATH_INVALID; - } - - GELOGI("Load model begin, model path is: %s", path.c_str()); - if (!key_path.empty() && !CheckInputPathValid(key_path)) { - GELOGE(GE_EXEC_MODEL_KEY_PATH_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); - return GE_EXEC_MODEL_KEY_PATH_INVALID; - } - - ret = DavinciModelParser::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); - return ret; - } + if (!CheckInputPathValid(path)) { + GELOGE(GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); + return GE_EXEC_MODEL_PATH_INVALID; + } - return SUCCESS; - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model from file failed, bad memory allocation"); - ret = MEMALLOC_FAILED; - } catch (...) 
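// local_context.cc above keeps a thread_local OmgContext pointer so each
// graph-building thread can carry its own context, and GetLocalOmgContext()
// falls back to the global domi::GetContext() when none was registered. The
// mechanism, as a self-contained sketch (Context is a hypothetical stand-in
// for OmgContext):

struct Context {
  int format = 0;
};

Context &GlobalContext() {
  static Context ctx;  // process-wide fallback, like domi::GetContext()
  return ctx;
}

thread_local Context *tls_ctx = nullptr;

void SetLocalContext(Context &ctx) { tls_ctx = &ctx; }

Context &GetLocalContext() {
  // Per-thread context if one was registered, otherwise the global default.
  return (tls_ctx != nullptr) ? *tls_ctx : GlobalContext();
}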
{ - GELOGE(FAILED, "Load model from file failed with exception"); - ret = FAILED; + GELOGI("Load model begin, model path is: %s", path.c_str()); + if (!key_path.empty() && !CheckInputPathValid(key_path)) { + GELOGE(GE_EXEC_MODEL_KEY_PATH_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); + return GE_EXEC_MODEL_KEY_PATH_INVALID; } - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; + ret = DavinciModelParser::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); + if (ret != SUCCESS) { + GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); + if (model_data.model_data != nullptr) { + delete[] static_cast(model_data.model_data); + model_data.model_data = nullptr; + } + return ret; } - return ret; + return SUCCESS; } Status GraphLoader::LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority, const std::shared_ptr &listener, uint32_t &model_id) { Status ret; ModelData model_data; - - try { - ret = LoadDataFromFile(path, key_path, priority, model_data); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; - } - return ret; + ret = LoadDataFromFile(path, key_path, priority, model_data); + if (ret != SUCCESS) { + GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); + if (model_data.model_data != nullptr) { + delete[] static_cast(model_data.model_data); + model_data.model_data = nullptr; } + return ret; + } - ret = LoadModel(model_data, listener, model_id); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; - } + ret = LoadModel(model_data, listener, model_id); + if (ret != SUCCESS) { + GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); + if (model_data.model_data != nullptr) { + delete[] static_cast(model_data.model_data); + model_data.model_data = nullptr; } - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model from file failed, bad memory allocation"); - ret = MEMALLOC_FAILED; - } catch (...) { - GELOGE(FAILED, "Load model from file failed with exception"); - ret = FAILED; } if (model_data.model_data != nullptr) { @@ -197,36 +177,27 @@ Status GraphLoader::LoadModelFromFile(const std::string &path, const std::string Status GraphLoader::LoadModel(const ModelData &model_data, const std::shared_ptr &listener, uint32_t &model_id) { - try { - GELOGI("Load model begin, model_id:%u.", model_id); + GELOGI("Load model begin, model_id:%u.", model_id); - // For GeOp, Open Device 0 here. - GE_CHK_RT_RET(rtSetDevice(0)); - auto model_manager = ModelManager::GetInstance(); - GE_CHECK_NOTNULL(model_manager); - Status ret = model_manager->LoadModelOffline(model_id, model_data, listener); - if (ret != SUCCESS) { - GE_CHK_RT(rtDeviceReset(0)); - GELOGE(ret, "LoadModel: Load failed."); - return ret; - } - ret = model_manager->Start(model_id); - if (ret != SUCCESS) { - if (model_manager->Unload(model_id) != SUCCESS) { - GELOGE(FAILED, "LoadModel: Unload failed while trying to unload after a failed start."); - } - GELOGE(ret, "LoadModel: Start failed."); - return ret; + // For GeOp, Open Device 0 here. 
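// With the blanket try/catch blocks removed above, every early return in the
// load path must release the partially filled ModelData buffer itself before
// propagating the Status. The cleanup shape, as a sketch (ModelData and
// LoadFromFile are simplified stand-ins, not the real GE signatures):

#include <cstdint>

enum Status { SUCCESS = 0, FAILED = 1 };

struct ModelData {
  uint8_t *model_data = nullptr;
};

// Stub loader: simulates a parser that allocates and then fails.
Status LoadFromFile(ModelData &md) {
  md.model_data = new uint8_t[16];
  return FAILED;
}

Status LoadChecked(ModelData &md) {
  Status ret = LoadFromFile(md);
  if (ret != SUCCESS) {
    delete[] md.model_data;   // release the partial allocation on the error path
    md.model_data = nullptr;  // avoid a dangling pointer or later double free
    return ret;
  }
  return SUCCESS;
}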
+ GE_CHK_RT_RET(rtSetDevice(0)); + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + Status ret = model_manager->LoadModelOffline(model_id, model_data, listener); + if (ret != SUCCESS) { + GE_CHK_RT(rtDeviceReset(0)); + GELOGE(ret, "LoadModel: Load failed."); + return ret; + } + ret = model_manager->Start(model_id); + if (ret != SUCCESS) { + if (model_manager->Unload(model_id) != SUCCESS) { + GELOGE(FAILED, "LoadModel: Unload failed while trying to unload after a failed start."); } - GELOGI("LoadModel: Start model success, model_id:%u.", model_id); - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model failed, bad memory allocation occur !"); - return MEMALLOC_FAILED; - } catch (...) { - GELOGE(FAILED, "Load model failed, some exceptions occur !"); - return FAILED; + GELOGE(ret, "LoadModel: Start failed."); + return ret; } - + GELOGI("LoadModel: Start model success, model_id:%u.", model_id); return SUCCESS; } @@ -255,28 +226,16 @@ Status GraphLoader::CommandHandle(const Command &command) { Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model_data, void *dev_ptr, size_t memsize, void *weight_ptr, size_t weightsize) { - try { - GELOGI("Load model begin, model_id:%u.", model_id); - - // For ACL, Open Device from App. - auto model_manager = ModelManager::GetInstance(); - GE_CHECK_NOTNULL(model_manager); - Status ret = - model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize); - if (ret != SUCCESS) { - GELOGE(ret, "Load model failed, model_id:%u.", model_id); - return ret; - } - - GELOGI("Load model success, model_id:%u.", model_id); - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model failed, bad memory allocation occur !"); - return MEMALLOC_FAILED; - } catch (...) { - GELOGE(FAILED, "Load model failed, some exceptions occur !"); - return FAILED; + GELOGI("Load model begin, model_id:%u.", model_id); + // For ACL, Open Device from App. 
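// LoadModel above pairs every acquisition with a rollback on its failure
// path: a failed offline load resets the device it opened, and a model that
// loaded but failed to start is unloaded before the error is returned. The
// control flow, as a hypothetical sketch in which all helpers are stubs:

#include <cstdint>
#include <cstdio>

enum Status { SUCCESS = 0, FAILED = 1 };

Status OpenDevice() { return SUCCESS; }
void ResetDevice() {}
Status Load(uint32_t &model_id) { model_id = 1; return SUCCESS; }
Status Start(uint32_t model_id) { (void)model_id; return FAILED; }
Status Unload(uint32_t model_id) { (void)model_id; return SUCCESS; }

Status LoadAndStart(uint32_t &model_id) {
  if (OpenDevice() != SUCCESS) {
    return FAILED;
  }
  if (Load(model_id) != SUCCESS) {
    ResetDevice();  // undo the device open before propagating the error
    return FAILED;
  }
  if (Start(model_id) != SUCCESS) {
    if (Unload(model_id) != SUCCESS) {  // roll back the successful load
      std::fprintf(stderr, "unload after failed start also failed\n");
    }
    return FAILED;
  }
  return SUCCESS;
}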
+ auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + Status ret = model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize); + if (ret != SUCCESS) { + GELOGE(ret, "Load model failed, model_id:%u.", model_id); + return ret; } - + GELOGI("Load model success, model_id:%u.", model_id); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/data_dumper.cc b/src/ge/graph/load/new_model_manager/data_dumper.cc index b94add80..e4e3a63f 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.cc +++ b/src/ge/graph/load/new_model_manager/data_dumper.cc @@ -16,21 +16,28 @@ #include "graph/load/new_model_manager/data_dumper.h" +#include +#include #include #include #include #include +#include "common/debug/memory_dumper.h" #include "common/properties_manager.h" +#include "common/util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/util.h" #include "graph/anchor.h" #include "graph/debug/ge_attr_define.h" #include "graph/load/new_model_manager/model_utils.h" +#include "graph/manager/util/debug.h" #include "graph/utils/attr_utils.h" #include "graph/utils/tensor_utils.h" +#include "proto/dump_task.pb.h" #include "proto/ge_ir.pb.h" #include "proto/op_mapping_info.pb.h" +#include "runtime/base.h" #include "runtime/mem.h" namespace { @@ -66,6 +73,16 @@ static bool ParseNameIndex(const std::string &node_name_index, std::string &node static bool IsTensorDescWithSkipDumpAddrType(bool has_mem_type_attr, vector v_memory_type, size_t i) { return has_mem_type_attr && (v_memory_type[i] == RT_MEMORY_L1); } + +static uint64_t GetNowTime() { + uint64_t ret = 0; + struct timeval tv; + if (gettimeofday(&tv, NULL) == 0) { + ret = tv.tv_sec * 1000000ULL + tv.tv_usec; + } + + return ret; +} } // namespace static int32_t GetIrDataType(ge::DataType data_type) { @@ -176,6 +193,7 @@ void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr GELOGD("Start SaveDumpOpInfo of task_id: %u, stream_id: %u", task_id, stream_id); OpDescInfo op_desc_info; op_desc_info.op_name = op->GetName(); + op_desc_info.op_type = op->GetType(); op_desc_info.task_id = task_id; op_desc_info.stream_id = stream_id; for (size_t i = 0; i < op->GetInputsSize(); ++i) { @@ -183,12 +201,28 @@ void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr op_desc_info.input_format.emplace_back(input_desc.GetFormat()); op_desc_info.input_shape.emplace_back(input_desc.GetShape().GetDims()); op_desc_info.input_data_type.emplace_back(input_desc.GetDataType()); + int64_t input_size = 0; + auto tensor_descs = op->GetAllInputsDesc(); + if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(i), input_size) != SUCCESS) { + GELOGW("Get input size failed"); + return; + } + GELOGI("Save dump op info, the input size is %ld", input_size); + op_desc_info.input_size.emplace_back(input_size); } for (size_t j = 0; j < op->GetOutputsSize(); ++j) { GeTensorDesc output_desc = op->GetOutputDesc(j); op_desc_info.output_format.emplace_back(output_desc.GetFormat()); op_desc_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); op_desc_info.output_data_type.emplace_back(output_desc.GetDataType()); + int64_t output_size = 0; + auto tensor_descs = op->GetAllOutputsDesc(); + if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(j), output_size) != SUCCESS) { + GELOGW("Get output size failed"); + return; + } + GELOGI("Save dump op info, the output size is %ld", output_size); + 
op_desc_info.output_size.emplace_back(output_size); } op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); @@ -810,4 +844,90 @@ void DataDumper::PrintCheckLog(string &dump_list_key) { } } } + +Status DataDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) { + GELOGI("Start to dump exception input"); + for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { + GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status DataDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) { + GELOGI("Start to dump exception output"); + for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != + SUCCESS) { + GELOGE(PARAM_INVALID, "Dump the %zu output data failed", i); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status DataDumper::DumpExceptionInfo(const std::vector exception_infos) { + GELOGI("Start to dump exception info"); + for (const rtExceptionInfo &iter : exception_infos) { + OpDescInfo op_desc_info; + if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { + toolkit::dumpdata::DumpData dump_data; + dump_data.set_version("2.0"); + dump_data.set_dump_time(GetNowTime()); + for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { + toolkit::dumpdata::OpInput input; + input.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.input_data_type[i]))); + input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i])); + for (auto dim : op_desc_info.input_shape[i]) { + input.mutable_shape()->add_dim(dim); + } + input.set_size(op_desc_info.input_size[i]); + GELOGI("The input size in exception is %ld", op_desc_info.input_size[i]); + dump_data.mutable_input()->Add(std::move(input)); + } + for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { + toolkit::dumpdata::OpOutput output; + output.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.output_data_type[j]))); + output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j])); + for (auto dim : op_desc_info.output_shape[j]) { + output.mutable_shape()->add_dim(dim); + } + output.set_size(op_desc_info.output_size[j]); + GELOGI("The output size in exception is %ld", op_desc_info.output_size[j]); + dump_data.mutable_output()->Add(std::move(output)); + } + uint64_t now_time = GetNowTime(); + string dump_file_path = "./" + op_desc_info.op_type + "." + op_desc_info.op_name + "." + + to_string(op_desc_info.task_id) + "." 
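// DumpExceptionInput/DumpExceptionOutput above walk the parallel addr/size
// vectors recorded in OpDescInfo and append each tensor's device memory to
// the per-exception dump file (which, as the code below shows, starts with a
// length-prefixed DumpData proto). The loop shape, as a sketch in which
// DumpDevMem is a stub standing in for the real Debug::DumpDevMem helper:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

bool DumpDevMem(const char *file, const void *addr, int64_t size) {
  (void)file; (void)addr; (void)size;
  return true;  // stub: the real helper appends device memory to the file
}

bool DumpAll(const std::vector<void *> &addrs, const std::vector<int64_t> &sizes,
             const std::string &file) {
  for (size_t i = 0; i < addrs.size(); ++i) {
    if (!DumpDevMem(file.c_str(), addrs.at(i), sizes.at(i))) {
      std::fprintf(stderr, "Dump the %zu tensor failed\n", i);
      return false;  // stop at the first tensor that cannot be dumped
    }
  }
  return true;
}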
+ to_string(now_time); + uint64_t proto_size = dump_data.ByteSizeLong(); + unique_ptr proto_msg(new (std::nothrow) char[proto_size]); + bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); + if (!ret || proto_size == 0) { + GELOGE(PARAM_INVALID, "Dump data proto serialize failed"); + return PARAM_INVALID; + } + + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), + "Failed to dump proto size"); + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), + "Failed to dump proto msg"); + if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "Dump exception input failed"); + return PARAM_INVALID; + } + + if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "Dump exception output failed"); + return PARAM_INVALID; + } + GELOGI("Dump exception info SUCCESS"); + } else { + GELOGE(PARAM_INVALID, "Get op desc info failed,task id:%u,stream id:%u", iter.taskid, iter.streamid); + return PARAM_INVALID; + } + } + return SUCCESS; +} } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/data_dumper.h b/src/ge/graph/load/new_model_manager/data_dumper.h index cb5bbd41..0a1c2274 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.h +++ b/src/ge/graph/load/new_model_manager/data_dumper.h @@ -31,6 +31,7 @@ #include "runtime/mem.h" #include "task_info/task_info.h" #include "framework/common/ge_types.h" +#include "runtime/base.h" namespace ge { class DataDumper { @@ -88,6 +89,11 @@ class DataDumper { const DumpProperties &GetDumpProperties() const { return dump_properties_; } bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + // Dump exception info + Status DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file); + Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file); + Status DumpExceptionInfo(const std::vector exception_infos); + private: void ReleaseDevMem(void **ptr) noexcept; diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc index 7daeb1b8..45cec2cf 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.cc +++ b/src/ge/graph/load/new_model_manager/davinci_model.cc @@ -43,6 +43,7 @@ #include "graph/graph.h" #include "graph/load/new_model_manager/cpu_queue_schedule.h" #include "graph/load/new_model_manager/tbe_handle_store.h" +#include "graph/load/new_model_manager/model_manager.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" #include "graph/manager/trans_var_data_utils.h" @@ -253,13 +254,7 @@ Status DavinciModel::Assign(const GeModelPtr &ge_model) { /// void DavinciModel::Shrink() { ge_model_.reset(); // delete object. - - // Old dump need op list, clear when closed. - char *ge_dump_env = std::getenv("DUMP_OP"); - int dump_op_switch = (ge_dump_env != nullptr) ? std::strtol(ge_dump_env, nullptr, kDecimal) : 0; - if (dump_op_switch == 0) { - op_list_.clear(); - } + op_list_.clear(); } Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { @@ -295,8 +290,8 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p GELOGE(GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED, "Alloc feature map memory failed. 
size: %zu", data_size); return GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED; } - GELOGI("[IMAS]InitModelMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, - mem_base_, data_size); + GEEVENT("[IMAS]InitModelMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + mem_base_, data_size); weights_mem_base_ = mem_base_; @@ -337,8 +332,8 @@ Status DavinciModel::InitVariableMem() { return ret; } var_mem_base_ = VarManager::Instance(session_id_)->GetVarMemoryBase(RT_MEMORY_HBM); - GELOGI("[IMAS]InitVariableMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, - var_mem_base_, TotalVarMemSize()); + GEEVENT("[IMAS]InitVariableMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + var_mem_base_, TotalVarMemSize()); } runtime_param_.var_base = var_mem_base_; return SUCCESS; @@ -774,6 +769,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { map data_by_index; auto nodes = compute_graph->GetAllNodes(); const TBEKernelStore &tbekernel_store = ge_model_->GetTBEKernelStore(); + const CustAICPUKernelStore &aicpu_kernel_store = ge_model_->GetCustAICPUKernelStore(); for (size_t i = 0; i < nodes.size(); i++) { auto node = nodes.at(i); auto op_desc = node->GetOpDesc(); @@ -786,6 +782,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { GE_TIMESTAMP_RESTART(LoadTBEKernelBinToOpDesc); tbekernel_store.LoadTBEKernelBinToOpDesc(op_desc); + aicpu_kernel_store.LoadCustAICPUKernelBinToOpDesc(op_desc); GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc); if (IsDataOp(op_desc->GetType())) { @@ -1076,30 +1073,42 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) { /// /// @ingroup ge /// @brief output zero copy node Initialize. -/// @param [in] NodePtr: netoutput Op or merge op. +/// @param [in] NodePtr: netoutput Op. 
/// @return Status /// Status DavinciModel::InitOutputZeroCopyNodes(const NodePtr &node) { + set nodes_need_record; for (auto &in_data_anchor : node->GetAllInDataAnchors()) { auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); if (peer_out_data_anchor == nullptr) { continue; } - auto node = peer_out_data_anchor->GetOwnerNode(); - auto op_desc = node->GetOpDesc(); - if (op_desc == nullptr) { - GELOGE(FAILED, "Op desc is nullptr"); - return FAILED; - } + auto peer_node = peer_out_data_anchor->GetOwnerNode(); + nodes_need_record.emplace(peer_node); // Merge node output multiplexed input, upstream nodes need to be considered in multiple batch scenarios - if (node->GetType() == MERGE) { - if (InitOutputZeroCopyNodes(node) != SUCCESS) { - GELOGE(PARAM_INVALID, "Output merge zero copy nodes init failed!"); - return PARAM_INVALID; + if (peer_node->GetType() == MERGE) { + for (const auto &merge_peer_in_data_anchor : peer_node->GetAllInDataAnchors()) { + auto merge_peer_out_data_anchor = merge_peer_in_data_anchor->GetPeerOutAnchor(); + if (merge_peer_out_data_anchor == nullptr) { + continue; + } + auto merge_peer_node = merge_peer_out_data_anchor->GetOwnerNode(); + nodes_need_record.emplace(merge_peer_node); + } + } else { + for (const auto &other_in_data_anchor : peer_out_data_anchor->GetPeerInDataAnchors()) { + auto other_in_node = other_in_data_anchor->GetOwnerNode(); + if (other_in_node->GetType() != NETOUTPUT) { + nodes_need_record.emplace(other_in_node); + } } } + } + for (const auto &node_need_record : nodes_need_record) { + auto op_desc = node_need_record->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); string batch_label; (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label); if (batch_label.empty()) { @@ -2152,7 +2161,6 @@ void DavinciModel::SetProfileTime(ModelProcStage stage, int64_t endTime) { Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, rtMemcpyKind_t kind) { if (output_op_list_.empty()) { Status ret = SyncVarData(); - DumpOpInputOutput(); return ret; } @@ -2198,8 +2206,6 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r runtime_param_.graph_id, output.first, output.second.GetBasicAddr(), data_size, buffer_length); GE_CHK_RT_RET(rtMemcpy(buffer_addr, buffer_length, output.second.GetBasicAddr(), data_size, kind)); } - - DumpOpInputOutput(); return SUCCESS; } @@ -2264,6 +2270,14 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b // return result is not required if (!rslt_flg && !seq_end_flag) { GELOGW("Compute failed, model id: %u", model_id_); + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (exception_infos.size() > 0) { + GE_CHK_STATUS_RET(data_dumper_.DumpExceptionInfo(exception_infos), "Dump exception info failed"); + } else { + GELOGI("Exception info is null"); + } GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed."); return INTERNAL_ERROR; } @@ -2302,7 +2316,6 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, SUCCESS, outputs), "OnComputeDone failed"); return SUCCESS; } - /// /// @ingroup ge /// @brief return not output to upper layer for cloud case @@ -2318,114 +2331,12 @@ Status DavinciModel::ReturnNoOutput(uint32_t data_id) { op_desc->GetName().c_str()); } - DumpOpInputOutput(); GE_CHK_BOOL_EXEC(listener_ 
!= nullptr, return PARAM_INVALID, "listener_ is null!"); std::vector outputs; GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, SUCCESS, outputs), "OnComputeDone failed."); return SUCCESS; } -/// -/// @ingroup ge -/// @brief dump all op input and output information -/// @return void -/// -void DavinciModel::DumpOpInputOutput() { - char *ge_dump_env = std::getenv("DUMP_OP"); - int dump_op_switch = (ge_dump_env != nullptr) ? std::strtol(ge_dump_env, nullptr, kDecimal) : 0; - if (dump_op_switch == 0) { - GELOGI("need to set DUMP_OP for dump op input and output"); - return; - } - - if (op_list_.empty()) { - GELOGW("op list is empty"); - return; - } - - int64_t cnt = 1; - for (auto it : op_list_) { - if (maxDumpOpNum_ != 0 && cnt > maxDumpOpNum_) { - GELOGW("dump op cnt > maxDumpOpNum, maxDumpOpNum: %ld", maxDumpOpNum_); - return; - } - - cnt++; - if (DumpSingleOpInputOutput(it.second) != SUCCESS) { - GELOGW("dump single op failed, model_id: %u", model_id_); - return; - } - } -} - -/// -/// @ingroup ge -/// @brief dump single op input and output information -/// @param [in] op_def: the op_desc which will be dump -/// @return Status result -/// -Status DavinciModel::DumpSingleOpInputOutput(const OpDescPtr &op_def) { - GE_CHK_BOOL_EXEC(nullptr != op_def, return PARAM_INVALID, "op_def is null!"); - string op_name = ge::StringUtils::ReplaceAll(op_def->GetName(), "/", "-"); - GELOGI("dump op name:%s, type:%s, model_id: %u.", op_def->GetName().c_str(), op_def->GetType().c_str(), model_id_); - string model_path = "./dump" + to_string(model_id_); - if (mmAccess(model_path.c_str()) != EN_OK) { - int32_t ret = mmMkdir(model_path.c_str(), S_IRUSR | S_IWUSR | S_IXUSR); - if (ret != EN_OK) { - GELOGE(FAILED, "make dir failed, model_id: %u", model_id_); - return FAILED; - } - } - const vector input_size_vec = ModelUtils::GetInputSize(op_def); - const vector input_addr_vec = ModelUtils::GetInputDataAddrs(runtime_param_, op_def); - vector v_memory_type; - bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_def, ATTR_NAME_INPUT_MEM_TYPE_LIST, v_memory_type); - GELOGD("DumpSingleOp[%s], input size[%zu], input memory type size[%zu]", op_def->GetName().c_str(), - op_def->GetInputsSize(), v_memory_type.size()); - for (size_t i = 0; i < input_addr_vec.size(); i++) { - if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { - continue; - } - int64_t input_size = input_size_vec.at(i); - char input_file_name[PATH_MAX] = {0}; - if ((sprintf_s(input_file_name, PATH_MAX, "%s/dump_%u_%s_%s_input_%zu.bin", model_path.c_str(), model_id_, - op_def->GetType().c_str(), op_name.c_str(), i)) == -1) { - GELOGE(FAILED, "construct input dump file path failed."); - return FAILED; - } - if ((Debug::DumpDevMem(input_file_name, input_addr_vec.at(i), input_size)) != SUCCESS) { - GELOGE(FAILED, "dump to input_file failed"); - return FAILED; - } - } - - const vector output_size_vec = ModelUtils::GetOutputSize(op_def); - const vector output_addr_vec = ModelUtils::GetOutputDataAddrs(runtime_param_, op_def); - v_memory_type.clear(); - has_mem_type_attr = ge::AttrUtils::GetListInt(op_def, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); - GELOGD("DumpSingleOp[%s], output size[%zu], output memory type size[%zu]", op_def->GetName().c_str(), - op_def->GetOutputsSize(), v_memory_type.size()); - if (!(op_def->GetType() == "Const")) { - for (size_t i = 0; i < output_addr_vec.size(); i++) { - if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { - continue; - } - int64_t output_size = output_size_vec.at(i); - char 
output_file_name[PATH_MAX] = {0}; - if ((sprintf_s(output_file_name, PATH_MAX, "%s/dump_%u_%s_%s_output_%zu.bin", model_path.c_str(), model_id_, - op_def->GetType().c_str(), op_name.c_str(), i)) == -1) { - GELOGE(FAILED, "construct output dump file path failed."); - return FAILED; - } - if ((Debug::DumpDevMem(output_file_name, output_addr_vec.at(i), output_size)) != SUCCESS) { - GELOGE(FAILED, "dump to output_file failed"); - return FAILED; - } - } - } - return SUCCESS; -} - void *DavinciModel::Run(DavinciModel *model) { GE_CHK_BOOL_EXEC(model != nullptr, CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
@@ -3127,8 +3038,8 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map & void *addr = data.second.GetDataInfo().at(count).second; void *buffer_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(buffer.data) + data.second.GetRelativeOffset().at(count)); - GELOGI("[ZCPY] Copy blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p", data.first, addr, size, - buffer_addr); + GELOGI("[ZCPY] Copy %s blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p", input_or_output.c_str(), + data.first, addr, size, buffer_addr); // For input data, just copy for rts task. for (ZeroCopyTask &task : zero_copy_tasks_) { uintptr_t addr_val = reinterpret_cast<uintptr_t>(addr);
@@ -3486,7 +3397,6 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa is_async_mode_ = async_mode; GELOGI("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, is_async_mode_); GE_CHK_STATUS_RET(InitModelStream(stream), "Init model stream failed."); - is_dynamic_ = input_data.is_dynamic_batch; if (!is_dynamic_) { zero_copy_batch_label_addrs_.clear();
diff --git a/src/ge/graph/load/new_model_manager/davinci_model.h b/src/ge/graph/load/new_model_manager/davinci_model.h index e77c5510..ea94c22c 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.h +++ b/src/ge/graph/load/new_model_manager/davinci_model.h
@@ -345,21 +345,6 @@ class DavinciModel { Status ReturnNoOutput(uint32_t data_id); - /// - /// @ingroup ge - /// @brief dump all op input and output information - /// @return void - /// - void DumpOpInputOutput(); - - /// - /// @ingroup ge - /// @brief dump single op input and output information - /// @param [in] dump_op model_id - /// @return Status - /// - Status DumpSingleOpInputOutput(const OpDescPtr &dump_op); - Status ModelRunStart(); ///
diff --git a/src/ge/graph/load/new_model_manager/model_manager.cc b/src/ge/graph/load/new_model_manager/model_manager.cc index 33e39847..9f0b114b 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.cc +++ b/src/ge/graph/load/new_model_manager/model_manager.cc
@@ -18,9 +18,9 @@ #include +#include "common/dump/dump_manager.h" #include "common/l2_cache_optimize.h" #include "common/profiling/profiling_manager.h" -#include "common/dump/dump_manager.h" #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" #include "framework/common/util.h"
@@ -38,6 +38,7 @@ const int kDumpCmdPairSize = 2; } // namespace DumpProperties ModelManager::dump_properties_; +std::mutex ModelManager::exception_infos_mutex_; std::shared_ptr<ModelManager> ModelManager::GetInstance() { static const std::shared_ptr<ModelManager> instance_ptr =
@@ -154,6 +155,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { GELOGI("The session: %lu not created.", session_id); return; } else { + GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId()))); Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0); if (ret != SUCCESS) { GELOGW("The session: %lu destroy failed.", session_id);
@@ -161,6 +163,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { (void)sess_ids_.erase(session_id); GELOGI("The session: %lu destroyed.", session_id); } + GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); } }
@@ -369,7 +372,8 @@ Status ModelManager::Unload(uint32_t model_id) { } else { GELOGI("Unload model %u success.no need reset device,device_count: %u", model_id, device_count); } - + std::lock_guard<std::mutex> lock(exception_infos_mutex_); + exception_infos_.clear(); return SUCCESS; }
@@ -1106,4 +1110,23 @@ Status ModelManager::GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint3 return FAILED; } +Status ModelManager::EnableExceptionDump(const std::map<std::string, std::string> &options) { + auto iter = options.find(OPTION_EXEC_ENABLE_EXCEPTION_DUMP); + if (iter != options.end()) { + GELOGI("Find option enable_exception_dump is %s", iter->second.c_str()); + if (iter->second == "1") { + rtError_t rt_ret = rtSetTaskFailCallback(ExceptionCallback); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtSetTaskFailCallback failed"); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + } else { + GELOGI("Option enable exception dump is %s", iter->second.c_str()); + } + } else { + GELOGI("Option enable exception dump is not found"); + } + return SUCCESS; +} + } // namespace ge
diff --git a/src/ge/graph/load/new_model_manager/model_manager.h b/src/ge/graph/load/new_model_manager/model_manager.h index a25b56a8..2c650c82 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.h +++ b/src/ge/graph/load/new_model_manager/model_manager.h
@@ -274,6 +274,22 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { bool IsDynamicShape(uint32_t model_id); ge::Status GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info); + ge::Status EnableExceptionDump(const std::map<std::string, std::string> &options); + + const std::vector<rtExceptionInfo> &GetExceptionInfos() { return exception_infos_; } + + void AddExceptionInfo(const rtExceptionInfo &exception_info) { exception_infos_.emplace_back(exception_info); } + + static void ExceptionCallback(rtExceptionInfo *exception_info) { + std::lock_guard<std::mutex> lock(exception_infos_mutex_); + auto instance = ModelManager::GetInstance(); + if (instance == nullptr) { + GELOGE(FAILED, "Instance is nullptr"); + return; + } + instance->AddExceptionInfo(*exception_info); + } + private: /// /// @ingroup domi_ome
@@ -309,8 +325,10 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::mutex map_mutex_; std::mutex sess_ids_mutex_; std::mutex session_id_create_mutex_; + static std::mutex exception_infos_mutex_; uint64_t session_id_bias_; std::set<uint64_t> sess_ids_; + std::vector<rtExceptionInfo> exception_infos_; static DumpProperties dump_properties_; };
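Note on the exception-dump flow above: when ge.exec.enable_exception_dump is "1", ModelManager registers ExceptionCallback with the runtime via rtSetTaskFailCallback, every rtExceptionInfo the runtime reports is appended to exception_infos_ under the static mutex, and Unload() clears the buffer. A minimal standalone sketch of this collect-under-callback pattern follows; TaskExceptionInfo and ExceptionRegistry are illustrative stand-ins invented here, only rtSetTaskFailCallback and rtExceptionInfo come from the diff itself.

#include <cstdint>
#include <mutex>
#include <vector>

struct TaskExceptionInfo {  // stand-in for rtExceptionInfo
  uint32_t task_id;
  uint32_t stream_id;
};

class ExceptionRegistry {
 public:
  static ExceptionRegistry &Instance() {
    static ExceptionRegistry inst;  // created once, lives for the process
    return inst;
  }
  // C-style callback handed to the runtime; must not throw.
  static void OnTaskFail(TaskExceptionInfo *info) {
    if (info == nullptr) {
      return;
    }
    ExceptionRegistry &reg = Instance();
    std::lock_guard<std::mutex> lock(reg.mutex_);
    reg.infos_.push_back(*info);
  }
  // Drain the buffered records, e.g. when a model is unloaded.
  std::vector<TaskExceptionInfo> Drain() {
    std::lock_guard<std::mutex> lock(mutex_);
    std::vector<TaskExceptionInfo> out;
    out.swap(infos_);
    return out;
  }

 private:
  std::mutex mutex_;
  std::vector<TaskExceptionInfo> infos_;
};

The diff keeps the buffered records inside ModelManager and exposes them through GetExceptionInfos(); the drain-on-read accessor above is just one alternative shape for the same idea. The hccl change below is unrelated: it migrates to the renamed HCCL reduce-op type.

diff --git a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc index 98d1d5a4..11eaaca9 100644 --- a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc
@@ -258,7 +258,7 @@ Status HcclTaskInfo::SetAddrs(const std::shared_ptr<OpDesc> &op_desc, return SUCCESS; } - hcclRedOp_t op_type = HCCL_REP_OP_SUM; + HcclReduceOp op_type = HCCL_REDUCE_SUM; GE_CHECK_NOTNULL(davinci_model_); GELOGI("Calc opType[%s] input address before. 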
Node name[%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); if (!davinci_model_->IsKnownNode()) { diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 7c873c68..0cac91eb 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -37,11 +37,17 @@ const uint8_t kL2NotLoadToDdr = 0; // for skt constexpr int64_t kInvalidGroupKey = -1; constexpr uint32_t kSKTSingleSize = 1; -constexpr uint32_t kSKTMaxSizeLimit = 20000; const char *kIsLastNode = "is_last_node"; const char *kIsFirstNode = "is_first_node"; const int64_t kCloseSkt = 100; const uint32_t kAddrLen = sizeof(void *); +const char *const kLoadOpFromBuf = "loadOpFromBuf"; +struct CustAicpuSoBuf { + uint64_t kernelSoBuf; + uint32_t kernelSoBufLen; + uint64_t kernelSoName; + uint32_t kernelSoNameLen; +} __attribute__((packed)); } // namespace namespace ge { @@ -49,10 +55,7 @@ KernelTaskInfo::SuperKernelTaskInfo KernelTaskInfo::skt_info_ = { 0, 0, 0, 0, nullptr, nullptr, {}, {}, {}, {}, {}, RT_KERNEL_DEFAULT, kInvalidGroupKey, 0, nullptr}; Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { - if (davinci_model == nullptr) { - GELOGE(PARAM_INVALID, "davinci model is null!"); - return PARAM_INVALID; - } + GE_CHECK_NOTNULL(davinci_model); davinci_model_ = davinci_model; is_l1_fusion_enable_ = davinci_model_->GetL1FusionEnableOption(); GELOGD("KernelTaskInfo init start, ge.enableL1Fusion in davinci model is %d.", is_l1_fusion_enable_); @@ -71,16 +74,12 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci kernel_type_ = static_cast(context.kernel_type()); // get opdesc op_desc_ = davinci_model_->GetOpByIndex(context.op_index()); - if (op_desc_ == nullptr) { - GELOGE(INTERNAL_ERROR, "Get op desc failed, index is out of range!"); - return INTERNAL_ERROR; - } + GE_CHECK_NOTNULL(op_desc_); (void)AttrUtils::GetBool(*op_desc_, ATTR_N_BATCH_SPILT, is_n_batch_spilt_); GELOGD("node[%s] is_n_batch_spilt %d", op_desc_->GetName().c_str(), is_n_batch_spilt_); (void)AttrUtils::GetInt(*op_desc_, ATTR_NAME_FUSION_GROUP_KEY, group_key_); has_group_key_ = (group_key_ != kInvalidGroupKey); GELOGD("node[%s] has_group_key_ %ld, group key is [%ld]", op_desc_->GetName().c_str(), has_group_key_, group_key_); - // fusion_op_info vector original_op_names; bool result = AttrUtils::GetListStr(op_desc_, ge::ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_op_names); @@ -99,7 +98,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. 
stub_func: %s", kernel_def.stub_func().c_str()); return RT_ERROR_TO_GE_STATUS(rt_ret);); - } else if (kernel_type_ != cce::ccKernelType::AI_CPU) { + } else if (kernel_type_ == cce::ccKernelType::TE) { rtError_t rt_ret; rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, @@ -127,7 +126,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci ret = InitTVMTask(args_offset_tmp[0], kernel_def); } else if (kernel_type_ == cce::ccKernelType::CUSTOMIZED) { ret = InitAICPUCustomTask(context.op_index(), kernel_def); - } else if (kernel_type_ == cce::ccKernelType::AI_CPU) { + } else if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { ret = InitAicpuTask(context.op_index(), kernel_def); } else { if (kernel_def.args().empty() || args_size_ == 0) { @@ -332,10 +331,6 @@ bool KernelTaskInfo::DoubleCallSKTSaveCheck() { return (!is_n_batch_spilt_ && !h Status KernelTaskInfo::SuperKernelDistribute() { Status ret; - char *skt_task_num = getenv("SKT_TASK_NUM"); - auto task_num = static_cast((skt_task_num != nullptr) ? strtol(skt_task_num, nullptr, 10) - : kSKTMaxSizeLimit); // 10 for decimal number - GELOGI("SKT: SuperKernel Distribute Task num[skt_id:%lu]", task_num); if (FirstCallSKTLaunchCheck()) { ret = SuperKernelLaunch(); if (ret != SUCCESS) { @@ -381,7 +376,8 @@ Status KernelTaskInfo::Distribute() { char *skt_enable_env = getenv("SKT_ENABLE"); int64_t env_flag = (skt_enable_env != nullptr) ? strtol(skt_enable_env, nullptr, 10) : 0; bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_); - if (kernel_type_ == cce::ccKernelType::AI_CPU) { + if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + GELOGI("distribute task info kernel_type %d, flag %d", kernel_type_, dump_flag_); // blockDim is reserved parameter, set to 1 rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(so_name_.c_str()), reinterpret_cast(kernel_name_.c_str()), 1, args_, args_size_, @@ -865,10 +861,98 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { return SUCCESS; } +Status KernelTaskInfo::LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def) { + CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); + if (aicpu_kernel == nullptr) { + GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str()); + return INTERNAL_ERROR; + } + const void *aicpu_data = aicpu_kernel->GetBinData(); + uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize(); + + void *d_aicpu_data = nullptr; + rtError_t status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + void *d_so_name = nullptr; + status = rtMalloc(&d_so_name, so_name_.size(), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + status = rtMemcpy(d_so_name, so_name_.size(), reinterpret_cast(so_name_.c_str()), so_name_.size(), + RT_MEMCPY_HOST_TO_DEVICE); + if (status != 
RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + CustAicpuSoBuf cust_aicpu_so_buf; + cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast(reinterpret_cast(d_aicpu_data)); + cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length; + cust_aicpu_so_buf.kernelSoName = reinterpret_cast(reinterpret_cast(d_so_name)); + cust_aicpu_so_buf.kernelSoNameLen = so_name_.size(); + + void *args = nullptr; + uint32_t args_size = sizeof(CustAicpuSoBuf); + status = rtMalloc(&args, args_size, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + GELOGI("loadOpFromBuf kernelSoBuf %p, kernelSoBufLen %u, kernelSoName %p, kernelSoNameLen %u.", d_aicpu_data, + aicpu_data_length, d_so_name, so_name_.size()); + + status = rtMemcpy(args, args_size, static_cast(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + rtStream_t stream = nullptr; + status = rtStreamCreate(&stream, 0); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt create stream failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + status = rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt CpuKernelLaunch loadOpFromBuf failed, status: 0x%X", status); + return RT_ERROR_TO_GE_STATUS(status); + } + GELOGI("Cpu kernel launch loadOpFromBuf."); + + status = rtStreamSynchronize(stream); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + GE_CHK_RT(rtFree(args)); + GE_CHK_RT(rtFree(d_aicpu_data)); + GE_CHK_RT(rtFree(d_so_name)); + + GELOGI("Cpu kernel launch loadOpFromBuf task success."); + return SUCCESS; +} + Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &kernel_def) { GELOGI("Do InitAicpuTask"); so_name_ = kernel_def.so_name(); kernel_name_ = kernel_def.kernel_name(); + GELOGI("node[%s] test so name %s, kernel name %s", op_desc_->GetName().c_str(), so_name_.c_str(), + kernel_name_.c_str()); OpDescPtr op_desc = davinci_model_->GetOpByIndex(op_index); if (op_desc == nullptr) { @@ -876,6 +960,10 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k return INTERNAL_ERROR; } + if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, kernel_def), "launch cust aicpu so failed"); + } + // copy args to new host memory std::unique_ptr args_addr(new (std::nothrow) uint8_t[args_size_]); GE_PRINT_DYNAMIC_MEMORY(new, "cce task physical memory.", sizeof(uint8_t) * args_size_) @@ -940,6 +1028,9 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } dump_args_ = static_cast(args_) + sizeof(aicpu::AicpuParamHead); } + if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + dump_flag_ |= RT_KERNEL_CUSTOM_AICPU; + } davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, args_addr.get(), args_, args_size_, sizeof(aicpu::AicpuParamHead)); @@ -1195,16 +1286,6 @@ uint8_t KernelTaskInfo::IsL2CpToDDR(uint8_t origain_L2_load_to_ddr) { if (dump_flag_ == RT_KERNEL_DUMPFLAG) { return kL2LoadToDdr; } - - static char *ge_dump_env = std::getenv("DUMP_OP"); - if (ge_dump_env != nullptr) { - static 
std::string ge_dump_str(ge_dump_env); - static std::string open_ge_dump("1"); - if (ge_dump_str == open_ge_dump) { - return kL2LoadToDdr; - } - } - return kL2NotLoadToDdr; }
diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h index 8ada2082..1c45682e 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h
@@ -106,6 +106,8 @@ class KernelTaskInfo : public TaskInfo { Status InitAicpuTaskExtInfo(const std::string &ext_info); + Status LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def); + Status StoreInputOutputTensor(const std::vector<void *> &input_data_addrs, const std::vector<void *> &output_data_addrs, const std::vector<::tagCcAICPUTensor> &input_descs,
diff --git a/src/ge/graph/load/new_model_manager/zero_copy_task.cc b/src/ge/graph/load/new_model_manager/zero_copy_task.cc index 00920aad..30ce8a86 100644 --- a/src/ge/graph/load/new_model_manager/zero_copy_task.cc +++ b/src/ge/graph/load/new_model_manager/zero_copy_task.cc
@@ -130,8 +130,8 @@ Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const ma } auto dst_addr = static_cast<uint8_t *>(buffer_addr); - GELOGI("[ZCPY] %s update task, args_addr: %p, size: %zu, offset: %zu, virtual_addr: 0x%lx", name_.c_str(), - args_addr_, args_size_, offset, addr); + GELOGI("[ZCPY] %s update task, args_addr: %p, size: %zu, offset: %zu, virtual_addr: 0x%lx, user_data_addr: %p", + name_.c_str(), args_addr_, args_size_, offset, addr, buffer_addr); *(uintptr_t *)(args_info + offset) = reinterpret_cast<uintptr_t>(dst_addr); is_updated_ = true; }
diff --git a/src/ge/graph/manager/graph_caching_allocator.h b/src/ge/graph/manager/graph_caching_allocator.h index 94a5066a..850a73e8 100644 --- a/src/ge/graph/manager/graph_caching_allocator.h +++ b/src/ge/graph/manager/graph_caching_allocator.h
@@ -29,6 +29,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" +#include "graph/manager/block_memory.h" #include "runtime/mem.h" namespace ge {
@@ -38,30 +39,8 @@ constexpr size_t kKByteSize = 1024; constexpr size_t kMByteSize = 1024 * 1024; constexpr size_t kGByteSize = 1024 * 1024 * 1024; -struct Block; -typedef bool (*Comparison)(const Block *, const Block *); -using BlockBin = std::set<Block *, Comparison>; static const uint32_t kNumBins = 8; -struct Block { - uint32_t device_id; // npu device id - size_t size; // block size in bytes - BlockBin *bin; // owning block bin - uint8_t *ptr; // memory address - bool allocated; // in-use flag - Block *prev; // prev block if split from a larger allocation - Block *next; // next block if split from a larger allocation - - Block(uint32_t device, size_t size, BlockBin *bin, uint8_t *ptr) - : device_id(device), size(size), bin(bin), ptr(ptr), allocated(0), prev(nullptr), next(nullptr) {} - - // constructor for search key - Block(uint32_t device, size_t size, uint8_t *ptr) - : device_id(device), size(size), bin(nullptr), ptr(ptr), allocated(0), prev(nullptr), next(nullptr) {} - - bool IsSplit() const { return (prev != nullptr) || (next != nullptr); } -}; - class MemoryAllocator; class CachingAllocator {
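The Block/BlockBin structure removed here moves to graph/manager/block_memory.h so that CachingAllocator and the new RdmaPoolAllocator can share it: blocks split from one device allocation form a doubly linked chain (prev/next), and free blocks sit in size-ordered bins. A hedged sketch of the best-fit-then-split lookup both allocators build on these declarations; FindAndSplit is an illustrative helper, not a routine from the diff, and it assumes the bin comparator orders blocks by size (the real code adds locking, logging, and a ShouldSplit heuristic).

// Assumes the Block/BlockBin declarations shown above (now in block_memory.h).
Block *FindAndSplit(BlockBin &bin, uint32_t device_id, size_t aligned_size) {
  Block key(device_id, aligned_size, nullptr);  // search key: device + requested size
  auto it = bin.lower_bound(&key);              // first free block with size >= aligned_size
  if (it == bin.end()) {
    return nullptr;
  }
  Block *block = *it;
  bin.erase(it);
  if (block->size > aligned_size) {  // split off the remainder and keep it free
    auto *rest = new Block(device_id, block->size - aligned_size, nullptr, block->ptr + aligned_size);
    rest->prev = block;
    rest->next = block->next;
    if (block->next != nullptr) {
      block->next->prev = rest;
    }
    block->next = rest;
    block->size = aligned_size;
    bin.insert(rest);
  }
  block->allocated = true;
  return block;
}

diff --git a/src/ge/graph/manager/graph_manager.cc b/src/ge/graph/manager/graph_manager.cc index 582b206a..08f7ec9e 100644 --- a/src/ge/graph/manager/graph_manager.cc +++ b/src/ge/graph/manager/graph_manager.cc
@@ -33,7 +33,9 @@ #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include 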
"framework/common/ge_types.h" +#include "analyzer/analyzer.h" #include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" #include "graph/common/transop_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" @@ -42,6 +44,7 @@ #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/util/rt_context_util.h" #include "graph/partition/dynamic_shape_partition.h" +#include "graph/passes/enter_pass.h" #include "graph/passes/addn_pass.h" #include "graph/passes/bitcast_pass.h" #include "graph/passes/atomic_addr_clean_pass.h" @@ -110,6 +113,9 @@ const char *const kSend = "Send"; const char *const kRecv = "Recv"; const char *const kCheckPointForGetVar = "CheckPointGraphForGetVar"; const char *const kCheckPointGraph = "checkpoint_graph"; +const char *const kVectorEngine = "VectorEngine"; +const char *const kAIcoreEngine = "AIcoreEngine"; +const char *const kOffOptimize = "off_optimize"; bool IsTailingOptimization() { string is_tailing_optimization_option; @@ -125,7 +131,10 @@ bool IsTailingOptimization() { } // namespace namespace ge { -GraphManager::GraphManager() : thread_run_flag_(false), graph_run_listener_(nullptr), init_flag_(false) {} +GraphManager::GraphManager(OmgContext &omg_context) + : thread_run_flag_(false), graph_run_listener_(nullptr), init_flag_(false), omg_context_(omg_context) { + SetLocalOmgContext(omg_context); +} Status GraphManager::Initialize(const std::map &options) { if (init_flag_) { @@ -321,14 +330,56 @@ Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::Com return SUCCESS; } -Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph) { +Status GraphManager::CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_graph, + Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map ©_graphs) { + GE_CHECK_NOTNULL(compute_graph); + vector old_compute_graphs; + const auto &root_subgraph_list = sub_graph_map[compute_graph]; + for (const auto &subgraph : root_subgraph_list) { + old_compute_graphs.emplace_back(subgraph->GetSubGraph()); + } + for (const auto &function_graph : compute_graph->GetAllSubgraphs()) { + const auto &subgraph_list = sub_graph_map[function_graph]; + for (const auto &subgraph : subgraph_list) { + old_compute_graphs.emplace_back(subgraph->GetSubGraph()); + } + } + + for (const auto &old_compute_graph : old_compute_graphs) { + std::vector input_nodes; + std::vector output_nodes; + ComputeGraphPtr new_compute_graph = GraphUtils::CloneGraph(old_compute_graph, "", input_nodes, output_nodes); + if (new_compute_graph == nullptr) { + GELOGE(INTERNAL_ERROR, "Clone graph failed."); + return INTERNAL_ERROR; + } + copy_graphs.emplace(old_compute_graph->GetName(), new_compute_graph); + if (!AttrUtils::SetBool(old_compute_graph, ATTR_NAME_NEED_LX_FUSION, true)) { + GELOGE(INTERNAL_ERROR, "Set attr lx_fusion to graph failed."); + return INTERNAL_ERROR; + } + } + + GELOGI("Copy %zu graphs successfully.", copy_graphs.size()); + return SUCCESS; +} + +Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_graph, + Graph2SubGraphInfoList &sub_graph_map, uint64_t session_id) { + GE_CHECK_NOTNULL(compute_graph); // use default 16 multi thread const uint32_t thread_num = 16; ThreadPool executor(thread_num); - auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); std::vector> vector_future; const auto &root_subgraph_list = sub_graph_map[compute_graph]; + std::string op_compile_strategy; + (void)AttrUtils::GetStr(compute_graph, 
ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); + GELOGI("OptimizeSubGraphWithMultiThreads process op_compile_strategy: %s", op_compile_strategy.c_str()); for (const auto &subgraph : root_subgraph_list) { + if (!op_compile_strategy.empty()) { + (void)AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); + } std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph, session_id, GetThreadLocalContext()); if (!f.valid()) {
@@ -341,6 +392,9 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr for (auto &function_graph : compute_graph->GetAllSubgraphs()) { auto subgraph_list = sub_graph_map[function_graph]; for (const auto &subgraph : subgraph_list) { + if (!op_compile_strategy.empty()) { + (void)AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); + } std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph, session_id, GetThreadLocalContext()); if (!f.valid()) {
@@ -361,6 +415,130 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr return SUCCESS; } +bool GraphManager::CheckAllFusionOptimizeSuccess(const ComputeGraphPtr &compute_graph, + Graph2SubGraphInfoList &sub_graph_map) { + if (compute_graph == nullptr) { + GELOGE(PARAM_INVALID, "Input param compute_graph is nullptr."); + return false; + } + + /// 1. FE sets attr optimize_group to true (false) when lx fusion succeeds (fails); + /// 2. FE does not set attr optimize_group when fe.ini disables l2fusion; + /// 3. Other engines do not set attr optimize_group. + const auto &root_subgraph_list = sub_graph_map[compute_graph]; + for (const auto &subgraph : root_subgraph_list) { + bool optimize_group = true; + (void)AttrUtils::GetBool(subgraph->GetSubGraph(), ATTR_NAME_OPTIMIZE_GROUP, optimize_group); + if (!optimize_group) { + GELOGW("Run lx optimize for subgraph:%s failed.", subgraph->GetSubGraph()->GetName().c_str()); + return false; + } + } + for (auto &function_graph : compute_graph->GetAllSubgraphs()) { + const auto &subgraph_list = sub_graph_map[function_graph]; + for (const auto &subgraph : subgraph_list) { + bool optimize_group = true; + (void)AttrUtils::GetBool(subgraph->GetSubGraph(), ATTR_NAME_OPTIMIZE_GROUP, optimize_group); + if (!optimize_group) { + GELOGW("Run lx optimize for subgraph:%s failed.", subgraph->GetSubGraph()->GetName().c_str()); + return false; + } + } + } + GELOGI("All subgraphs are optimized successfully, no need to fall back to lx buffer optimize."); + return true; +} + +Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_graph, + Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map<std::string, ComputeGraphPtr> &copy_graphs) { + GE_CHECK_NOTNULL(compute_graph); + const auto &root_subgraph_list = sub_graph_map[compute_graph]; + for (const auto &subgraph : root_subgraph_list) { + auto iter = copy_graphs.find(subgraph->GetSubGraph()->GetName()); + if (iter == copy_graphs.end()) { + GELOGE(FAILED, "Cannot find subgraph:%s in copy graphs.", subgraph->GetSubGraph()->GetName().c_str()); + return FAILED; + } + subgraph->SetSubGraph(iter->second); + } + + for (auto &function_graph : compute_graph->GetAllSubgraphs()) { + const auto &subgraph_list = sub_graph_map[function_graph]; + for (const auto &subgraph : subgraph_list) { + auto iter = copy_graphs.find(subgraph->GetSubGraph()->GetName()); + if (iter == copy_graphs.end()) { + GELOGE(FAILED, "Cannot find subgraph:%s in copy graphs.", 
subgraph->GetSubGraph()->GetName().c_str()); + return FAILED; + } + subgraph->SetSubGraph(iter->second); + } + } + GELOGI("All subgraphs are successfully replaced."); + return SUCCESS; +} + +Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph) { + GE_CHECK_NOTNULL(compute_graph); + auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); + std::string buffer_optimize; + graphStatus graph_status = ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize); + bool need_lx_fusion = (graph_status == GRAPH_SUCCESS) && (buffer_optimize != kOffOptimize); + if (options_.build_mode.empty() && need_lx_fusion) { + GELOGI("Enter normal mode with buffer_optimize:%s.", buffer_optimize.c_str()); + /// 1. Back up subgraphs so buffer optimize can rerun if lx fusion fails. + /// 2. Mark each graph with attr "lx_fusion" for fusion optimize. + std::unordered_map<std::string, ComputeGraphPtr> copy_graphs; + GE_TIMESTAMP_START(CopySubGraphAndMarkFusion); + Status ret = CopySubGraphAndMarkFusion(compute_graph, sub_graph_map, copy_graphs); + GE_TIMESTAMP_EVENT_END(CopySubGraphAndMarkFusion, "SetSubgraph:CopySubGraphAndMarkFusion"); + if (ret != SUCCESS) { + GELOGE(ret, "CopySubGraphAndMarkFusion failed."); + return ret; + } + + // Multi-thread optimize subgraphs with lx fusion + ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Multi-thread optimize subgraph with lx fusion failed."); + return ret; + } + + // Check whether all subgraphs succeeded in lx fusion + GE_TIMESTAMP_START(CheckAllFusionOptimizeSuccess); + if (CheckAllFusionOptimizeSuccess(compute_graph, sub_graph_map)) { + GE_TIMESTAMP_EVENT_END(CheckAllFusionOptimizeSuccess, "SetSubgraph:CheckAllFusionOptimizeSuccess"); + return SUCCESS; + } + + // Replace subgraphs with the original graphs for lx buffer + ret = ReplaceSubgraphWithOriGraph(compute_graph, sub_graph_map, copy_graphs); + if (ret != SUCCESS) { + GELOGE(ret, "Replace subgraph with original graph failed."); + return ret; + } + + // Multi-thread optimize subgraphs with lx buffer + ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Multi-thread optimize subgraph with lx buffer failed."); + return ret; + } + } else { + /// Multi-thread optimize subgraphs: + /// 1. run lx buffer while build_mode is normal and buffer_optimize is empty or "off_optimize"; + /// 2. run lx fusion or buffer according to build_mode and build_step in FE. + GELOGI("Directly optimize subgraph with build mode:%s, and step:%s, buffer_optimize:%s.", + options_.build_mode.c_str(), options_.build_step.c_str(), buffer_optimize.c_str()); + Status ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Multi-thread optimize subgraph with lx buffer failed."); + return ret; + } + } + return SUCCESS; +} +
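The new SetSubgraph implements a try-then-fallback protocol around lx fusion. The condensed member below is a hypothetical restatement of its normal-mode branch over the same helpers (RunSubgraphOptimizeWithFallback is not a function from the diff; the attrs and steps are):

Status GraphManager::RunSubgraphOptimizeWithFallback(ComputeGraphPtr &compute_graph,
                                                     Graph2SubGraphInfoList &sub_graph_map,
                                                     uint64_t session_id) {
  std::unordered_map<std::string, ComputeGraphPtr> copy_graphs;
  // 1. Snapshot each subgraph and tag it with ATTR_NAME_NEED_LX_FUSION.
  GE_CHK_STATUS_RET(CopySubGraphAndMarkFusion(compute_graph, sub_graph_map, copy_graphs), "Backup failed.");
  // 2. Optimize all subgraphs in parallel; FE reports per-subgraph success via ATTR_NAME_OPTIMIZE_GROUP.
  GE_CHK_STATUS_RET(OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id), "Fusion failed.");
  // 3. If every subgraph kept its fusion result, the fused graphs stand.
  if (CheckAllFusionOptimizeSuccess(compute_graph, sub_graph_map)) {
    return SUCCESS;
  }
  // 4. Otherwise restore the snapshots and rerun the optimization in lx buffer mode.
  GE_CHK_STATUS_RET(ReplaceSubgraphWithOriGraph(compute_graph, sub_graph_map, copy_graphs), "Restore failed.");
  return OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id);
}

#define GM_RUN_AND_DUMP_PERF(name, func, ...) 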
\ do { \ GE_RUN_PERF(GraphManager, func, __VA_ARGS__); \
@@ -368,18 +546,10 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr GELOGI("Run %s on graph %s(%u) success.", name, compute_graph->GetName().c_str(), graph_node->GetGraphId()); \ } while (0) -Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, - GeRootModelPtr &ge_root_model, uint64_t session_id) { +Status GraphManager::PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, + ge::ComputeGraphPtr &compute_graph, uint64_t session_id) { GE_CHECK_NOTNULL(graph_node); - GE_CHECK_NOTNULL(graph_node->GetGraph()); - auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); GE_CHECK_NOTNULL(compute_graph); - - GEEVENT("PreRun start, graph node size %zu, session id %lu, graph id %u, graph name %s", - compute_graph->GetDirectNodesSize(), session_id, compute_graph->GetGraphID(), - compute_graph->GetName().c_str()); - GE_DUMP(compute_graph, "PreRunBegin"); - GM_RUN_AND_DUMP_PERF("OptimizeGraphPrepare", graph_optimize_.OptimizeOriginalGraphForQuantize, compute_graph); GM_RUN_AND_DUMP_PERF("HandleSummaryOp", graph_optimize_.HandleSummaryOp, compute_graph); GM_RUN_AND_DUMP_PERF("Prepare", graph_preparer_.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph,
@@ -388,10 +558,6 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorInferShapeInNeed); - const char *unknown_shape_skip = std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION"); - if (unknown_shape_skip != nullptr) { - PassManager graph_pass; - GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) - GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); - } + + PassManager graph_pass; + GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) + GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); GE_CHK_STATUS_RET(graph_optimize_.IdentifyReference(compute_graph), "Identify reference failed."); + GELOGI("PreRun:PreRunOptimizeOriginalGraph success."); + return SUCCESS; +} + +Status GraphManager::PreRunOptimizeSubGraph(const GraphNodePtr &graph_node, ge::ComputeGraphPtr &compute_graph, + uint64_t session_id) { + GE_CHECK_NOTNULL(graph_node); + GE_CHECK_NOTNULL(compute_graph); GM_RUN_AND_DUMP_PERF("OptimizeSubgraph", OptimizeSubgraph, graph_node, compute_graph, session_id); + + // Dump graph to tuning path + if (options_.build_mode == BUILD_MODE_TUNING && options_.build_step == BUILD_STEP_AFTER_UB_MATCH) { + std::string tuning_path; + (void)GetContext().GetOption(TUNING_PATH, tuning_path); + GELOGI("Dump path:%s.", tuning_path.c_str()); + GraphUtils::DumpGEGraph(compute_graph, "", true, tuning_path); + } + GELOGI("PreRun:PreRunOptimizeSubGraph success."); + return SUCCESS; +} + +Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, + GeRootModelPtr &ge_root_model, uint64_t session_id) { + GE_CHECK_NOTNULL(graph_node); + GE_CHECK_NOTNULL(compute_graph); GM_RUN_AND_DUMP_PERF("Optimize2", OptimizeStage2, compute_graph); GM_RUN_AND_DUMP_PERF("OptimizeGraphBeforeBuildForRts", graph_optimize_.OptimizeGraphBeforeBuildForRts, compute_graph); GM_RUN_AND_DUMP_PERF("Build", Build, graph_node, compute_graph, ge_root_model, session_id); + GELOGI("PreRun:PreRunAfterOptimizeSubGraph success."); + return SUCCESS; +} + +Status GraphManager::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint64_t 
session_id, uint32_t graph_id) { + GELOGI("Set rt_context, session id: %lu, graph id: %u, mode %d, device id:%u.", session_id, graph_id, + static_cast<int>(mode), ge::GetContext().DeviceId()); + + rtError_t rt_ret = rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId()); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return FAILED; + } + rt_ret = rtCtxSetCurrent(rt_context); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return FAILED; + } + RtContextUtil::GetInstance().AddRtContext(session_id, graph_id, rt_context); + return SUCCESS; +} + +Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, + GeRootModelPtr &ge_root_model, uint64_t session_id) { + GE_CHECK_NOTNULL(graph_node); + GE_CHECK_NOTNULL(graph_node->GetGraph()); + auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); + GE_CHECK_NOTNULL(compute_graph); + compute_graph->SetSessionID(session_id); + auto analyzer_instance = Analyzer::GetInstance(); + GE_CHECK_NOTNULL(analyzer_instance); + GE_CHK_STATUS_RET(analyzer_instance->BuildJsonObject(session_id, compute_graph->GetGraphID()), + "BuildJsonObject Failed") + + GEEVENT("PreRun start, graph node size %zu, session id %lu, graph id %u, graph name %s", + compute_graph->GetDirectNodesSize(), session_id, compute_graph->GetGraphID(), + compute_graph->GetName().c_str()); + GE_DUMP(compute_graph, "PreRunBegin"); + // Create a generate-mode rt context for this graph build. + Status ret = SetRtContext(rtContext_t(), RT_CTX_GEN_MODE, session_id, compute_graph->GetGraphID()); + if (ret != SUCCESS) { + GELOGE(ret, "Set rt context failed."); + return ret; + } + + /// 1. BUILD_MODE_TUNING with BUILD_STEP_AFTER_UB_MATCH no need PreRunOptimizeOriginalGraph; + /// 2. BUILD_MODE_TUNING with BUILD_STEP_AFTER_MERGE no need PreRunOptimizeOriginalGraph; + /// 3. BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER_SUB no need PreRunOptimizeOriginalGraph. + bool run_optimize_original_graph = + !((options_.build_mode == BUILD_MODE_TUNING) && + (options_.build_step == BUILD_STEP_AFTER_UB_MATCH || options_.build_step == BUILD_STEP_AFTER_MERGE || + options_.build_step == BUILD_STEP_AFTER_BUILDER_SUB)); + if (run_optimize_original_graph) { + Status ret = PreRunOptimizeOriginalGraph(graph_node, inputs, compute_graph, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Run PreRunOptimizeOriginalGraph failed for graph:%s.", compute_graph->GetName().c_str()); + return ret; + } + } + + // BUILD_MODE_TUNING with BUILD_STEP_AFTER_MERGE no need PreRunOptimizeSubGraph. + bool run_optimize_subgraph = + !((options_.build_mode == BUILD_MODE_TUNING) && (options_.build_step == BUILD_STEP_AFTER_MERGE)); + if (run_optimize_subgraph) { + Status ret = PreRunOptimizeSubGraph(graph_node, compute_graph, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Run PreRunOptimizeSubGraph failed for graph:%s.", compute_graph->GetName().c_str()); + return ret; + } + } + + /// 1. BUILD_MODE_TUNING with BUILD_STEP_BEFORE_UB_MATCH no need PreRunAfterOptimizeSubGraph; + /// 2. BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER no need PreRunAfterOptimizeSubGraph; + /// 3. BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER_SUB no need PreRunAfterOptimizeSubGraph. + bool run_after_optimize_subgraph = + !((options_.build_mode == BUILD_MODE_TUNING) && + (options_.build_step == BUILD_STEP_BEFORE_UB_MATCH || options_.build_step == BUILD_STEP_AFTER_BUILDER || + options_.build_step == BUILD_STEP_AFTER_BUILDER_SUB)); + if (run_after_optimize_subgraph) { + Status ret = PreRunAfterOptimizeSubGraph(graph_node, compute_graph, ge_root_model, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Run PreRunAfterOptimizeSubGraph failed for graph:%s.", compute_graph->GetName().c_str()); + return ret; + } + } // when set incre build, save om model and var manager GeModelPtr ge_model = nullptr;
@@ -456,7 +728,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: if (ret != SUCCESS) { ret = PreRun(graph_node, inputs, ge_root_model, session_id); // release rts generate context - RtContextUtil::GetInstance().DestroyRtContexts(session_id); + RtContextUtil::GetInstance().DestroyRtContexts(session_id, graph_node->GetGraphId()); if (ret != SUCCESS) { GELOGE(ret, "PreRun Failed."); return ret;
@@ -1065,7 +1337,7 @@ Status GraphManager::ParseOptions(const std::map &opti // net output node dataType ParseOption(options, OUTPUT_DATATYPE, options_.output_datatype); if (!options_.output_datatype.empty()) { - domi::GetContext().output_type = options_.output_datatype; + omg_context_.output_type = options_.output_datatype; } // Set save_original_model flag (ge.save_original_model)
@@ -1074,6 +1346,10 @@ // Original model file name ParseOption(options, ORIGINAL_MODEL_FILE, options_.original_model_file); + // Set build mode and step + ParseOption(options, BUILD_MODE, options_.build_mode); + ParseOption(options, BUILD_STEP, options_.build_step); + return SUCCESS; }
@@ -1659,6 +1935,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { ReshapeRemovePass reshape_remove_pass; ConstantFoldingPass constant_folding_pass; DimensionAdjustPass dimension_adjust_pass; + EnterPass enter_pass; AddNPass addn_pass; SwitchDeadBranchElimination switch_dead_branch_elimination; SwitchLogicRemovePass switch_logic_remove_pass;
@@ -1667,15 +1944,16 @@ TransposeTransDataPass transpose_transdata_pass; TransOpSymmetryEliminationPass symmetry_elimination_pass; DimensionComputePass dimension_compute_pass; + names_to_passes.emplace_back("EnterPass", &enter_pass); names_to_passes.emplace_back("AddNPass", &addn_pass); names_to_passes.emplace_back("SwitchDeadBranchElimination", &switch_dead_branch_elimination); names_to_passes.emplace_back("SwitchLogicRemovePass", &switch_logic_remove_pass); names_to_passes.emplace_back("MergePass", &merge_pass); names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass); names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass); + names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass); names_to_passes.emplace_back("TransOpNearbyAllreduceFusionPass", &trans_op_nearby_allreduce_fusion_pass); - names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass);
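PreRun is now split into three phases, each gated on build_mode/build_step. A small hypothetical helper (not in the diff) captures the rule the three checks above repeat; the constants are the real BUILD_* options referenced in the code:

#include <set>
#include <string>

// A phase always runs outside tuning mode; in tuning mode it is skipped
// when the configured build_step appears in that phase's skip set.
static bool RunPhase(const std::string &build_mode, const std::string &build_step,
                     const std::set<std::string> &steps_that_skip_this_phase) {
  return !(build_mode == BUILD_MODE_TUNING && steps_that_skip_this_phase.count(build_step) > 0);
}
// PreRunOptimizeOriginalGraph: skip set {BUILD_STEP_AFTER_UB_MATCH, BUILD_STEP_AFTER_MERGE, BUILD_STEP_AFTER_BUILDER_SUB}
// PreRunOptimizeSubGraph:      skip set {BUILD_STEP_AFTER_MERGE}
// PreRunAfterOptimizeSubGraph: skip set {BUILD_STEP_BEFORE_UB_MATCH, BUILD_STEP_AFTER_BUILDER, BUILD_STEP_AFTER_BUILDER_SUB}

@@ -1975,6 +2253,7 @@ Status 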
GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager Status ret = SUCCESS; GetThreadLocalContext() = ge_context; if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) { + SetLocalOmgContext(graph_manager->omg_context_); ComputeGraphPtr compute_graph_tmp = sub_graph_info_ptr->GetSubGraph(); const std::string &engine_name = sub_graph_info_ptr->GetEngineName(); GELOGI("ProcessSubGraphWithMultiThreads start, graph name is %s, engine_name is %s, thread id is %lu",
@@ -2079,6 +2358,8 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_PreRun")) != 0) { GELOGW("Set thread name failed."); } + SetLocalOmgContext(graph_manager->omg_context_); + PreRunArgs args; while (graph_manager->thread_run_flag_) { bool pop_status = graph_manager->prerun_args_q_.Pop(args);
@@ -2146,10 +2427,10 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { ret = graph_manager->PreRun(graph_node, ge_inputs, ge_root_model, args.session_id); // release rts generate context - RtContextUtil::GetInstance().DestroyRtContexts(args.session_id); + RtContextUtil::GetInstance().DestroyRtContexts(args.session_id, graph_node->GetGraphId()); if (ret != SUCCESS) { graph_node->SetRunFlag(false); - if (!std::getenv("AnalyzeMode")) { + if (!ge::Analyzer::GetInstance()->IsEnableNetAnalyzeDebug()) { ReturnError(graph_manager, args.callback, ret, "PreRun Failed, thread exit.."); graph_node->Unlock(); return;
@@ -2176,6 +2457,8 @@ void GraphManager::RunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_Run")) != 0) { GELOGW("Set thread name failed."); } + SetLocalOmgContext(graph_manager->omg_context_); + RunArgs args; while (graph_manager->thread_run_flag_) { bool pop_status = graph_manager->run_args_q_.Pop(args);
@@ -2287,17 +2570,11 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ return; } tensor.length = len * size; - auto pbuff = new (std::nothrow) uint8_t[tensor.length]; - if (!pbuff) { - GELOGE(MEMALLOC_FAILED, "new buff failed!"); - callback(GRAPH_FAILED, outputs); - return; - } + tensor.data.reset(new (std::nothrow) uint8_t[tensor.length]); + if (tensor.data == nullptr) { + GELOGE(MEMALLOC_FAILED, "new buff failed!"); + callback(GRAPH_FAILED, outputs); + return; + } // To avoid global step too small and can not stop, totally set a bigger value for (int64_t i = 0; i < tensor.length; i++) { - *(pbuff + i) = 0x7F; // here stands for a positive max value + tensor.data[i] = 0x7F; // here stands for a positive max value } - tensor.data.reset(pbuff); outputs.emplace_back(std::move(tensor)); } }
@@ -2373,6 +2650,20 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra return ret; } GE_TIMESTAMP_EVENT_END(SetSubgraph, "OptimizeSubgraph::SetSubGraph"); + if ((options_.build_mode == BUILD_MODE_TUNING) && + (options_.build_step == BUILD_STEP_BEFORE_UB_MATCH || options_.build_step == BUILD_STEP_AFTER_BUILDER || + options_.build_step == BUILD_STEP_AFTER_BUILDER_SUB)) { + GE_TIMESTAMP_START(ConvertGraphToFile); + std::string tuning_path; + (void)GetContext().GetOption(TUNING_PATH, tuning_path); + Status ret = ConvertGraphToFile(compute_graph, tuning_path, (options_.build_step == BUILD_STEP_AFTER_BUILDER)); + if (ret != SUCCESS) { + GELOGE(ret, "Convert graph[%s] to file failed", compute_graph->GetName().c_str()); + return ret; + } + GE_TIMESTAMP_EVENT_END(ConvertGraphToFile, "OptimizeSubgraph::ConvertGraphToFile"); + return SUCCESS; + } ComputeGraphPtr merged_compute_graph = nullptr; std::vector<ComputeGraphPtr> merged_sub_graph_list;
@@ -2400,6 +2691,32 @@ Status 
GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra } return SUCCESS; } + +Status GraphManager::ConvertGraphToFile(ComputeGraphPtr &compute_graph, std::string path, bool exe_flag) { + GE_CHECK_NOTNULL(compute_graph); + GELOGI("compute_graph [%s] path [%s] Enter ConvertGraphToFile.", compute_graph->GetName().c_str(), path.c_str()); + std::vector<ComputeGraphPtr> non_tuning_subgraphs; + const auto &input_node_sub_graph_map = graph_partitioner_.graph_2_input_subgraph_; + const auto &input_subgraph_info = input_node_sub_graph_map.at(compute_graph); + GE_CHECK_NOTNULL(input_subgraph_info); + ComputeGraphPtr input_graph_tmp = input_subgraph_info->GetSubGraph(); + non_tuning_subgraphs.push_back(input_graph_tmp); + auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); + const auto &subgraph_infos = sub_graph_map[compute_graph]; + std::vector<ComputeGraphPtr> tuning_subgraphs; + for (const auto &sub_graph_info_ptr : subgraph_infos) { + GE_CHECK_NOTNULL(sub_graph_info_ptr); + ComputeGraphPtr sub_graph_tmp = sub_graph_info_ptr->GetSubGraph(); + // needs tuning + if (sub_graph_info_ptr->GetEngineName() == kVectorEngine || sub_graph_info_ptr->GetEngineName() == kAIcoreEngine) { + tuning_subgraphs.push_back(sub_graph_tmp); + } else { + non_tuning_subgraphs.push_back(sub_graph_tmp); + } + } + return TuningUtils::ConvertGraphToFile(tuning_subgraphs, non_tuning_subgraphs, exe_flag, path); +} + Status GraphManager::Build(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, GeRootModelPtr &ge_root_model, uint64_t session_id) { // build
diff --git a/src/ge/graph/manager/graph_manager.h b/src/ge/graph/manager/graph_manager.h index 6dc83120..9096f4a8 100644 --- a/src/ge/graph/manager/graph_manager.h +++ b/src/ge/graph/manager/graph_manager.h
@@ -39,12 +39,13 @@ #include "graph/optimize/graph_optimize.h" #include "graph/partition/graph_partition.h" #include "graph/preprocess/graph_preprocess.h" +#include "graph/tuning_utils.h" #include "model/ge_model.h" namespace ge { class GraphManager { public: - GraphManager(); + explicit GraphManager(OmgContext &omg_context); ~GraphManager() = default;
@@ -248,6 +249,8 @@ class GraphManager { Status MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph); + Status ConvertGraphToFile(ComputeGraphPtr &compute_graph, std::string file_path, bool exe_flag = false); + Status SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph); void SetAttrForHcomBroadCastOp(ge::ComputeGraphPtr &compute_graph);
@@ -304,6 +307,25 @@ class GraphManager { void ChangeConstTypeWhenTraining(const ComputeGraphPtr &compute_graph); + Status PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, + ge::ComputeGraphPtr &compute_graph, uint64_t session_id); + Status PreRunOptimizeSubGraph(const GraphNodePtr &graph_node, ge::ComputeGraphPtr &compute_graph, + uint64_t session_id); + Status PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, + GeRootModelPtr &ge_root_model, uint64_t session_id); + + Status CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map<std::string, ComputeGraphPtr> &copy_graphs); + + Status OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_graph, Graph2SubGraphInfoList &sub_graph_map, + uint64_t session_id); + + bool CheckAllFusionOptimizeSuccess(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map); + + Status ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map<std::string, ComputeGraphPtr> &copy_graphs); + Status SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint64_t session_id, uint32_t graph_id); + std::atomic_bool thread_run_flag_; BlockingQueue<PreRunArgs> prerun_args_q_{}; BlockingQueue<RunArgs> run_args_q_{};
@@ -326,6 +348,7 @@ class GraphManager { bool init_flag_; GraphManagerOptions options_; + OmgContext &omg_context_; GraphPrepare graph_preparer_; GraphOptimize graph_optimize_;
diff --git a/src/ge/graph/manager/graph_manager_utils.cc b/src/ge/graph/manager/graph_manager_utils.cc index 90f91c8e..edacadb9 100644 --- a/src/ge/graph/manager/graph_manager_utils.cc +++ b/src/ge/graph/manager/graph_manager_utils.cc
@@ -163,42 +163,4 @@ bool HasCalcOp(const ComputeGraphPtr &graph) { return false; } - -Status ParseOutNodes(const string &out_nodes) { - try { - if (!out_nodes.empty()) { - domi::GetContext().out_nodes_map.clear(); - domi::GetContext().user_out_nodes.clear(); - - vector<string> nodes_v = StringUtils::Split(out_nodes, ';'); - for (const string &node : nodes_v) { - vector<string> key_value_v = StringUtils::Split(node, ':'); - if (key_value_v.size() != 2) { // must contain 2 items - GELOGE(GE_GRAPH_PARAM_NULLPTR, "Invalid outNodes: %s", node.c_str()); - return GE_GRAPH_PARAM_NULLPTR; - } - auto iter = domi::GetContext().out_nodes_map.find(key_value_v[0]); - int32_t index = std::stoi(StringUtils::Trim(key_value_v[1])); - if (iter != domi::GetContext().out_nodes_map.end()) { - iter->second.emplace_back(index); - } else { - std::vector<int32_t> index_v; - index_v.emplace_back(index); - domi::GetContext().out_nodes_map.emplace(key_value_v[0], index_v); - } - domi::GetContext().user_out_nodes.emplace_back(key_value_v[0], index); - } - } - } catch (std::invalid_argument &) { - GELOGE(PARAM_INVALID, "out nodes: %s, key value[1] is invalid argument", out_nodes.c_str()); - return PARAM_INVALID; - } catch (std::out_of_range &) { - GELOGE(PARAM_INVALID, "out nodes: %s, key value[1] is out of range", out_nodes.c_str()); - return PARAM_INVALID; - } catch (...) 
{ - GELOGE(GE_GRAPH_PARAM_NULLPTR, "Invalid outNodes: %s", out_nodes.c_str()); - return GE_GRAPH_PARAM_NULLPTR; - } - return SUCCESS; -} } // namespace ge
diff --git a/src/ge/graph/manager/graph_manager_utils.h b/src/ge/graph/manager/graph_manager_utils.h index 869d4a81..be39df21 100644 --- a/src/ge/graph/manager/graph_manager_utils.h +++ b/src/ge/graph/manager/graph_manager_utils.h
@@ -116,6 +116,7 @@ class SubGraphInfo { using SubGraphInfoPtr = std::shared_ptr<SubGraphInfo>; using Graph2SubGraphInfoList = std::unordered_map<ComputeGraphPtr, std::vector<SubGraphInfoPtr>>; +using Graph2InputNodesSubGraphInfo = std::unordered_map<ComputeGraphPtr, SubGraphInfoPtr>; // for run graph async listener class RunAsyncListener : public ge::ModelListener {
@@ -220,8 +221,6 @@ class GraphModelListener : public ge::ModelListener { std::condition_variable &condition_; }; -Status ParseOutNodes(const string &out_nodes); - struct GraphManagerOptions { int32_t stream_num; int32_t perf_level;
@@ -248,6 +247,8 @@ struct GraphManagerOptions { std::string output_datatype; std::string original_model_file; std::string save_original_model; + std::string build_mode; + std::string build_step; GraphManagerOptions() : stream_num(1), perf_level(domi::GEN_TASK_WITHOUT_FUSION),
@@ -269,7 +270,9 @@ struct GraphManagerOptions { hcom_parallel(false), enable_print_op_pass(true), is_single_op(false), - save_original_model("false") {} + save_original_model("false"), + build_mode(""), + build_step("") {} }; } // namespace ge
diff --git a/src/ge/graph/manager/graph_mem_allocator.cc b/src/ge/graph/manager/graph_mem_allocator.cc index e63039dc..20ca12ae 100644 --- a/src/ge/graph/manager/graph_mem_allocator.cc +++ b/src/ge/graph/manager/graph_mem_allocator.cc
@@ -15,13 +15,13 @@ */ #include "graph/manager/graph_mem_allocator.h" -#include "graph/manager/graph_caching_allocator.h" #include #include -#include #include "framework/common/debug/ge_log.h" +#include "graph/manager/graph_caching_allocator.h" +#include "graph/manager/rdma_pool_allocator.h" namespace ge { void MemoryAllocator::Initialize(uint32_t device_id) {
@@ -185,30 +185,36 @@ Status MemManager::Initialize(const std::vector<rtMemType_t> &memory_type) { } } - return InitCachingAllocator(memory_type); + if (InitAllocator(memory_type, caching_allocator_map_) != SUCCESS) { + GELOGE(ge::INTERNAL_ERROR, "Create CachingAllocator failed."); + return ge::INTERNAL_ERROR; + } + if (InitAllocator(memory_type, rdma_allocator_map_) != SUCCESS) { + GELOGE(ge::INTERNAL_ERROR, "Create RdmaAllocator failed."); + return ge::INTERNAL_ERROR; + } + return SUCCESS; } -void MemManager::Finalize() noexcept { - GELOGI("Finalize."); - std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); - // caching allocator use memory allocator, so finalize it first - for (auto &caching_allocator : caching_allocator_map_) { - if (caching_allocator.second != nullptr) { - caching_allocator.second->Finalize(); - delete caching_allocator.second; - caching_allocator.second = nullptr; +template <typename T> +void FinalizeAllocatorMap(std::map<rtMemType_t, T *> &allocate_map) { + for (auto &allocator : allocate_map) { + if (allocator.second != nullptr) { + allocator.second->Finalize(); + delete allocator.second; + allocator.second = nullptr; } } - caching_allocator_map_.clear(); + allocate_map.clear(); +} - for (auto &memory_allocator : memory_allocator_map_) { - if (memory_allocator.second != nullptr) { - memory_allocator.second->Finalize(); - delete memory_allocator.second; - memory_allocator.second = nullptr; - } - } - memory_allocator_map_.clear(); +void MemManager::Finalize() noexcept { + GELOGI("Finalize."); + std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); + // caching 
and rdma allocator use memory allocator, so finalize them first + FinalizeAllocatorMap(caching_allocator_map_); + FinalizeAllocatorMap(rdma_allocator_map_); + FinalizeAllocatorMap(memory_allocator_map_); } MemoryAllocator *MemManager::GetMemoryAllocator(rtMemType_t memory_type) {
@@ -229,53 +235,11 @@ MemoryAllocator *MemManager::GetMemoryAllocator(rtMemType_t memory_type) { return memory_allocator; } -Status MemManager::InitCachingAllocator(const std::vector<rtMemType_t> &memory_type) { - CachingAllocator *caching_allocator = nullptr; - for (unsigned int index : memory_type) { - auto it = caching_allocator_map_.find(index); - if (it == caching_allocator_map_.end()) { - caching_allocator = new (std::nothrow) CachingAllocator(index); - if (caching_allocator != nullptr) { - caching_allocator_map_[index] = caching_allocator; - GELOGI("Create CachingAllocator memory type[%u] success.", index); - } else { - GELOGE(ge::INTERNAL_ERROR, "Alloc CachingAllocator failed."); - } - } else { - caching_allocator = it->second; - } - - if (caching_allocator == nullptr) { - GELOGE(ge::INTERNAL_ERROR, "Create CachingAllocator failed."); - return ge::INTERNAL_ERROR; - } else { - if (caching_allocator->Initialize() != ge::SUCCESS) { - return ge::INTERNAL_ERROR; - } - } - } - return ge::SUCCESS; -} - -CachingAllocator &MemManager::GetCachingAllocator(rtMemType_t memory_type) { - std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); - CachingAllocator *caching_allocator = nullptr; - auto it = caching_allocator_map_.find(memory_type); - if (it != caching_allocator_map_.end()) { - caching_allocator = it->second; - } - - // Usually impossible - if (caching_allocator == nullptr) { - GELOGE(ge::INTERNAL_ERROR, "GetCachingAllocator failed, memory type is %u.", memory_type); - static CachingAllocator default_caching_allocator(RT_MEMORY_RESERVED); - return default_caching_allocator; - ; - } - return *caching_allocator; +CachingAllocator &MemManager::CachingInstance(rtMemType_t memory_type) { + return Instance().GetAllocator(memory_type, caching_allocator_map_); } -CachingAllocator &MemManager::CachingInstance(rtMemType_t memory_type) { - return Instance().GetCachingAllocator(memory_type); +RdmaPoolAllocator &MemManager::RdmaPoolInstance(rtMemType_t memory_type) { + return Instance().GetAllocator(memory_type, rdma_allocator_map_); } } // namespace ge
diff --git a/src/ge/graph/manager/graph_mem_allocator.h b/src/ge/graph/manager/graph_mem_allocator.h index e4eeded3..bebdedb6 100644 --- a/src/ge/graph/manager/graph_mem_allocator.h +++ b/src/ge/graph/manager/graph_mem_allocator.h
@@ -24,6 +24,7 @@ #include #include +#include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" #include "runtime/mem.h"
@@ -136,6 +137,7 @@ class MemoryAllocator { using MemoryAllocatorPtr = std::shared_ptr<MemoryAllocator>; class CachingAllocator; +class RdmaPoolAllocator; class MemManager { public:
@@ -143,7 +145,8 @@ class MemManager { virtual ~MemManager(); static MemManager &Instance(); static MemoryAllocator *Instance(rtMemType_t memory_type); - static CachingAllocator &CachingInstance(rtMemType_t memory_type); + CachingAllocator &CachingInstance(rtMemType_t memory_type); + RdmaPoolAllocator &RdmaPoolInstance(rtMemType_t memory_type); MemManager(const MemManager &) = delete; MemManager &operator=(const MemManager &) = delete; ///
@@ -172,22 +175,65 @@ class MemManager { /// /// @ingroup ge_graph - /// @brief ge caching allocator /// @param [in] memory_type memory type - /// @return CachingAllocator ptr + /// @param [in] allocate_map memory allocator map + /// @return Status result of function /// - CachingAllocator &GetCachingAllocator(rtMemType_t memory_type); - + template <typename T> + Status InitAllocator(const std::vector<rtMemType_t> &memory_type, std::map<rtMemType_t, T *> &allocate_map) { + T *allocator = nullptr; + for (unsigned int index : memory_type) { + auto it = allocate_map.find(index); + if (it == allocate_map.end()) { + allocator = new (std::nothrow) T(index); + if (allocator != nullptr) { + allocate_map[index] = allocator; + GELOGI("Create Allocator memory type[%u] success.", index); + } else { + GELOGE(INTERNAL_ERROR, "Alloc Allocator failed."); + } + } else { + allocator = it->second; + } + + if (allocator == nullptr) { + GELOGE(INTERNAL_ERROR, "Create Allocator failed."); + return INTERNAL_ERROR; + } else { + if (allocator->Initialize() != SUCCESS) { + return INTERNAL_ERROR; + } + } + } + return SUCCESS; + } /// /// @ingroup ge_graph - /// @brief ge create caching allocator /// @param [in] memory_type memory type - /// @return Status result of function - /// - Status InitCachingAllocator(const std::vector<rtMemType_t> &memory_type); + /// @param [in] allocate_map memory allocator map + /// @return Allocator ptr + /// + template <typename T> + T &GetAllocator(rtMemType_t memory_type, std::map<rtMemType_t, T *> &allocate_map) { + std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); + T *allocator = nullptr; + auto it = allocate_map.find(memory_type); + if (it != allocate_map.end()) { + allocator = it->second; + } + + // Usually impossible + if (allocator == nullptr) { + GELOGE(ge::INTERNAL_ERROR, "Get allocator failed, memory type is %u.", memory_type); + static T default_allocator(RT_MEMORY_RESERVED); + return default_allocator; + } + return *allocator; + } std::map<rtMemType_t, MemoryAllocator *> memory_allocator_map_; std::map<rtMemType_t, CachingAllocator *> caching_allocator_map_; + std::map<rtMemType_t, RdmaPoolAllocator *> rdma_allocator_map_; std::recursive_mutex allocator_mutex_; }; } // namespace ge
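MemManager now keeps one registry per allocator family, and InitAllocator/GetAllocator template over the concrete allocator type so CachingAllocator and RdmaPoolAllocator share the create-once/lookup logic. A hedged usage sketch follows (InitAllocatorsExample is hypothetical; RT_MEMORY_HBM is just an example memory type, and the RDMA pool still needs a one-time InitMemory(mem_size) before Malloc can succeed):

#include <vector>

Status InitAllocatorsExample() {
  std::vector<rtMemType_t> mem_types = {RT_MEMORY_HBM};
  if (MemManager::Instance().Initialize(mem_types) != SUCCESS) {  // fills all three allocator maps
    return FAILED;
  }
  // Both accessors are instance methods now and route through GetAllocator<T>.
  CachingAllocator &caching = MemManager::Instance().CachingInstance(RT_MEMORY_HBM);
  RdmaPoolAllocator &rdma = MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM);
  (void)caching;
  uint8_t *buf = rdma.Malloc(1024);  // device_id defaults to 0
  if (buf != nullptr) {
    (void)rdma.Free(buf, 0);
  }
  return SUCCESS;
}

diff --git a/src/ge/graph/manager/rdma_pool_allocator.cc b/src/ge/graph/manager/rdma_pool_allocator.cc index 1ff77e92..ef82deff 100644 --- a/src/ge/graph/manager/rdma_pool_allocator.cc +++ b/src/ge/graph/manager/rdma_pool_allocator.cc
@@ -15,7 +15,11 @@ */ #include "graph/manager/rdma_pool_allocator.h" + +#include #include "framework/common/debug/ge_log.h" +#include "graph/ge_context.h" +#include "runtime/dev.h" namespace { const size_t kAlignedSize = 512;
@@ -52,31 +56,41 @@ Status RdmaPoolAllocator::Initialize() { return ge::SUCCESS; } void RdmaPoolAllocator::Finalize() { + GELOGD("Rdma pool finalize start."); for (auto it = allocated_blocks_.begin(); it != allocated_blocks_.end();) { auto block = it->second; - allocated_blocks_.erase(it); + it = allocated_blocks_.erase(it); delete block; } for (auto it = block_bin_.begin(); it != block_bin_.end();) { auto block = *it; - block_bin_.erase(it); + it = block_bin_.erase(it); delete block; } if (rdma_base_addr_ != nullptr) { + GELOGD("Start to free rdma pool memory."); if (memory_allocator_->FreeMemory(rdma_base_addr_) != SUCCESS) { GELOGW("Free rdma pool memory failed"); } + rdma_base_addr_ = nullptr; } } -Status RdmaPoolAllocator::InitMemory(size_t mem_size, uint32_t device_id) { +Status RdmaPoolAllocator::InitMemory(size_t mem_size) { + auto device_id = GetContext().DeviceId(); + GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); if (rdma_base_addr_ != nullptr) { GELOGE(GE_MULTI_INIT, "Rdma pool has already been allocated"); return GE_MULTI_INIT; } const std::string purpose = "Memory for rdma pool."; std::lock_guard lock(mutex_); + auto dev_id = static_cast<int32_t>(device_id); + GE_CHK_RT_RET(rtSetDevice(dev_id)); + 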
// DeviceReset before memory finished! + GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(dev_id)); }); + rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id); if (rdma_base_addr_ == nullptr) { GELOGE(GE_GRAPH_MALLOC_FAILED, "Rdma pool memory malloc failed"); @@ -94,6 +108,7 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size, uint32_t device_id) { } uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { + GELOGI("start to malloc rdma memory size:%zu, device id = %u", size, device_id); auto aligned_size = GetAlignedBlockSize(size); Block key(device_id, aligned_size, nullptr); std::lock_guard lock(mutex_); @@ -107,9 +122,9 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { return nullptr; } allocated_blocks_.emplace(block->ptr, block); - GELOGI("Find block size = %zu", block->size); if (ShouldSplit(block, aligned_size)) { + GELOGD("Block will be split, block size = %zu, aligned_size:%zu", block->size, aligned_size); auto *new_block = new (std::nothrow) Block(device_id, block->size - aligned_size, nullptr, block->ptr + aligned_size); if (new_block == nullptr) { @@ -126,12 +141,14 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { block_bin_.insert(new_block); } + GELOGD("Find block size = %zu", block->size); return block->ptr; } + GELOGW("Memory block not found."); return nullptr; } Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { - GELOGI("Free device id = %u", device_id); + GELOGI("Free rdma memory, device id = %u", device_id); if (memory_addr == nullptr) { GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); return GE_GRAPH_FREE_FAILED; @@ -143,27 +160,41 @@ Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { GELOGE(PARAM_INVALID, "Invalid memory pointer"); return PARAM_INVALID; } + Block *block = it->second; block->allocated = false; allocated_blocks_.erase(it); + + Block *merge_blocks[] = {block->prev, block->next}; + for (Block *merge_block : merge_blocks) { + MergeBlocks(block, merge_block); + } block_bin_.insert(block); - // Each time merge with its pre and next.
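// A minimal standalone sketch of the two-direction coalescing introduced above,
// with a simplified Block (assumed fields ptr/size/prev/next/allocated); this is
// an illustration of the technique, not part of the patch:
//
//   void Merge(Block *dst, Block *src) {
//     if (src == nullptr || src->allocated) return;  // only free neighbours merge
//     if (dst->prev == src) {                        // absorb the left neighbour
//       dst->ptr = src->ptr;
//       dst->prev = src->prev;
//       if (dst->prev != nullptr) dst->prev->next = dst;
//     } else {                                       // absorb the right neighbour
//       dst->next = src->next;
//       if (dst->next != nullptr) dst->next->prev = dst;
//     }
//     dst->size += src->size;
//     delete src;
//   }
//
// Because the freed block itself stays as dst, Free() can merge both sides with
// a single loop over {block->prev, block->next}, as the patch does.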
- MergeBlockNearby(block, block->next); - MergeBlockNearby(block->prev, block); + return SUCCESS; } -void RdmaPoolAllocator::MergeBlockNearby(Block *pre_block, Block *block) { - if (!(CanMerge(pre_block) && CanMerge(block))) { +void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) { + if (!CanMerge(dst) || !CanMerge(src)) { return; } - pre_block->size += block->size; - pre_block->next = block->next; - if (block->next != nullptr) { - block->next->prev = pre_block; + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev != nullptr) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next != nullptr) { + dst->next->prev = dst; + } } - block_bin_.erase(block); - delete block; + + dst->size += src->size; + block_bin_.erase(src); + delete src; } Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { diff --git a/src/ge/graph/manager/rdma_pool_allocator.h b/src/ge/graph/manager/rdma_pool_allocator.h index e1da29a9..4d8cf71e 100644 --- a/src/ge/graph/manager/rdma_pool_allocator.h +++ b/src/ge/graph/manager/rdma_pool_allocator.h @@ -40,12 +40,12 @@ class RdmaPoolAllocator { RdmaPoolAllocator &operator=(const RdmaPoolAllocator &) = delete; - ~RdmaPoolAllocator() { Finalize(); } + ~RdmaPoolAllocator() = default; Status Initialize(); void Finalize(); - Status InitMemory(size_t mem_size, uint32_t device_id = 0); + Status InitMemory(size_t mem_size); uint8_t *Malloc(size_t size, uint32_t device_id = 0); @@ -54,7 +54,7 @@ class RdmaPoolAllocator { Status GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size); private: - void MergeBlockNearby(Block *pre_block, Block *block); + void MergeBlocks(Block *dst, Block *src); rtMemType_t memory_type_; size_t rdma_mem_size_ = 0; // Total rdma memory size to be allocated. 
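The memory-manager header change earlier in this patch replaces the caching-allocator-specific helpers with templated InitAllocator/GetAllocator, so caching and RDMA allocators share one create-once registry per memory type. A minimal sketch of that registry pattern, assuming the maps are keyed by the runtime memory type (simplified stand-in names; illustration only):

#include <map>
#include <mutex>

using MemType = unsigned int;  // stand-in for rtMemType_t

// T is assumed to be constructible from a MemType and to provide Initialize()
// returning 0 on success.
template <typename T>
T *GetOrCreateAllocator(MemType type, std::map<MemType, T *> &registry, std::recursive_mutex &mu) {
  std::lock_guard<std::recursive_mutex> lock(mu);
  auto it = registry.find(type);
  if (it == registry.end()) {
    auto *allocator = new (std::nothrow) T(type);  // one allocator instance per memory type
    if (allocator == nullptr || allocator->Initialize() != 0) {
      delete allocator;
      return nullptr;
    }
    it = registry.emplace(type, allocator).first;
  }
  return it->second;
}

Each allocator family (memory, caching, RDMA) keeps its own map member, and the template keeps the create-or-look-up logic in a single place.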
diff --git a/src/ge/graph/manager/util/hcom_util.cc b/src/ge/graph/manager/util/hcom_util.cc index 5f31c982..614f8527 100644 --- a/src/ge/graph/manager/util/hcom_util.cc +++ b/src/ge/graph/manager/util/hcom_util.cc @@ -63,7 +63,7 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, return SUCCESS; } -Status HcomOmeUtil::GetHcclTypeSize(hcclDataType_t data_type, int32_t &size) { +Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) { auto iter = kConstOpHcclDataTypeSize.find(data_type); GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID, "HcomOmeUtil::HcomDataTypeSize , No DataTypeSize!"); @@ -72,7 +72,7 @@ Status HcomOmeUtil::GetHcclTypeSize(hcclDataType_t data_type, int32_t &size) { return SUCCESS; } -Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather, +Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType data_type, bool is_allgather, int &count) { GE_CHECK_NOTNULL(op_desc); if (!IsHCOMOp(op_desc->GetType())) { @@ -149,7 +149,7 @@ Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc, int64_t align_size = 512; int32_t size = 0; for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<hcclDataType_t>(kernel_hccl_infos[i].dataType), size), + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<HcclDataType>(kernel_hccl_infos[i].dataType), size), "GetHorovodCount: GetHcclTypeSize fail!"); int64_t input_size = 0; int64_t block_size = 0; @@ -187,7 +187,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); if (IsHCOMOp(op_desc->GetType())) { int32_t count = 0; - ret = GetHcomCount(op_desc, static_cast<hcclDataType_t>(kernel_hccl_infos[0].dataType), + ret = GetHcomCount(op_desc, static_cast<HcclDataType>(kernel_hccl_infos[0].dataType), kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count); if (ret != SUCCESS) { GELOGE(ret, "HcomOmeUtil:: Node: %s Optype: %s get the Hcom operator hccl count fail.", @@ -209,7 +209,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, return SUCCESS; } -Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type) { +Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, HcclReduceOp &op_type) { GE_CHECK_NOTNULL(op_desc); if (IsHCOMOp(op_desc->GetType())) { @@ -219,13 +219,13 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hccl op_desc->GetName().c_str(), op_desc->GetType().c_str()); if (hcom_op_type == "min") { - op_type = HCCL_REP_OP_MIN; + op_type = HCCL_REDUCE_MIN; } else if (hcom_op_type == "max") { - op_type = HCCL_REP_OP_MAX; + op_type = HCCL_REDUCE_MAX; } else if (hcom_op_type == "prod") { - op_type = HCCL_REP_OP_PROD; + op_type = HCCL_REDUCE_PROD; } else if (hcom_op_type == "sum") { - op_type = HCCL_REP_OP_SUM; + op_type = HCCL_REDUCE_SUM; } else { GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str()); return PARAM_INVALID; @@ -239,7 +239,7 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hccl "HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!", op_desc->GetName().c_str(), op_desc->GetType().c_str()); - auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<horovodRedOp_t>(horovod_op_type)); + auto iter = 
kHorovodRedOpToHcclRedOp.find(static_cast<HorovodReduceOp>(horovod_op_type)); if (iter == kHorovodRedOpToHcclRedOp.end()) { GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType can't support! Current HcomOpType : %ld", op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); diff --git a/src/ge/graph/manager/util/hcom_util.h b/src/ge/graph/manager/util/hcom_util.h index e31e3ef0..064058f8 100644 --- a/src/ge/graph/manager/util/hcom_util.h +++ b/src/ge/graph/manager/util/hcom_util.h @@ -34,24 +34,24 @@ namespace ge { using std::string; using std::vector; -static std::map<ge::DataType, hcclDataType_t> kConstOpHcclDataType = { - {ge::DT_FLOAT, HCCL_DATA_TYPE_FLOAT}, - {ge::DT_FLOAT16, HCCL_DATA_TYPE_HALF}, +static std::map<ge::DataType, HcclDataType> kConstOpHcclDataType = { + {ge::DT_FLOAT, HCCL_DATA_TYPE_FP32}, + {ge::DT_FLOAT16, HCCL_DATA_TYPE_FP16}, {ge::DT_INT8, HCCL_DATA_TYPE_INT8}, - {ge::DT_INT32, HCCL_DATA_TYPE_INT}, + {ge::DT_INT32, HCCL_DATA_TYPE_INT32}, }; -static std::map<hcclDataType_t, int32_t> kConstOpHcclDataTypeSize = { - {HCCL_DATA_TYPE_FLOAT, sizeof(float)}, - {HCCL_DATA_TYPE_HALF, sizeof(float) / 2}, +static std::map<HcclDataType, int32_t> kConstOpHcclDataTypeSize = { + {HCCL_DATA_TYPE_FP32, sizeof(float)}, + {HCCL_DATA_TYPE_FP16, sizeof(float) / 2}, {HCCL_DATA_TYPE_INT8, sizeof(int8_t)}, - {HCCL_DATA_TYPE_INT, sizeof(int32_t)}, + {HCCL_DATA_TYPE_INT32, sizeof(int32_t)}, }; -static std::map<horovodRedOp_t, hcclRedOp_t> kHorovodRedOpToHcclRedOp = { - {HOROVOD_REP_OP_SUM, HCCL_REP_OP_SUM}, {HOROVOD_REP_OP_MIN, HCCL_REP_OP_MIN}, - {HOROVOD_REP_OP_MAX, HCCL_REP_OP_MAX}, {HOROVOD_REP_OP_PROD, HCCL_REP_OP_PROD}, - {HOROVOD_REP_OP_RESERVED, HCCL_REP_OP_RESERVED}, +static std::map<HorovodReduceOp, HcclReduceOp> kHorovodRedOpToHcclRedOp = { + {HOROVOD_REDUCE_SUM, HCCL_REDUCE_SUM}, {HOROVOD_REDUCE_MIN, HCCL_REDUCE_MIN}, + {HOROVOD_REDUCE_MAX, HCCL_REDUCE_MAX}, {HOROVOD_REDUCE_PROD, HCCL_REDUCE_PROD}, + {HOROVOD_REDUCE_RESERVED, HCCL_REDUCE_RESERVED}, }; class HcomOmeUtil { @@ -71,7 +71,7 @@ class HcomOmeUtil { /// @return SUCCESS /// @return FAIL /// - static Status GetHcclTypeSize(hcclDataType_t data_type, int32_t &size); + static Status GetHcclTypeSize(HcclDataType data_type, int32_t &size); /// /// @ingroup domi_ome @@ -87,7 +87,7 @@ class HcomOmeUtil { /// @return SUCCESS /// @return FAIL /// - static Status GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type); + static Status GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, HcclReduceOp &op_type); /// /// @ingroup domi_ome @@ -150,8 +150,7 @@ class HcomOmeUtil { /// @return SUCCESS /// @return FAIL /// - static Status GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather, - int &count); + static Status GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType data_type, bool is_allgather, int &count); private: /// diff --git a/src/ge/graph/manager/util/rt_context_util.cc b/src/ge/graph/manager/util/rt_context_util.cc index 63f217a9..75b25740 100644 --- a/src/ge/graph/manager/util/rt_context_util.cc +++ b/src/ge/graph/manager/util/rt_context_util.cc @@ -19,31 +19,57 @@ #include "framework/common/debug/ge_log.h" namespace ge { +namespace { +const int64_t kDefaultGraphId = -1; +} + void RtContextUtil::AddRtContext(uint64_t session_id, rtContext_t context) { std::lock_guard<std::mutex> lock(ctx_mutex_); - rt_contexts_[session_id].emplace_back(context); + rt_contexts_[session_id][kDefaultGraphId].emplace_back(context); +} + +void RtContextUtil::AddRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t context) { + std::lock_guard<std::mutex> lock(ctx_mutex_); + 
rt_contexts_[session_id][static_cast<int64_t>(graph_id)].emplace_back(context); } void RtContextUtil::DestroyRtContexts(uint64_t session_id) { std::lock_guard<std::mutex> lock(ctx_mutex_); - auto &contexts = rt_contexts_[session_id]; - DestroyRtContexts(session_id, contexts); + auto &session_ctxs = rt_contexts_[session_id]; + for (auto &graph_ctx_pair : session_ctxs) { + DestroyRtContexts(session_id, graph_ctx_pair.first, graph_ctx_pair.second); + } + auto iter = rt_contexts_.find(session_id); if (iter != rt_contexts_.end()) { rt_contexts_.erase(iter); } } +void RtContextUtil::DestroyRtContexts(uint64_t session_id, uint32_t graph_id) { + std::lock_guard<std::mutex> lock(ctx_mutex_); + auto &session_ctxs = rt_contexts_[session_id]; + auto &graph_ctxs = session_ctxs[graph_id]; + DestroyRtContexts(session_id, static_cast<int64_t>(graph_id), graph_ctxs); + + auto iter = session_ctxs.find(graph_id); + if (iter != session_ctxs.end()) { + session_ctxs.erase(iter); + } +} + void RtContextUtil::DestroyAllRtContexts() { std::lock_guard<std::mutex> lock(ctx_mutex_); - for (auto &ctx_pair : rt_contexts_) { - DestroyRtContexts(ctx_pair.first, ctx_pair.second); + for (auto &session_ctx_pair : rt_contexts_) { + for (auto &graph_ctx_pair : session_ctx_pair.second) { + DestroyRtContexts(session_ctx_pair.first, graph_ctx_pair.first, graph_ctx_pair.second); + } } rt_contexts_.clear(); } -void RtContextUtil::DestroyRtContexts(uint64_t session_id, std::vector<rtContext_t> &contexts) { - GELOGI("Runtime context handle number of session %lu is %zu.", session_id, contexts.size()); +void RtContextUtil::DestroyRtContexts(uint64_t session_id, int64_t graph_id, std::vector<rtContext_t> &contexts) { + GELOGI("Destroy %zu rt contexts for graph %ld of session %lu.", contexts.size(), graph_id, session_id); for (auto &rtContext : contexts) { (void)rtCtxDestroy(rtContext); } diff --git a/src/ge/graph/manager/util/rt_context_util.h b/src/ge/graph/manager/util/rt_context_util.h index 58cc0803..50f0fbed 100644 --- a/src/ge/graph/manager/util/rt_context_util.h +++ b/src/ge/graph/manager/util/rt_context_util.h @@ -32,12 +32,9 @@ class RtContextUtil { } void AddRtContext(uint64_t session_id, rtContext_t context); - - const rtContext_t GetNormalModeContext() const { return before_prerun_ctx_; } - - void SetNormalModeContext(rtContext_t context) { before_prerun_ctx_ = context; } - + void AddRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t context); void DestroyRtContexts(uint64_t session_id); + void DestroyRtContexts(uint64_t session_id, uint32_t graph_id); void DestroyAllRtContexts(); RtContextUtil &operator=(const RtContextUtil &) = delete; @@ -47,11 +44,9 @@ class RtContextUtil { RtContextUtil() = default; ~RtContextUtil() {} - void DestroyRtContexts(uint64_t session_id, std::vector<rtContext_t> &contexts); - - std::map<uint64_t, std::vector<rtContext_t>> rt_contexts_; - rtContext_t before_prerun_ctx_ = nullptr; + void DestroyRtContexts(uint64_t session_id, int64_t graph_id, std::vector<rtContext_t> &contexts); + std::map<uint64_t, std::map<int64_t, std::vector<rtContext_t>>> rt_contexts_; std::mutex ctx_mutex_; }; } // namespace ge diff --git a/src/ge/graph/optimize/graph_optimize.cc b/src/ge/graph/optimize/graph_optimize.cc index a8de6701..214f68eb 100644 --- a/src/ge/graph/optimize/graph_optimize.cc +++ b/src/ge/graph/optimize/graph_optimize.cc @@ -17,6 +17,7 @@ #include "graph/optimize/graph_optimize.h" #include "graph/ge_context.h" +#include "graph/common/local_context.h" #include "graph/passes/dimension_adjust_pass.h" #include "inc/pass_manager.h" #include "init/gelib.h" @@ -68,7 +69,7 @@ void AddNodeInputProperty(ComputeGraphPtr &compute_graph) {
src_index_list.emplace_back(peer_out_anchor->GetIdx()); node_op_desc->SetSrcName(src_name_list); node_op_desc->SetSrcIndex(src_index_list); - GE_IF_BOOL_EXEC(!(node_op_desc->GetType() == NETOUTPUT && domi::GetContext().type == domi::TENSORFLOW), + GE_IF_BOOL_EXEC(!(node_op_desc->GetType() == NETOUTPUT && GetLocalOmgContext().type == domi::TENSORFLOW), ge::NodePtr peer_owner_node = peer_out_anchor->GetOwnerNode(); input_name_list.emplace_back( peer_owner_node->GetName() + @@ -102,6 +103,17 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std return SUCCESS; } + if (build_mode_ == BUILD_MODE_TUNING && build_step_ == BUILD_STEP_AFTER_UB_MATCH) { + for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { + Status ret = (*iter)->OptimizeFusedGraphAfterGraphSlice(*(compute_graph)); + if (ret != SUCCESS) { + GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraphStage2]: graph optimize failed, ret:%d", ret); + return ret; + } + } + return SUCCESS; + } + for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { ret = (*iter)->OptimizeFusedGraph(*(compute_graph)); if (ret != SUCCESS) { @@ -264,6 +276,8 @@ Status GraphOptimize::SetOptions(const ge::GraphManagerOptions &options) { local_fmk_op_flag_ = options.local_fmk_op_flag; func_bin_path_ = options.func_bin_path; core_type_ = options.core_type; + build_mode_ = options.build_mode; + build_step_ = options.build_step; return SUCCESS; } diff --git a/src/ge/graph/optimize/graph_optimize.h b/src/ge/graph/optimize/graph_optimize.h index 0bbeb0f7..3d2db782 100644 --- a/src/ge/graph/optimize/graph_optimize.h +++ b/src/ge/graph/optimize/graph_optimize.h @@ -89,6 +89,8 @@ class GraphOptimize { // record the summary names for filtering summary results. std::map> summary_output_indexes_ = {}; std::string func_bin_path_; + std::string build_mode_; + std::string build_step_; }; } // namespace ge #endif // GE_GRAPH_OPTIMIZE_GRAPH_OPTIMIZE_H_ diff --git a/src/ge/graph/optimize/mem_rw_conflict_optimize.cc b/src/ge/graph/optimize/mem_rw_conflict_optimize.cc index 3ecc201a..9c166f4d 100644 --- a/src/ge/graph/optimize/mem_rw_conflict_optimize.cc +++ b/src/ge/graph/optimize/mem_rw_conflict_optimize.cc @@ -136,7 +136,7 @@ NodePtr CreateIdentityAfterSrcNode(const Node &src_node, int out_anchor_idx) { if (src_node.GetOpDesc() == nullptr) { return nullptr; } - static std::atomic identity_num(0); + static std::atomic_long identity_num(0); auto next_num = identity_num.fetch_add(1); // 1.
create new identity op desc string identity_name = src_node.GetName() + "_" + IDENTITY + std::to_string(next_num); @@ -541,9 +541,8 @@ Status SplitIdentity(const NodePtr &node) { GE_CHECK_NOTNULL(pre_out_data_anchor); auto pre_node = pre_out_data_anchor->GetOwnerNode(); GE_CHECK_NOTNULL(pre_node); - Status ret = SUCCESS; for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { - ret = SplitIdentityAlongAnchor(out_data_anchor, peer_in_data_anchor, pre_out_data_anchor, pre_node); + Status ret = SplitIdentityAlongAnchor(out_data_anchor, peer_in_data_anchor, pre_out_data_anchor, pre_node); if (ret != SUCCESS) { GELOGE(ret, "Split identity node along anchor failed."); return ret; @@ -551,7 +550,7 @@ Status SplitIdentity(const NodePtr &node) { } // 2.isolate Identity node with no data output if (node->GetOutDataNodesSize() == 0) { - ret = GraphUtils::IsolateNode(node, {}); + Status ret = GraphUtils::IsolateNode(node, {}); if (ret != SUCCESS) { GELOGE(FAILED, "IsolateAndDelete identity node %s.", node->GetName().c_str()); return FAILED; diff --git a/src/ge/graph/partition/dynamic_shape_partition.cc b/src/ge/graph/partition/dynamic_shape_partition.cc index e5a33b37..9cc7d0f4 100644 --- a/src/ge/graph/partition/dynamic_shape_partition.cc +++ b/src/ge/graph/partition/dynamic_shape_partition.cc @@ -43,18 +43,13 @@ #define REQUIRE_SUCCESS(cond, ...) REQUIRE(((cond) == SUCCESS), __VA_ARGS__) #define REQUIRE_GRAPH_SUCCESS(cond, ...) REQUIRE(((cond) == GRAPH_SUCCESS), __VA_ARGS__) -bool IsExperimental() { - const static bool kIsExperimental = (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") != nullptr); - return kIsExperimental; -} - namespace ge { using Cluster = DynamicShapePartitioner::Cluster; using ClusterPtr = std::shared_ptr<Cluster>; Status DynamicShapePartitioner::Partition() { REQUIRE_NOT_NULL(root_graph_, "Graph is nullptr."); - if (!IsExperimental()) { + if (!GraphUtils::IsUnknownShapeGraph(root_graph_)) { GELOGD("Skip dynamic shape partition as not in experimental mode."); REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, false), "Failed set dynamic shape partitioned flag on root graph."); @@ -872,7 +867,8 @@ void Cluster::Clear() { control_outputs_.clear(); partition_node_.reset(); subgraph_.reset(); + unique_id_ = 0; } -size_t Cluster::unique_id_ = 0; +thread_local size_t Cluster::unique_id_ = 0; } // namespace ge diff --git a/src/ge/graph/partition/dynamic_shape_partition.h b/src/ge/graph/partition/dynamic_shape_partition.h index b851a084..06a94833 100644 --- a/src/ge/graph/partition/dynamic_shape_partition.h +++ b/src/ge/graph/partition/dynamic_shape_partition.h @@ -81,7 +81,7 @@ class DynamicShapePartitioner { void Clear(); private: - static size_t unique_id_; + static thread_local size_t unique_id_; size_t id_; // Each Cluster records the maximum and minimum topological order of its node size_t min_; // maximum topological order diff --git a/src/ge/graph/partition/engine_place.cc b/src/ge/graph/partition/engine_place.cc index 2d1a7f13..ba651c88 100644 --- a/src/ge/graph/partition/engine_place.cc +++ b/src/ge/graph/partition/engine_place.cc @@ -15,19 +15,25 @@ */ #include "graph/partition/engine_place.h" + #include #include #include #include +#include + #include "common/op/ge_op_utils.h" #include "graph/utils/graph_utils.h" #include "graph/utils/op_desc_utils.h" #include "init/gelib.h" #include "opskernel_manager/ops_kernel_manager.h" +#include "analyzer/analyzer.h" namespace ge { -Status EnginePlacer::Run() { - GELOGI("Engine placer starts.");
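// Shape of the reworked engine placement below, in simplified form (assumed
// helper name PickEngine; illustration only, not part of the patch):
//
//   Status EnginePlacer::Run() {
//     std::lock_guard<std::mutex> lock(check_support_cost_mutex);  // stats are shared
//     if (Check() != SUCCESS) {                  // null graph / GE not initialized
//       return FAILED;
//     }
//     bool all_supported = true;
//     for (const auto &node : compute_graph_->GetDirectNode()) {
//       std::string engine = PickEngine(node);   // LX attrs first, cost model second
//       if (engine.empty()) {                    // record the miss but keep scanning,
//         all_supported = false;                 // so every unsupported op gets logged
//         continue;
//       }
//       if (AssignEngineAndLog(node, engine) != SUCCESS) return FAILED;
//     }
//     return all_supported ? SUCCESS : FAILED;
//   }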
+namespace { +std::mutex check_support_cost_mutex; +} +Status EnginePlacer::Check() const { if (compute_graph_ == nullptr) { GELOGE(GE_GRAPH_NULL_INPUT, "compute_graph_ is null."); return FAILED; @@ -37,23 +43,48 @@ Status EnginePlacer::Run() { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Run enginePlacer failed"); return FAILED; } + return SUCCESS; +} + +Status EnginePlacer::Run() { + std::lock_guard<std::mutex> lock(check_support_cost_mutex); + + GELOGI("Engine placer starts."); + if (Check() != SUCCESS) { + return FAILED; + } + bool is_check_support_success = true; // Assign engine for each node in the graph - instance_ptr->DNNEngineManagerObj().InitPerformanceStaistic(); + ge::GELib::GetInstance()->DNNEngineManagerObj().InitPerformanceStaistic(); for (const auto &node_ptr : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(node_ptr); - GE_CHECK_NOTNULL(node_ptr->GetOpDesc()); + auto op_desc = node_ptr->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); std::string engine_name; + std::string kernel_name; // Check if this node has assigned engine - if ((!node_ptr->GetOpDesc()->GetOpKernelLibName().empty())) { - engine_name = node_ptr->GetOpDesc()->GetOpEngineName(); + bool has_engine_attr = + AttrUtils::GetStr(op_desc, ATTR_NAME_ENGINE_NAME_FOR_LX, engine_name) && !engine_name.empty(); + bool has_kernel_attr = + AttrUtils::GetStr(op_desc, ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX, kernel_name) && !kernel_name.empty(); + bool use_exist_engine_name = !op_desc->GetOpKernelLibName().empty() || (has_kernel_attr && has_engine_attr); + if (use_exist_engine_name) { + if (op_desc->GetOpEngineName().empty()) { + GELOGI("Op %s set engine_name %s kernel_name %s from attrs", op_desc->GetName().c_str(), engine_name.c_str(), + kernel_name.c_str()); + op_desc->SetOpEngineName(engine_name); + op_desc->SetOpKernelLibName(kernel_name); + } + engine_name = op_desc->GetOpEngineName(); } else { // Call placer cost model to get the "best" engine for this node - engine_name = instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node_ptr->GetOpDesc()); - // If can't get op's engine name, return failed + engine_name = ge::GELib::GetInstance()->DNNEngineManagerObj().GetDNNEngineName(node_ptr); + // If the engine name cannot be got, finish checking the remaining nodes before returning failed if (engine_name.empty()) { + is_check_support_success = false; GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Can not find engine of op type %s", node_ptr->GetOpDesc()->GetType().c_str()); - return FAILED; + continue; } } if (AssignEngineAndLog(node_ptr, engine_name) != SUCCESS) { @@ -61,11 +92,12 @@ Status EnginePlacer::Run() { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Assign engine failed"); return FAILED; } } - for (auto &it : instance_ptr->DNNEngineManagerObj().GetCheckSupportCost()) { + + for (auto &it : ge::GELib::GetInstance()->DNNEngineManagerObj().GetCheckSupportCost()) { GEEVENT("The time cost of %s::CheckSupported is [%lu] micro second.", it.first.c_str(), it.second); } GELOGI("Engine placer ends."); - return SUCCESS; + return is_check_support_success ? 
SUCCESS : FAILED; } Status EnginePlacer::AssignEngineAndLog(ge::ConstNodePtr node_ptr, const std::string &engine_name) { diff --git a/src/ge/graph/partition/engine_place.h b/src/ge/graph/partition/engine_place.h index 8a3e83a5..1672df0d 100644 --- a/src/ge/graph/partition/engine_place.h +++ b/src/ge/graph/partition/engine_place.h @@ -46,6 +46,7 @@ class EnginePlacer { private: Status AssignEngineAndLog(ConstNodePtr node_ptr, const std::string &engine_name); + Status Check() const; ComputeGraphPtr compute_graph_; NodeEngineMap node_engine_map_; diff --git a/src/ge/graph/partition/graph_partition.cc b/src/ge/graph/partition/graph_partition.cc index 15f298c0..b280074e 100644 --- a/src/ge/graph/partition/graph_partition.cc +++ b/src/ge/graph/partition/graph_partition.cc @@ -362,13 +362,18 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr } GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "peerIndex", graph_info_.num_of_pld_end_), GELOGW("SetInt peerIndex failed");) + GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_peerNodeName", new_end_node->GetName()), + GELOGW("SetStr _peerNodeName failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentOpType", src_node->GetType()), GELOGW("SetStr parentOpType failed");) + GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_parentNodeName", src_node->GetName()), + GELOGW("SetStr _parentNodeName failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentId", end_graph->GetName() + ":" + std::to_string(node_id)), GELOGW("SetStr parentId failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "anchorIndex", AnchorUtils::GetIdx(out_anchor)), GELOGW("SetInt anchorIndex failed");) GE_IF_BOOL_EXEC(!pld_op_desc->SetExtAttr("parentNode", src_node), GELOGW("SetPldExtAttr parentNode failed");) + OpDescPtr src_node_op_desc = src_node->GetOpDesc(); GE_CHECK_NOTNULL(src_node_op_desc); GE_IF_BOOL_EXEC( @@ -530,6 +535,10 @@ Status ge::GraphPartitioner::Initialize(ge::ComputeGraphPtr compute_graph) { ClusterPtr cluster = MakeShared<Cluster>(temp_index, kEngineDefaultData, temp_stream); new_cluster = cluster; } else { + if (node_engine_map->count(node) == 0) { + GELOGE(FAILED, "node[%s] does not own an engine!", node->GetName().c_str()); + return FAILED; + } ClusterPtr cluster = MakeShared<Cluster>(temp_index, node_engine_map->at(node), temp_stream); new_cluster = cluster; } @@ -577,32 +586,33 @@ Status ge::GraphPartitioner::AddPartitionsToGraphNode(vector<ge::SubGraphInfoPtr> &output_subgraphs, sub_graph->SetParentNode(compute_graph->GetParentNode()); (void)AttrUtils::SetStr(*sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName()); + auto sgi = MakeShared<SubGraphInfo>(); + if (sgi == nullptr) { + GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed."); + return FAILED; + } + // set engine name + sgi->SetEngineName(engine_name); + // set stream label + string sub_graph_stream; + if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { + sgi->SetStreamLabel(sub_graph_stream); + } + /// for now inputFlag is the same before and after partition.
It should + /// be changed according to the real partition + std::vector<bool> sub_graph_input(graph_info_.input_size_, true); + std::vector<bool> sub_graph_output(graph_info_.output_size_, true); + sgi->SetSubGraph(sub_graph); + sgi->SetOutputFlag(sub_graph_output); + sgi->SetInputFlag(sub_graph_input); + sgi->SetOutputContext(graph_info_.output_name_); + AddEndPldInformationToSubGraphInfo(sgi); + GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", engine_name.c_str(), + sub_graph->GetName().c_str(), sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str()); if (engine_name != input_subgraph_name) { // do not add Data subGraph into SubGraphInfo - auto sgi = MakeShared<SubGraphInfo>(); - if (sgi == nullptr) { - GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed."); - return FAILED; - } - // set engine name - sgi->SetEngineName(engine_name); - // set stream label - string sub_graph_stream; - if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { - sgi->SetStreamLabel(sub_graph_stream); - } - /// for now inputFlag is the same before and after partition. It should - /// be changed according to the real partition - std::vector<bool> sub_graph_input(graph_info_.input_size_, true); - std::vector<bool> sub_graph_output(graph_info_.output_size_, true); - sgi->SetSubGraph(sub_graph); - sgi->SetOutputFlag(sub_graph_output); - sgi->SetInputFlag(sub_graph_input); - sgi->SetOutputContext(graph_info_.output_name_); - AddEndPldInformationToSubGraphInfo(sgi); - GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", - engine_name.c_str(), sub_graph->GetName().c_str(), - sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str()); output_subgraphs.push_back(sgi); + } else { + graph_2_input_subgraph_[compute_graph] = sgi; } } return SUCCESS; diff --git a/src/ge/graph/partition/graph_partition.h b/src/ge/graph/partition/graph_partition.h index 26592359..a363bd9d 100644 --- a/src/ge/graph/partition/graph_partition.h +++ b/src/ge/graph/partition/graph_partition.h @@ -173,8 +173,10 @@ class GraphPartitioner { }; std::unordered_map graph_2_graph_partition_info_; Graph2SubGraphInfoList graph_2_subgraph_list_; + Graph2InputNodesSubGraphInfo graph_2_input_subgraph_; GraphPartitionInfo graph_info_; uint32_t partition_times_; // times of call partition + friend class GraphManager; }; } // namespace ge diff --git a/src/ge/graph/passes/common_subexpression_elimination_pass.cc b/src/ge/graph/passes/common_subexpression_elimination_pass.cc index 18f2e857..4415d144 100644 --- a/src/ge/graph/passes/common_subexpression_elimination_pass.cc +++ b/src/ge/graph/passes/common_subexpression_elimination_pass.cc @@ -20,6 +20,7 @@ #include #include +#include "common/base64.h" #include "graph/utils/node_utils.h" #include "ge_local_engine/engine/host_cpu_engine.h" #include "graph/passes/folding_pass.h" @@ -83,7 +84,7 @@ Status CommonSubexpressionEliminationPass::Run(ComputeGraphPtr graph) { continue; } auto key = GetCseKey(node); - GELOGD("The node %s cse key %s", node->GetName().c_str(), key.c_str()); + GELOGD("The node %s cse key %s", node->GetName().c_str(), ge::base64::EncodeToBase64(key).c_str()); auto iter = keys_to_node.find(key); if (iter == keys_to_node.end()) { keys_to_node[key] = node; diff --git a/src/ge/graph/passes/compile_nodes_pass.cc b/src/ge/graph/passes/compile_nodes_pass.cc index 330569a2..a93671c7 100644 --- a/src/ge/graph/passes/compile_nodes_pass.cc
+++ b/src/ge/graph/passes/compile_nodes_pass.cc @@ -93,7 +93,7 @@ graphStatus CompileNodesPass::GetSupportedKernel(const NodePtr &node, const std: // reset op kernel lib, find supported kernel kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { - (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc); + (void)instance->DNNEngineManagerObj().GetDNNEngineName(node); kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { GELOGE(GRAPH_FAILED, "Get node:%s, type:%s kernel lib failed.", node->GetName().c_str(), diff --git a/src/ge/graph/passes/cond_pass.cc b/src/ge/graph/passes/cond_pass.cc index 03ca9009..c3a421b1 100644 --- a/src/ge/graph/passes/cond_pass.cc +++ b/src/ge/graph/passes/cond_pass.cc @@ -227,7 +227,7 @@ Status CondPass::HandleScalarCond(const ComputeGraphPtr &graph, const OutDataAnc GELOGI("Handle cond with scalar cond-input."); GeTensorDesc tensor = out_anchor->GetOwnerNode()->GetOpDesc()->GetOutputDesc(out_anchor->GetIdx()); - std::string cast_name = out_anchor->GetOwnerNode()->GetName() + "_Cast"; + std::string cast_name = in_anchor->GetOwnerNode()->GetName() + "_Cast"; NodePtr cast_node = AddCastNode(graph, cast_name, tensor, src_type, DT_INT32); if (cast_node == nullptr) { GELOGE(FAILED, "Add Cast node failed, name:%s.", cast_name.c_str()); @@ -266,7 +266,7 @@ Status CondPass::InsertNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr out_tensor.SetShape(in_tensor.GetShape()); out_tensor.SetOriginShape(in_tensor.GetOriginShape()); - OpDescBuilder op_desc_builder(out_anchor->GetOwnerNode()->GetName() + "_" + type, type); + OpDescBuilder op_desc_builder(in_anchor->GetOwnerNode()->GetName() + "_" + type, type); OpDescPtr op_desc = op_desc_builder.AddInput("x", in_tensor).AddOutput("y", out_tensor).Build(); if (op_desc == nullptr) { GELOGE(FAILED, "Create op_desc failed."); diff --git a/src/ge/graph/passes/ctrl_edge_transfer_pass.cc b/src/ge/graph/passes/ctrl_edge_transfer_pass.cc index 9454c00d..6c426e95 100644 --- a/src/ge/graph/passes/ctrl_edge_transfer_pass.cc +++ b/src/ge/graph/passes/ctrl_edge_transfer_pass.cc @@ -20,6 +20,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/util.h" #include "graph/utils/graph_utils.h" +#include "graph/debug/ge_attr_define.h" namespace ge { /* Pass Explanation: @@ -42,6 +43,12 @@ Status CtrlEdgeTransferPass::Run(ge::ComputeGraphPtr graph) { GELOGD("CtrlEdgeTransferPass start running"); GE_CHECK_NOTNULL(graph); + bool is_dynamic_shape = false; + (void)AttrUtils::GetBool(graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape); + if (!is_dynamic_shape) { + return SUCCESS; + } + for (ge::NodePtr &n : graph->GetDirectNode()) { auto op_desc = n->GetOpDesc(); if (op_desc == nullptr) { diff --git a/src/ge/graph/passes/end_of_sequence_add_control_pass.cc b/src/ge/graph/passes/end_of_sequence_add_control_pass.cc index a3928835..90c0841c 100644 --- a/src/ge/graph/passes/end_of_sequence_add_control_pass.cc +++ b/src/ge/graph/passes/end_of_sequence_add_control_pass.cc @@ -112,7 +112,7 @@ bool EndOfSequenceAddControlPass::IsDataLikeNode(const NodePtr &node) { } string engine_name = op_desc->GetOpEngineName(); if (engine_name.empty()) { - engine_name = instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node->GetOpDesc()); + engine_name = instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node); } const map schedulers = instance_ptr->DNNEngineManagerObj().GetSchedulers(); // Only one scheduler has been supported by now diff --git 
a/src/ge/graph/passes/folding_pass.cc b/src/ge/graph/passes/folding_pass.cc index 8281db5d..b52a3226 100644 --- a/src/ge/graph/passes/folding_pass.cc +++ b/src/ge/graph/passes/folding_pass.cc @@ -142,8 +142,8 @@ Status FoldingPass::Folding(NodePtr &node, vector &outputs) { for (auto iter = in_data_nodes_set.begin(); iter != in_data_nodes_set.end(); ++iter) { auto pre_node = *iter; if (pre_node->GetOutDataNodesSize() == 0) { - if (pre_node->GetType() == DATA) { - GELOGI("No need to remove data, node name:%s.", pre_node->GetName().c_str()); + if ((pre_node->GetType() == DATA) || (pre_node->GetType() == ENTER)) { + GELOGI("No need to remove data/enter, node name:%s.", pre_node->GetName().c_str()); continue; } if (IsolateAndDeleteNode(pre_node, {}) != SUCCESS) { @@ -174,7 +174,7 @@ Status FoldingPass::DealWithInNodes(NodePtr &node) { if (in_node == nullptr) { continue; } - if ((in_node->GetType() == SWITCH) || (in_node->GetType() == REFSWITCH) || (in_node->GetType() == SWITCHN)) { + if ((in_node->GetType() == SWITCH) || (in_node->GetType() == REFSWITCH)) { GELOGI("The in_node name is %s, and node type is %s.", in_node->GetName().c_str(), in_node->GetType().c_str()); auto ret = in_node_anchor->Unlink(in_data_anchor); if (ret != SUCCESS) { diff --git a/src/ge/graph/passes/get_original_format_pass.cc b/src/ge/graph/passes/get_original_format_pass.cc index 066c46ea..8c3c84f9 100644 --- a/src/ge/graph/passes/get_original_format_pass.cc +++ b/src/ge/graph/passes/get_original_format_pass.cc @@ -25,6 +25,7 @@ #include "framework/omg/omg_inner_types.h" #include "graph/utils/attr_utils.h" #include "graph/utils/op_desc_utils.h" +#include "graph/common/local_context.h" using domi::DOMI_TENSOR_NCHW; using domi::DOMI_TENSOR_NHWC; @@ -33,8 +34,6 @@ using domi::FAILED; using domi::PARAM_INVALID; using domi::SUCCESS; -using domi::GetContext; - namespace ge { Status GetOriginalFormatPass::Run(ge::ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); @@ -62,8 +61,8 @@ Status GetOriginalFormatPass::SetOriginalFormat(const ge::ComputeGraphPtr &graph GE_CHECK_NOTNULL(desc_ptr); auto is_data = (desc_ptr->GetType() == DATA_TYPE || desc_ptr->GetType() == AIPP_DATA_TYPE); if (is_data) { - GELOGI("Data node: %s,format :%d", node_ptr->GetName().c_str(), domi::GetContext().format); - ori_format = static_cast(domi::GetContext().format); + GELOGI("Data node: %s,format :%d", node_ptr->GetName().c_str(), GetLocalOmgContext().format); + ori_format = static_cast(GetLocalOmgContext().format); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(desc_ptr, ATTR_NAME_FORMAT, ori_format), GELOGE(FAILED, "set ATTR_NAME_FORMAT failed"); return FAILED); diff --git a/src/ge/graph/passes/infershape_pass.cc b/src/ge/graph/passes/infershape_pass.cc index 7ed1ea8c..cacca584 100644 --- a/src/ge/graph/passes/infershape_pass.cc +++ b/src/ge/graph/passes/infershape_pass.cc @@ -18,12 +18,21 @@ #include "common/util/error_manager/error_manager.h" #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" +#include "analyzer/analyzer.h" +#include "framework/common/util.h" #include "graph/shape_refiner.h" namespace ge { Status InferShapePass::Run(NodePtr &node) { auto ret = ShapeRefiner::InferShapeAndType(node, !OptionExists(kOptimizeAfterSubGraph)); if (ret != GRAPH_SUCCESS) { + // select INFERSHAPE failed info + auto graph = node->GetOwnerComputeGraph(); + GE_CHECK_NOTNULL(graph); + analyzer::DataInfo analyze_info{graph->GetSessionID(), graph->GetGraphID(), analyzer::INFER_SHAPE, node, + "InferShapeFailed!"}; + 
(void)Analyzer::GetInstance()->DoAnalyze(analyze_info); + GELOGE(GE_GRAPH_INFERSHAPE_FAILED, "infershape failed. node: %s", node->GetName().c_str()); return GE_GRAPH_INFERSHAPE_FAILED; } diff --git a/src/ge/graph/passes/iterator_op_pass.cc b/src/ge/graph/passes/iterator_op_pass.cc index 1d11004d..656ed390 100644 --- a/src/ge/graph/passes/iterator_op_pass.cc +++ b/src/ge/graph/passes/iterator_op_pass.cc @@ -73,14 +73,14 @@ Status IteratorOpPass::Run(ge::ComputeGraphPtr graph) { GE_IF_BOOL_EXEC(status != SUCCESS, GELOGW("Fail to Get var_desc of NODE_NAME_FLOWCTRL_LOOP_PER_ITER failed."); continue); Status ret; - ret = SetRtContext(graph->GetSessionID(), rtContext_t(), RT_CTX_NORMAL_MODE); + ret = SetRtContext(graph->GetSessionID(), graph->GetGraphID(), rtContext_t(), RT_CTX_NORMAL_MODE); // EOS will not be considered if ret is not SUCCESS. GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGW("Set rt context RT_CTX_NORMAL_MODE failed."); continue); status = GetVariableValue(graph->GetSessionID(), ge_tensor_desc, NODE_NAME_FLOWCTRL_LOOP_PER_ITER, &loop_per_iter); - ret = SetRtContext(graph->GetSessionID(), rtContext_t(), RT_CTX_GEN_MODE); + ret = SetRtContext(graph->GetSessionID(), graph->GetGraphID(), rtContext_t(), RT_CTX_GEN_MODE); // The following process will be affected if ret is not SUCCESS. GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Set rt context RT_CTX_GEN_MODE failed."); return ret); @@ -279,11 +279,14 @@ ge::OpDescPtr IteratorOpPass::CreateMemcpyAsyncOp(const ge::NodePtr &pre_node) { return op_desc; } -Status IteratorOpPass::SetRtContext(uint64_t session_id, rtContext_t rt_context, rtCtxMode_t mode) { - GELOGI("set rt_context %d, device id:%u.", static_cast(mode), ge::GetContext().DeviceId()); +Status IteratorOpPass::SetRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t rt_context, rtCtxMode_t mode) { + GELOGI("set rt_context, session id: %lu, graph id: %u, mode %d, device id:%u.", session_id, graph_id, + static_cast(mode), ge::GetContext().DeviceId()); + GE_CHK_RT_RET(rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId())); GE_CHK_RT_RET(rtCtxSetCurrent(rt_context)); - RtContextUtil::GetInstance().AddRtContext(session_id, rt_context); + RtContextUtil::GetInstance().AddRtContext(session_id, graph_id, rt_context); + return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/passes/iterator_op_pass.h b/src/ge/graph/passes/iterator_op_pass.h index 78b951e6..77e80600 100644 --- a/src/ge/graph/passes/iterator_op_pass.h +++ b/src/ge/graph/passes/iterator_op_pass.h @@ -64,7 +64,7 @@ class IteratorOpPass : public GraphPass { /// ge::OpDescPtr CreateMemcpyAsyncOp(const ge::NodePtr &pre_node); - Status SetRtContext(uint64_t session_id, rtContext_t rt_context, rtCtxMode_t mode); + Status SetRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t rt_context, rtCtxMode_t mode); }; } // namespace ge #endif // GE_GRAPH_PASSES_ITERATOR_OP_PASS_H_ diff --git a/src/ge/graph/passes/link_gen_mask_nodes_pass.cc b/src/ge/graph/passes/link_gen_mask_nodes_pass.cc index 63ca68a2..4f122fb2 100644 --- a/src/ge/graph/passes/link_gen_mask_nodes_pass.cc +++ b/src/ge/graph/passes/link_gen_mask_nodes_pass.cc @@ -127,7 +127,7 @@ Status LinkGenMaskNodesPass::GetGenMaskGroupSize(vector &gen_mask_nodes auto ge_lib = GELib::GetInstance(); if ((ge_lib != nullptr) && ge_lib->InitFlag()) { - (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(gen_mask_op); + (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(gen_mask_node); } size_t gen_mask_group_num = kDefaultMaxParallelNum; diff --git 
a/src/ge/graph/passes/memcpy_addr_async_pass.cc b/src/ge/graph/passes/memcpy_addr_async_pass.cc index 3af40888..934f4737 100644 --- a/src/ge/graph/passes/memcpy_addr_async_pass.cc +++ b/src/ge/graph/passes/memcpy_addr_async_pass.cc @@ -19,6 +19,8 @@ #include "common/ge/ge_util.h" #include "framework/common/debug/log.h" #include "graph/utils/node_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" namespace ge { Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) { @@ -262,6 +264,11 @@ Status MemcpyAddrAsyncPass::InsertMemAddrAsyncNodeBeforeNetoutput(const ComputeG if ((in_node->GetType() != CONSTANT) && (in_node->GetType() != CONSTANTOP) && (in_node->GetType() != DATA)) { continue; } + auto desc = in_node->GetOpDesc(); + GE_CHECK_NOTNULL(desc); + if (IsEmptyTenor(desc->GetOutputDesc(peer_out_anchor->GetIdx()).GetShape())) { + continue; + } GELOGI("Need to insert MemcpyAddrAsync before netoutput on parent graph."); NodePtr memcpy_addr_async_node = CreateMemcpyAddrAsyncNode(graph, peer_out_anchor, in_node); GE_IF_BOOL_EXEC(memcpy_addr_async_node == nullptr, GELOGE(INTERNAL_ERROR, "CreateMemcpyAddrAsyncNode failed."); @@ -271,9 +278,30 @@ Status MemcpyAddrAsyncPass::InsertMemAddrAsyncNodeBeforeNetoutput(const ComputeG GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "InsertMemcpyAddrAsyncNode failed."); return ret); GELOGI("Insert mem_addr_async node %s success between %s and %s.", memcpy_addr_async_node->GetName().c_str(), in_node->GetName().c_str(), node->GetName().c_str()); - NodeUtils::UpdateIsInputConst(memcpy_addr_async_node); + // if src node is const, need to update attr and offset here because this pass process is after offset set. + if ((in_node->GetType() == CONSTANT) || (in_node->GetType() == CONSTANTOP)) { + NodeUtils::UpdateIsInputConst(memcpy_addr_async_node); + auto output_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(output_desc); + auto output_tensor_desc = output_desc->MutableInputDesc(static_cast(in_data_anchor->GetIdx())); + int64_t data_offset = 0; + (void)TensorUtils::GetDataOffset(*output_tensor_desc, data_offset); + auto input_tensor = memcpy_addr_async_node->GetOpDesc()->MutableInputDesc(0); + GELOGI("Need update const Offset %ld to op [%s]", data_offset, memcpy_addr_async_node->GetName().c_str()); + TensorUtils::SetDataOffset(*input_tensor, data_offset); + TensorUtils::SetDataOffset(*output_tensor_desc, 0); + } } NodeUtils::UpdateIsInputConst(node); return SUCCESS; } + +bool MemcpyAddrAsyncPass::IsEmptyTenor(const GeShape &shape) const { + for (const auto dim : shape.GetDims()) { + if (dim == 0) { + return true; + } + } + return false; +} } // namespace ge diff --git a/src/ge/graph/passes/memcpy_addr_async_pass.h b/src/ge/graph/passes/memcpy_addr_async_pass.h index 1f184bd5..a70fcbdd 100644 --- a/src/ge/graph/passes/memcpy_addr_async_pass.h +++ b/src/ge/graph/passes/memcpy_addr_async_pass.h @@ -30,6 +30,7 @@ class MemcpyAddrAsyncPass : public GraphPass { void FindUserData(const NodePtr &node, uint32_t &parent_index); void FindUserDataForKnown(const NodePtr &parent_node, uint32_t &parent_index); void FindUserDataForNonDynamic(const ge::NodePtr &parent_node, uint32_t &parent_index); + bool IsEmptyTenor(const GeShape &shape) const; NodePtr CreateMemcpyAddrAsyncNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor, const NodePtr &out_of_user_data); diff --git a/src/ge/graph/passes/multi_batch_clone_pass.cc b/src/ge/graph/passes/multi_batch_clone_pass.cc index 3390e783..4bf41dcb 100644 --- 
a/src/ge/graph/passes/multi_batch_clone_pass.cc +++ b/src/ge/graph/passes/multi_batch_clone_pass.cc @@ -16,8 +16,8 @@ #include "graph/passes/multi_batch_clone_pass.h" -#include "common/ge/ge_util.h" #include "common/formats/utils/formats_trans_utils.h" +#include "common/ge/ge_util.h" #include "graph/preprocess/multi_batch_options.h" #include "graph/utils/node_utils.h" #include "graph/utils/op_desc_utils.h" @@ -30,7 +30,9 @@ constexpr uint8_t kDataOutIndex = 0; constexpr uint8_t kCaseArgIndex = 1; const std::string kMultiBatchCaseNode = "ascend_mbatch_shape_case"; -const std::string kMultiBatchIndexNode = "ascend_mbatch_shape_data"; +const std::string kMultiBatchDataNode = "ascend_mbatch_shape_data"; +const std::string kMultiBatchConstNode = "ascend_mbatch_shape_const"; +const std::string kMultiBatchMapIndexNode = "ascend_mbatch_shape_mapindex"; } // namespace Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { @@ -59,6 +61,7 @@ Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { } (void)AttrUtils::SetStr(branch, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_); + graph->InValid(); // The graph will be modified, so topological sorting is needed again. graph->Swap(*branch); if (CreateRootGraph(graph) != SUCCESS) { return FAILED; } @@ -174,40 +177,130 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { /// /// @ingroup ge -/// @brief Create index node for root graph. +/// @brief Create index data node for root graph. /// @param [in] const ComputeGraphPtr &graph: Root/Case graph. +/// @param [in] NodePtr node: index data node. /// @return 0: SUCCESS / others: FAILED /// -Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) { - // Data --> MapIndex --> Case - const OpDescPtr op_desc = MakeShared<OpDesc>(kMultiBatchIndexNode, DATA); - if (op_desc == nullptr) { - GELOGE(OUT_OF_MEMORY, "Create multi-batch index node failed"); +Status MultiBatchClonePass::CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &node) { + const OpDescPtr data_desc = MakeShared<OpDesc>(kMultiBatchDataNode, DATA); + if (data_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed"); return FAILED; } - GeTensorDesc data_desc(GeShape(), FORMAT_ND, DT_INT32); - if (op_desc->AddInputDesc(data_desc) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Add output desc failed"); + GeTensorDesc data_tensor(GeShape({static_cast<int64_t>(batch_shapes_[0].size())}), FORMAT_ND, DT_INT32); + if (data_desc->AddInputDesc(data_tensor) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Add input desc failed"); return FAILED; } - if (op_desc->AddOutputDesc(data_desc) != GRAPH_SUCCESS) { + if (data_desc->AddOutputDesc(data_tensor) != GRAPH_SUCCESS) { GELOGE(FAILED, "Add output desc failed"); return FAILED; } size_t data_index = all_data_nodes_.size(); - (void)AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index); - (void)AttrUtils::SetBool(op_desc, ATTR_INSERT_BY_MBATCH, true); + (void)AttrUtils::SetInt(data_desc, ATTR_NAME_INDEX, data_index); + (void)AttrUtils::SetBool(data_desc, ATTR_INSERT_BY_MBATCH, true); - index_node_ = graph->AddNode(op_desc); - if (index_node_ == nullptr) { - GELOGE(OUT_OF_MEMORY, "Create multi-batch case node failed"); + node = graph->AddNode(data_desc); + if (node == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed"); + return OUT_OF_MEMORY; + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Create index const node for root graph. +/// @param [in] const ComputeGraphPtr &graph: Root/Case graph. +/// @param [in] NodePtr node: index const node.
+/// @return 0: SUCCESS / others: FAILED +/// +Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, NodePtr &node) { + const OpDescPtr const_desc = MakeShared<OpDesc>(kMultiBatchConstNode, CONSTANT); + if (const_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch const node failed"); + return FAILED; + } + + int64_t count = batch_shapes_.size() * batch_shapes_[0].size(); + std::unique_ptr<int32_t[]> addr(new (std::nothrow) int32_t[count]); + GE_CHECK_NOTNULL(addr); + + size_t i = 0; + for (auto &batch_shape : batch_shapes_) { + for (int64_t dim : batch_shape) { + addr[i++] = static_cast<int32_t>(dim); + } + } + + GeTensorDesc const_tensor(GeShape({count}), FORMAT_ND, DT_INT32); + GeTensor tensor(const_tensor); + tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t)); + if (!AttrUtils::SetTensor(const_desc, ATTR_NAME_WEIGHTS, tensor)) { + GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", const_desc->GetName().c_str()); + return FAILED; + } + + if (const_desc->AddOutputDesc(const_tensor) != GRAPH_SUCCESS) { + GELOGE(OUT_OF_MEMORY, "Failed to add output desc for const node %s", const_desc->GetName().c_str()); + return FAILED; + } + + node = graph->AddNode(const_desc); + if (node == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch const node failed"); + return OUT_OF_MEMORY; + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Create index node for root graph. +/// @param [in] const ComputeGraphPtr &graph: Root/Case graph. +/// @return 0: SUCCESS / others: FAILED +/// +Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) { + // Data --> MapIndex --> Case + NodePtr data_node; + GE_CHK_STATUS_RET(CreateIndexDataNode(graph, data_node), "Create data node failed"); + + NodePtr const_node; + GE_CHK_STATUS_RET(CreateIndexConstNode(graph, const_node), "Create const node failed"); + + OpDescBuilder op_builder(kMultiBatchMapIndexNode, "MapIndex"); + op_builder.AddInput("x", data_node->GetOpDesc()->GetOutputDesc(0)) + .AddInput("data_seq", const_node->GetOpDesc()->GetOutputDesc(0)) + .AddOutput("y", GeTensorDesc(GeShape(), FORMAT_ND, DT_INT32)); + + const OpDescPtr op_desc = op_builder.Build(); + if (op_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch index desc failed"); + return FAILED; + } + NodePtr index_node = graph->AddNode(op_desc); + if (index_node == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch index node failed"); return OUT_OF_MEMORY; } - if (GraphUtils::AddEdge(index_node_->GetOutDataAnchor(0), case_node_->GetInDataAnchor(0)) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Failed to add edge between Data:%s to Case:%s", index_node_->GetName().c_str(), + if (GraphUtils::AddEdge(data_node->GetOutDataAnchor(0), index_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", data_node->GetName().c_str(), + index_node->GetName().c_str()); + return FAILED; + } + if (GraphUtils::AddEdge(const_node->GetOutDataAnchor(0), index_node->GetInDataAnchor(1)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", const_node->GetName().c_str(), + index_node->GetName().c_str()); + return FAILED; + } + if (GraphUtils::AddEdge(index_node->GetOutDataAnchor(0), case_node_->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Failed to add edge between MapIndex:%s to Case:%s", index_node->GetName().c_str(), case_node_->GetName().c_str()); return FAILED; } @@ -366,6 +459,7 @@ Status MultiBatchClonePass::SetMaxShapeToData(const
NodePtr &data) { return SUCCESS; } + (void)AttrUtils::SetListInt(data->GetOpDesc(), ATTR_MBATCH_ORIGIN_INPUT_DIMS, data_shape.GetDims()); size_t max_shape_index = 0; int64_t max_size = 0; for (size_t i = 0; i < batch_shapes_.size(); ++i) { diff --git a/src/ge/graph/passes/multi_batch_clone_pass.h b/src/ge/graph/passes/multi_batch_clone_pass.h index 1da08e78..0d52b738 100644 --- a/src/ge/graph/passes/multi_batch_clone_pass.h +++ b/src/ge/graph/passes/multi_batch_clone_pass.h @@ -17,9 +17,9 @@ #ifndef GE_GRAPH_PASSES_MULTI_BATCH_CLONE_PASS_H_ #define GE_GRAPH_PASSES_MULTI_BATCH_CLONE_PASS_H_ +#include #include #include -#include #include "inc/graph_pass.h" @@ -45,6 +45,24 @@ class MultiBatchClonePass : public GraphPass { /// Status CreateRootGraph(const ComputeGraphPtr &graph); + /// + /// @ingroup ge + /// @brief Create index data node for root graph. + /// @param [in] const ComputeGraphPtr &graph: Root/Case graph. + /// @param [in] NodePtr node: index data node. + /// @return 0: SUCCESS / others: FAILED + /// + Status CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &node); + + /// + /// @ingroup ge + /// @brief Create index const node for root graph. + /// @param [in] const ComputeGraphPtr &graph: Root/Case graph. + /// @param [in] NodePtr node: index const node. + /// @return 0: SUCCESS / others: FAILED + /// + Status CreateIndexConstNode(const ComputeGraphPtr &graph, NodePtr &node); + /// /// @ingroup ge /// @brief Create index node for root graph. @@ -149,7 +167,6 @@ class MultiBatchClonePass : public GraphPass { std::map all_branch_output_; NodePtr case_node_; - NodePtr index_node_; }; } // namespace ge #endif // GE_GRAPH_PASSES_MULTI_BATCH_CLONE_PASS_H_ diff --git a/src/ge/graph/passes/net_output_pass.cc b/src/ge/graph/passes/net_output_pass.cc index f9c3835f..8ded625c 100644 --- a/src/ge/graph/passes/net_output_pass.cc +++ b/src/ge/graph/passes/net_output_pass.cc @@ -27,6 +27,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/omg/omg_inner_types.h" #include "graph/debug/ge_attr_define.h" +#include "graph/common/local_context.h" #include "graph/passes/pass_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" @@ -413,7 +414,7 @@ Status NetOutputPass::ProcessWithNetoutput(const ge::ComputeGraphPtr &graph, con Status NetOutputPass::AddCtrlEdgesBetweenLeafAndNetOutput(const ge::ComputeGraphPtr &graph, const ge::NodePtr &net_out_node) { GE_CHECK_NOTNULL(net_out_node); - if (!domi::GetContext().user_out_nodes.empty()) { + if (!GetLocalOmgContext().user_out_nodes.empty()) { GELOGI("No need to add ctrl edge to netoutput because user out nodes have been set."); return SUCCESS; } @@ -603,7 +604,7 @@ Status NetOutputPass::SetUserDefDTypeAndFormatFromAtcParams(const NodePtr &outpu GELOGI("[NETOUTPUT PASS] The graph no need netoutput node!"); return SUCCESS; } - auto output_type = domi::GetContext().output_type; + auto output_type = GetLocalOmgContext().output_type; auto op_desc = output_node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); std::vector userdef_dtypes; diff --git a/src/ge/graph/passes/permute_pass.cc b/src/ge/graph/passes/permute_pass.cc index 3c0dfd4e..e55edbb2 100644 --- a/src/ge/graph/passes/permute_pass.cc +++ b/src/ge/graph/passes/permute_pass.cc @@ -24,10 +24,10 @@ #include "inc/kernel.h" #include "inc/kernel_factory.h" #include "framework/omg/omg_inner_types.h" +#include "graph/common/local_context.h" using domi::DOMI_TENSOR_ND; using domi::DOMI_TENSOR_NHWC; -using domi::GetContext; using domi::SUCCESS; using 
domi::TENSORFLOW; @@ -39,11 +39,11 @@ Status PermutePass::Run(ComputeGraphPtr graph) { OpDescPtr op_desc_ptr = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc_ptr); GE_IF_BOOL_EXEC( - op_desc_ptr->GetType() == PERMUTE && GetContext().type == domi::TENSORFLOW, + op_desc_ptr->GetType() == PERMUTE && GetLocalOmgContext().type == domi::TENSORFLOW, /// Input format 5D means NHWC in 4D way. So if input origin format is NCHW and /// permute parameter list is [0,3,1,2], this permute can be optimised. GE_IF_BOOL_EXEC( - GetContext().format != DOMI_TENSOR_ND, + GetLocalOmgContext().format != DOMI_TENSOR_ND, // Get input origin format for (NodePtr &n : graph->GetDirectNode()) { diff --git a/src/ge/graph/passes/reshape_recovery_pass.cc b/src/ge/graph/passes/reshape_recovery_pass.cc index 07b08de9..a3de0525 100644 --- a/src/ge/graph/passes/reshape_recovery_pass.cc +++ b/src/ge/graph/passes/reshape_recovery_pass.cc @@ -20,7 +20,7 @@ namespace ge { namespace { NodePtr CreateReshape(const ConstGeTensorDescPtr &src, const ConstGeTensorDescPtr &dst, const ComputeGraphPtr &graph) { - static std::atomic reshape_num(0); + static std::atomic_long reshape_num(0); auto next_num = reshape_num.fetch_add(1); auto reshape = MakeShared<OpDesc>("Reshape_ReshapeRecoveryPass_" + std::to_string(next_num), RESHAPE); if (reshape == nullptr) { @@ -83,4 +83,4 @@ Status ReshapeRecoveryPass::Run(ComputeGraphPtr graph) { } return SUCCESS; } -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc index d51f52e1..2146a35d 100644 --- a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc +++ b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc @@ -64,9 +64,10 @@ void SameTransdataBreadthFusionPass::GetSubGraphNodesInfo() { } OpDescPtr SameTransdataBreadthFusionPass::GetCastOp(const GeTensorDesc &in_desc, const GeTensorDesc &out_desc) { - static uint32_t fusion_cast_op_count = 1; + static std::atomic_long atomic_fusion_cast_op_count(1); + auto fusion_cast_op_count = atomic_fusion_cast_op_count.fetch_add(1); std::stringstream cast_op_name; - cast_op_name << "fusion_cast_" << fusion_cast_op_count++; + cast_op_name << "fusion_cast_" << fusion_cast_op_count; auto node_op = ge::OperatorFactory::CreateOperator(cast_op_name.str(), CAST); auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); diff --git a/src/ge/graph/passes/subexpression_migration_pass.cc b/src/ge/graph/passes/subexpression_migration_pass.cc index cb09a743..c7f3845e 100644 --- a/src/ge/graph/passes/subexpression_migration_pass.cc +++ b/src/ge/graph/passes/subexpression_migration_pass.cc @@ -24,7 +24,6 @@ namespace ge { constexpr uint32_t kDataOutIndex = 0; constexpr uint32_t kCaseInputBase = 1; constexpr uint32_t kInvalidParent = 0x7fffffffU; -const std::set<std::string> kTransOpTypes = {"Cast", "TransData", "Reshape", "BnHost"}; bool IsSameTensor(ConstGeTensorDescPtr src_tensor, ConstGeTensorDescPtr dst_tensor) { if ((src_tensor == nullptr) && (dst_tensor == nullptr)) { @@ -163,7 +162,6 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap } data_nodes[parent_index] = data; - GELOGD("Subgraph %s has %zu Data nodes", subgraph->GetName().c_str(), data_nodes.size()); } } @@ -181,9 +179,9 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap /// /// @ingroup ge /// @brief Get all Data nodes for all subgraph. -/// @param [in] graph: Root compute graph.
-/// @param [in] func_desc: functional OpDesc of Case. -/// @param [out] graph_nodes: Data groups of subgraph. +/// @param [in] node: Node Directly to Data. +/// @param [out] inputs: parent index of Input. +/// @param [out] outputs: parent index of Output. /// @return true: SUCCESS / false: FAILED /// bool SubexpressionMigrationPass::GetAssociatedNodes(const NodePtr &node, map &inputs, @@ -227,9 +225,9 @@ bool SubexpressionMigrationPass::GetAssociatedNodes(const NodePtr &node, map> &graph_nodes, @@ -245,10 +243,10 @@ bool SubexpressionMigrationPass::IsParallelNodeSame(const mapsecond; const auto &out_anchor = work_data->GetOutDataAnchor(kDataOutIndex); - const auto &in_ahchors = out_anchor->GetPeerInDataAnchors(); - const auto &in_anchor = in_ahchors.at(anchor_idx); + const auto &in_anchors = out_anchor->GetPeerInDataAnchors(); + const auto &in_anchor = in_anchors.at(anchor_idx); if (in_anchor == nullptr) { - GELOGE(FAILED, "Data anchor size: %u, anchor size: %zu", anchor_idx, in_ahchors.size()); + GELOGE(FAILED, "Data anchor size: %u, anchor size: %zu", anchor_idx, in_anchors.size()); return false; } @@ -288,7 +286,8 @@ Status SubexpressionMigrationPass::GraphNodeMigration(const ComputeGraphPtr &gra for (size_t i = 0; i < in_anchors.size(); ++i) { const auto &in_anchor = in_anchors.at(i); const auto &base_node = in_anchor->GetOwnerNode(); - if (kTransOpTypes.count(base_node->GetType()) == 0) { + GELOGD("Get Data direct node: %s", base_node->GetName().c_str()); + if (!base_node->GetHostNode()) { continue; } @@ -453,7 +452,7 @@ Status SubexpressionMigrationPass::AttachParallelNode(const ComputeGraphPtr &gra GELOGE(FAILED, "Node: %s parent index %u not found", attach->GetName().c_str(), i); return FAILED; } - if (it_idx->second == kInvalidParent) { // Not connnect, Skip. + if (it_idx->second == kInvalidParent) { // Not connect, Skip. continue; } @@ -469,7 +468,7 @@ Status SubexpressionMigrationPass::AttachParallelNode(const ComputeGraphPtr &gra if (it_idx == outputs.end()) { return FAILED; } - if (it_idx->second == kInvalidParent) { // Not connnect, Skip. + if (it_idx->second == kInvalidParent) { // Not connect, Skip. continue; } diff --git a/src/ge/graph/passes/subexpression_migration_pass.h b/src/ge/graph/passes/subexpression_migration_pass.h index ac750725..fbe28cae 100644 --- a/src/ge/graph/passes/subexpression_migration_pass.h +++ b/src/ge/graph/passes/subexpression_migration_pass.h @@ -48,9 +48,9 @@ class SubexpressionMigrationPass : public GraphPass { /// /// @ingroup ge /// @brief Get all Data nodes for all subgraph. - /// @param [in] graph: Root compute graph. - /// @param [in] func_desc: functional OpDesc of Case. - /// @param [out] graph_nodes: Data groups of subgraph. + /// @param [in] node: Node Directly to Data. + /// @param [out] inputs: parent index of Input. + /// @param [out] outputs: parent index of Output. /// @return true: SUCCESS / false: FAILED /// bool GetAssociatedNodes(const NodePtr &node, map &inputs, map &outputs); @@ -59,13 +59,13 @@ class SubexpressionMigrationPass : public GraphPass { /// @ingroup ge /// @brief Get all Data nodes for all subgraph. /// @param [in] graph_nodes: Data groups of subgraph. - /// @param [in] data_base: Data Node for migration. - /// @param [in] data_idx: Data groups of subgraph. - /// @param [in] data_idx: Data groups of subgraph. + /// @param [in] base_node: Data Node for migration. + /// @param [in] node_idx: Parent index of Data node. + /// @param [in] anchor_idx: Anchor index of node. 
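The GraphNodeMigration change above replaces the closed kTransOpTypes set with Node::GetHostNode(), so any node flagged as host-resident becomes a migration candidate rather than just the four listed op types. A toy before/after of that predicate, with a stand-in struct for ge::Node and the assumption that GetHostNode() simply reports the host flag:

#include <set>
#include <string>

// Toy stand-in for ge::Node, only to contrast the two predicates.
struct Node {
  std::string type;
  bool host_node;  // what GetHostNode() is assumed to report
};

// Before: eligibility was a hard-coded list of op types.
bool EligibleBefore(const Node &node) {
  static const std::set<std::string> kTransOpTypes = {"Cast", "TransData", "Reshape", "BnHost"};
  return kTransOpTypes.count(node.type) > 0;
}

// After: any host-resident node qualifies, so new host-executed op types
// need no edit to this pass.
bool EligibleAfter(const Node &node) { return node.host_node; }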
/// @return true: Same / false: not same /// bool IsParallelNodeSame(const map> &graph_nodes, const NodePtr &base_node, - uint32_t base_idx, uint32_t anchor_idx); + uint32_t node_idx, uint32_t anchor_idx); /// /// @ingroup ge @@ -134,4 +134,4 @@ class SubexpressionMigrationPass : public GraphPass { bool migration_append_{false}; }; } // namespace ge -#endif // GE_COMMON_SUBEXPRESSION_MIGRATION_H_ \ No newline at end of file +#endif // GE_COMMON_SUBEXPRESSION_MIGRATION_H_ diff --git a/src/ge/graph/passes/switch_data_edges_bypass.cc b/src/ge/graph/passes/switch_data_edges_bypass.cc index 059ad772..d7f5d90f 100644 --- a/src/ge/graph/passes/switch_data_edges_bypass.cc +++ b/src/ge/graph/passes/switch_data_edges_bypass.cc @@ -16,6 +16,7 @@ #include "switch_data_edges_bypass.h" +#include #include "common/debug/log.h" #include "common/ge/ge_util.h" #include "common/op/ge_op_utils.h" @@ -78,7 +79,8 @@ std::pair GetInDataNodeByIndex(const NodePtr &node, i return {out_anchor->GetOwnerNode(), out_anchor}; } NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { - static int identity_counter = 0; + static std::atomic_long atomic_identity_counter(0); + auto identity_counter = atomic_identity_counter.fetch_add(1); auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { @@ -100,7 +102,7 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { } auto identity_opdesc = - MakeShared("SwitchDataEdgesByPass_Identity_" + std::to_string(identity_counter++), IDENTITY); + MakeShared("SwitchDataEdgesByPass_Identity_" + std::to_string(identity_counter), IDENTITY); if (identity_opdesc == nullptr) { GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d", node->GetName().c_str(), index); return nullptr; @@ -117,7 +119,8 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { return identity; } NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { - static int counter = 0; + static std::atomic_long atomic_counter(0); + auto counter = atomic_counter.fetch_add(1); auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { @@ -138,7 +141,7 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { return nullptr; } - auto memcpy_opdesc = MakeShared("SwitchDataEdgesByPass_Memcpy_" + std::to_string(counter++), MEMCPYASYNC); + auto memcpy_opdesc = MakeShared("SwitchDataEdgesByPass_Memcpy_" + std::to_string(counter), MEMCPYASYNC); if (memcpy_opdesc == nullptr) { GELOGE(OUT_OF_MEMORY, "Failed to add memcpy before node %s index %d", node->GetName().c_str(), index); return nullptr; @@ -218,4 +221,4 @@ Status SwitchDataEdgesBypass::BypassSwitch(const NodePtr &node) { return SUCCESS; } -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/src/ge/graph/passes/transop_breadth_fusion_pass.cc b/src/ge/graph/passes/transop_breadth_fusion_pass.cc index d8df4a22..5c754f4f 100644 --- a/src/ge/graph/passes/transop_breadth_fusion_pass.cc +++ b/src/ge/graph/passes/transop_breadth_fusion_pass.cc @@ -28,6 +28,12 @@ Status TransOpBreadthFusionPass::Run(ge::ComputeGraphPtr graph) { if (graph == nullptr) { return SUCCESS; } + // breadth fusion pass requires new topologic + Status ret_topo = graph->TopologicalSorting(); + if (ret_topo != SUCCESS) { + GELOGE(ret_topo, "TopologicalSorting the merged graph failed."); + return ret_topo; + } for (auto const &node : graph->GetDirectNode()) { GE_CHECK_NOTNULL(node); diff --git a/src/ge/graph/passes/transop_symmetry_elimination_pass.cc b/src/ge/graph/passes/transop_symmetry_elimination_pass.cc index 
9d0ac4d4..e217656c 100644 --- a/src/ge/graph/passes/transop_symmetry_elimination_pass.cc +++ b/src/ge/graph/passes/transop_symmetry_elimination_pass.cc @@ -163,9 +163,9 @@ bool TransOpSymmetryEliminationPass::JudgeTransposeDBack2Raw(const NodePtr &src_ // which we get through 3: i = perm_1[perm_2[i]] // vector src_node_perm; - AttrUtils::GetListInt(src_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, src_node_perm); + (void)AttrUtils::GetListInt(src_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, src_node_perm); vector dst_node_perm; - AttrUtils::GetListInt(dst_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, dst_node_perm); + (void)AttrUtils::GetListInt(dst_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, dst_node_perm); if (src_node_perm.size() != dst_node_perm.size()) { return false; diff --git a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc index 3080e886..61bca6b8 100644 --- a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc +++ b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include "common/ge/ge_util.h" #include "common/ge_inner_error_codes.h" #include "common/types.h" @@ -451,9 +452,11 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkNodesWhenDescNotChanged( OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDesc &format_trans_input_desc, const GeTensorDesc &format_trans_output_desc) { - static uint32_t fusion_format_transfer_op_count = 1; + static std::atomic_long atomic_fusion_format_transfer_op_count(1); + auto fusion_format_transfer_op_count = atomic_fusion_format_transfer_op_count.fetch_add(1); + std::stringstream format_transfer_op_name; - format_transfer_op_name << "fusion_format_transfer_" << fusion_format_transfer_op_count++; + format_transfer_op_name << "fusion_format_transfer_" << fusion_format_transfer_op_count; OpDescPtr format_transfer_op = MakeShared(format_transfer_op_name.str().c_str(), TRANSDATA); if (format_transfer_op == nullptr) { GELOGE(INTERNAL_ERROR, "new format transfer op failed!"); @@ -496,9 +499,11 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDes OpDescPtr TransOpWithoutReshapeFusionPass::GetCastOp(const GeTensorDesc &cast_input_desc, const GeTensorDesc &cast_output_desc) { + static std::atomic_long atomic_fusion_cast_op_count(1); + auto fusion_cast_op_count = atomic_fusion_cast_op_count.fetch_add(1); + std::stringstream cast_op_name; - static uint32_t fusion_cast_op_count = 1; - cast_op_name << "fusion_cast_op_" << fusion_cast_op_count++; + cast_op_name << "fusion_cast_op_" << fusion_cast_op_count; auto node_op = ge::OperatorFactory::CreateOperator(cast_op_name.str(), CAST); auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); diff --git a/src/ge/graph/passes/transpose_transdata_pass.cc b/src/ge/graph/passes/transpose_transdata_pass.cc index 3ac6dea5..b9bd59be 100644 --- a/src/ge/graph/passes/transpose_transdata_pass.cc +++ b/src/ge/graph/passes/transpose_transdata_pass.cc @@ -43,7 +43,7 @@ Status TransposeTransDataPass::Run(NodePtr &node) { return PARAM_INVALID; } - if (op_desc->GetType() != TRANSPOSE && op_desc->GetType() != TRANSPOSED) { + if (op_desc->GetType() != TRANSPOSED) { return SUCCESS; } if (CheckOneInAndOneOutDataAnchor(node) != SUCCESS) { diff --git a/src/ge/graph/preprocess/graph_preprocess.cc b/src/ge/graph/preprocess/graph_preprocess.cc index 4df22cfc..20216941 100644 --- 
a/src/ge/graph/preprocess/graph_preprocess.cc +++ b/src/ge/graph/preprocess/graph_preprocess.cc @@ -32,6 +32,7 @@ #include "common/formats/utils/formats_trans_utils.h" #include "framework/common/debug/ge_log.h" #include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" #include "graph/common/transop_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" @@ -1073,10 +1074,14 @@ Status GraphPrepare::CheckRefOp() { }; Status GraphPrepare::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode) { - GELOGI("set rt_context %d, device id:%u.", static_cast(mode), ge::GetContext().DeviceId()); + GE_CHECK_NOTNULL(compute_graph_); + GELOGI("set rt_context, session id: %lu, graph id: %u, mode %d, device id:%u.", session_id_, + compute_graph_->GetGraphID(), static_cast(mode), ge::GetContext().DeviceId()); + GE_CHK_RT_RET(rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId())); GE_CHK_RT_RET(rtCtxSetCurrent(rt_context)); - RtContextUtil::GetInstance().AddRtContext(session_id_, rt_context); + RtContextUtil::GetInstance().AddRtContext(session_id_, compute_graph_->GetGraphID(), rt_context); + return SUCCESS; } @@ -1109,14 +1114,14 @@ Status GraphPrepare::AdjustDataOpOutput(const NodePtr &node) { } Status GraphPrepare::UpdateInput(const std::vector &user_input) { - compute_graph_->SaveDataFormat(ge::TypeUtils::DomiFormatToFormat(domi::GetContext().format)); + compute_graph_->SaveDataFormat(ge::TypeUtils::DomiFormatToFormat(GetLocalOmgContext().format)); for (NodePtr &input_node : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(input_node); OpDescPtr op = input_node->GetOpDesc(); GE_CHECK_NOTNULL(op); if (op->GetType() == DATA) { GeAttrValue::INT index = 0; - if ((!(AttrUtils::GetInt(op, ATTR_NAME_INDEX, index))) || (domi::GetContext().is_dynamic_input)) { + if ((!(AttrUtils::GetInt(op, ATTR_NAME_INDEX, index))) || (GetLocalOmgContext().is_dynamic_input)) { GELOGW("Get index from data attr failed"); continue; } @@ -1357,7 +1362,7 @@ Status GraphPrepare::PrepareDynShape(ConstGraphPtr graph, const std::vector(options_.framework_type); + GetLocalOmgContext().type = static_cast(options_.framework_type); const Graph &const_graph = *graph; PP_RUN("Init", Init, const_graph, session_id); @@ -1520,7 +1525,7 @@ Status GraphPrepare::VerifyConstOp(const NodePtr &node) { } Status GraphPrepare::CheckUserInput(const std::vector &user_input) { - if (domi::GetContext().is_dynamic_input) { + if (GetLocalOmgContext().is_dynamic_input) { return SUCCESS; } unsigned int node_num = 0; diff --git a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc index c231ef15..eb936282 100644 --- a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc @@ -39,6 +39,7 @@ #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "proto/insert_op.pb.h" +#include "graph/common/local_context.h" #define SAVE_AIPP_ATTR(KEY, SAVE_TYPE) \ do { \ @@ -144,13 +145,13 @@ int64_t CalcMaxSize(int64_t batch_count) { } Format GetAndCheckFormat() { - switch (domi::GetContext().format) { + switch (GetLocalOmgContext().format) { case domi::DOMI_TENSOR_NCHW: return FORMAT_NCHW; case domi::DOMI_TENSOR_NHWC: return FORMAT_NHWC; default: - GELOGE(PARAM_INVALID, "Unexpected format found %d", static_cast(domi::GetContext().format)); + GELOGE(PARAM_INVALID, "Unexpected format found %d", static_cast(GetLocalOmgContext().format)); return FORMAT_ND; } } @@ -619,8 +620,9 @@ void 
AippOp::SetDtcDefaultValue() { Status AippOp::GenerateOpDesc(OpDescPtr op_desc) { GE_CHECK_NOTNULL(op_desc); - static int op_idx = 0; - op_desc->SetName(std::string("aipp_node").append(std::to_string(op_idx++))); + static std::atomic_long atomic_op_idx(0); + auto op_idx = atomic_op_idx.fetch_add(1); + op_desc->SetName(std::string("aipp_node").append(std::to_string(op_idx))); op_desc->SetType(AIPP); // Add two InputDesc, add the second after the first one is added successfully. diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index 38bc595e..c55be013 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -39,6 +39,8 @@ using domi::AippOpParams; namespace ge { namespace { const char *const kMbatchSwitchnName = "mbatch-switch-name"; +const int64_t kFormatAgnosticSwitch = 1; +const int64_t kFormatDependInputIndex = 1; } // namespace static void ConvertShape2Nhwc(Format &format, vector &shape_vec) { if ((format == FORMAT_NHWC) || (shape_vec.size() != static_cast(NORMAL_TENSOR_SIZE))) { @@ -200,9 +202,28 @@ Status InsertNewOpUtil::GetAippParams(const std::unique_ptr return SUCCESS; } + +Status InsertNewOpUtil::AddFormatAgnosticAttrToSwitchn(const NodePtr &aipp_node) { + GE_CHECK_NOTNULL(aipp_node); + auto next_nodes = aipp_node->GetOutDataNodes(); + for (const auto next_node : next_nodes) { + GE_CHECK_NOTNULL(next_node); + auto op_desc = next_node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + if (op_desc->GetType() == SWITCHN) { + GELOGI("Find switchn node [%s] after aipp [%s]", op_desc->GetName().c_str(), aipp_node->GetName().c_str()); + (void)AttrUtils::SetInt(op_desc, "_format_agnostic", kFormatAgnosticSwitch); + (void)AttrUtils::SetListInt(op_desc, "_format_agnostic_except_input", + std::vector({kFormatDependInputIndex})); + } + } + return SUCCESS; +} + Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) { std::map switchn_names_to_data; std::set updated_switchn; + NodePtr multbatch_case; for (auto &node : graph->GetDirectNode()) { if (node->GetType() == DATA) { @@ -213,6 +234,12 @@ Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) { } if (node->GetType() == AIPP) { GE_RETURN_IF_ERROR(UpdatePrevNodeByAipp(node, updated_switchn)); + // In dynamic batch/HW and dynamic AIPP scenarios, SwitchN should be set format agnostic; otherwise TransData may be + // inserted between AIPP and SwitchN, which introduces performance and memory overhead.
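AddFormatAgnosticAttrToSwitchn above only writes the two attributes; the pass that consumes them is not part of this diff. A hypothetical consumer sketch, using the same AttrUtils calls and assuming GE's attr_utils/op_desc headers, to make the intended contract concrete: SwitchN follows whatever format its producer settles on, except for input 1 (kFormatDependInputIndex, the batch-selector input), so no TransData lands between AIPP and SwitchN:

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical format-selection check honouring the attributes set above;
// FollowsInputFormat is illustrative and does not exist in GE.
bool FollowsInputFormat(const ge::OpDescPtr &op, int64_t input_index) {
  int64_t agnostic = 0;
  if (!ge::AttrUtils::GetInt(op, "_format_agnostic", agnostic) || (agnostic != 1)) {
    return false;
  }
  std::vector<int64_t> except_inputs;
  (void)ge::AttrUtils::GetListInt(op, "_format_agnostic_except_input", except_inputs);
  return std::find(except_inputs.begin(), except_inputs.end(), input_index) == except_inputs.end();
}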
+ GE_RETURN_IF_ERROR(AddFormatAgnosticAttrToSwitchn(node)); + } + if (node->GetType() == CASE && node->GetOpDesc()->HasAttr(ATTR_NAME_BATCH_NUM)) { + multbatch_case = node; } } @@ -225,8 +252,107 @@ Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) { GE_RETURN_IF_ERROR(UpdateDataBySwitchN(switchn, data_iter->second)); } + if (multbatch_case != nullptr) { + GE_RETURN_IF_ERROR(UpdateCaseNode(graph, multbatch_case)); + } + return SUCCESS; +} + +Status InsertNewOpUtil::FindMaxSizeNode(const ComputeGraphPtr &graph, const NodePtr &case_node, + map &max_sizes, + map &aipp_inputs) { + const auto &func_desc = case_node->GetOpDesc(); + for (const auto &name : func_desc->GetSubgraphInstanceNames()) { + const auto &subgraph = graph->GetSubgraph(name); + if (subgraph == nullptr) { + GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); + return GE_GRAPH_EMPTY_SUBGRAPH; + } + + std::set updated_switchn; // fix interface + for (auto &node : subgraph->GetDirectNode()) { + if (node->GetType() == AIPP) { + GE_RETURN_IF_ERROR(UpdatePrevNodeByAipp(node, updated_switchn)); + int64_t size = 0; + auto in_data_anchor = node->GetInDataAnchor(0); + GE_CHECK_NOTNULL(in_data_anchor); + auto peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_anchor); + const auto &src_node = peer_out_anchor->GetOwnerNode(); + const auto &src_op = src_node->GetOpDesc(); + GE_CHECK_NOTNULL(src_op); + + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(src_op, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGE(FAILED, "Parent index not found, name: %s", src_op->GetName().c_str()); + return FAILED; + } + + auto aipp_op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(aipp_op_desc); + auto input = aipp_op_desc->MutableInputDesc(0); + GE_CHECK_NOTNULL(input); + if (TensorUtils::GetSize(*input, size) == GRAPH_SUCCESS) { + if (max_sizes[parent_index] < size) { + max_sizes[parent_index] = size; + aipp_inputs[parent_index] = input; + } + } + } + } + } + + return SUCCESS; +} + +Status InsertNewOpUtil::UpdateCaseNode(const ComputeGraphPtr &graph, const NodePtr &case_node) { + const auto &func_desc = case_node->GetOpDesc(); + map max_sizes; + map aipp_inputs; + + GE_RETURN_IF_ERROR(FindMaxSizeNode(graph, case_node, max_sizes, aipp_inputs)); + for (const auto &item : aipp_inputs) { + uint32_t parent_index = item.first; + const GeTensorDescPtr &aipp_input = item.second; + GE_CHECK_NOTNULL(aipp_input); + + const GeTensorDescPtr &input_desc = func_desc->MutableInputDesc(parent_index); + GE_CHECK_NOTNULL(input_desc); + input_desc->SetDataType(aipp_input->GetDataType()); + input_desc->SetOriginDataType(aipp_input->GetOriginDataType()); + input_desc->SetShape(aipp_input->GetShape()); + input_desc->SetOriginShape(aipp_input->GetShape()); + input_desc->SetFormat(aipp_input->GetFormat()); + input_desc->SetOriginFormat(aipp_input->GetFormat()); + ge::TensorUtils::SetSize(*input_desc, max_sizes[item.first]); + + const auto &in_anchor = case_node->GetInDataAnchor(parent_index); + const auto &out_anchor = in_anchor->GetPeerOutAnchor(); + const auto &data = out_anchor->GetOwnerNode(); + auto data_opdesc = data->GetOpDesc(); + GE_CHECK_NOTNULL(data_opdesc); + Format old_format = data_opdesc->MutableOutputDesc(0)->GetFormat(); + + auto ret = data_opdesc->UpdateOutputDesc(0, *input_desc); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to update data %s output using case %s", data->GetName().c_str(), + case_node->GetName().c_str()); + return INTERNAL_ERROR; + } + ret = 
data_opdesc->UpdateInputDesc(0, *input_desc); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to update data %s input using case %s", data->GetName().c_str(), + case_node->GetName().c_str()); + return INTERNAL_ERROR; + } + + // Update attr _mbatch_origin_input_dims for data when it is linked to aipp + UpdateMultiBatchInputDims(data_opdesc, old_format); + } + return SUCCESS; } + Status InsertNewOpUtil::UpdatePrevNodeByAipp(NodePtr &node, std::set &switchns) { GELOGI("Start to update prev node size by aipp %s.", node->GetName().c_str()); auto aipp_op_desc = node->GetOpDesc(); @@ -389,7 +515,7 @@ Status InsertNewOpUtil::GetDataRelatedNode(NodePtr &node, std::mapGetOpDesc(); GE_CHECK_NOTNULL(dst_op); - if (dst_op->GetType() == AIPP || dst_op->GetType() == SWITCHN) { + if (dst_op->GetType() == AIPP || dst_op->GetType() == SWITCHN || dst_op->GetType() == CASE) { auto data_iter = data_next_node_map.find(node); if (data_iter == data_next_node_map.end()) { std::set next_node_set; @@ -407,7 +533,7 @@ Status InsertNewOpUtil::GetDataRelatedNode(NodePtr &node, std::map &aipps) { +Status InsertNewOpUtil::GetAllAipps(const NodePtr &data_node, const NodePtr &node, std::vector &aipps) { GE_CHECK_NOTNULL(node); OpDescPtr op = node->GetOpDesc(); GE_CHECK_NOTNULL(op); @@ -427,6 +553,32 @@ Status InsertNewOpUtil::GetAllAipps(const NodePtr &node, std::vector &a } } } + } else if (op->GetType() == CASE) { + const ComputeGraphPtr &graph = node->GetOwnerComputeGraph(); + for (const auto &name : op->GetSubgraphInstanceNames()) { + const auto &subgraph = graph->GetSubgraph(name); + if (subgraph == nullptr) { + GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); + return GE_GRAPH_EMPTY_SUBGRAPH; + } + + for (auto &subgraph_node : subgraph->GetDirectNode()) { + if (subgraph_node->GetType() == AIPP) { + auto src_node = subgraph_node->GetInDataNodes().at(0); + const auto &src_op = src_node->GetOpDesc(); + GE_CHECK_NOTNULL(src_op); + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(src_op, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGE(FAILED, "Parent index not found, name: %s", src_op->GetName().c_str()); + return FAILED; + } + auto data = node->GetInDataNodes().at(parent_index); + if (data->GetName() == data_node->GetName()) { + aipps.emplace_back(subgraph_node); + } + } + } + } } return SUCCESS; } @@ -446,14 +598,14 @@ Status InsertNewOpUtil::RecordAIPPInfoToData(const ComputeGraphPtr &graph) { auto data_node = it.first; auto data_op_desc = data_node->GetOpDesc(); GE_CHECK_NOTNULL(data_op_desc); - std::set aipps_or_switchs = it.second; - if (aipps_or_switchs.size() != 1) { + std::set aipps_or_switchs_or_case = it.second; + if (aipps_or_switchs_or_case.size() != 1) { GELOGW("The number of successors swith or aipp of data is more than 1"); continue; } std::vector aipps; - GE_RETURN_IF_ERROR(GetAllAipps(*aipps_or_switchs.begin(), aipps)); + GE_RETURN_IF_ERROR(GetAllAipps(data_node, *aipps_or_switchs_or_case.begin(), aipps)); GELOGI("RecordAIPPInfoToData: Data: name[%s], type[%s], batch size[%u]", data_node->GetName().c_str(), data_node->GetType().c_str(), aipps.size()); diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h index 93a96ca2..ae431c32 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h @@ -30,7 +30,7 @@ enum AippType { OLD_TYPE, NEW_TYPE }; class InsertNewOpUtil { public: static InsertNewOpUtil &Instance() 
{ - static InsertNewOpUtil instance; + thread_local InsertNewOpUtil instance; return instance; } @@ -64,10 +64,14 @@ class InsertNewOpUtil { void UpdateMultiBatchInputDims(const OpDescPtr &data_opdesc, Format &old_format); Status UpdatePrevNodeByAipp(NodePtr &node, std::set &switchns); Status UpdateDataBySwitchN(const NodePtr &switchn, const NodePtr &data); + Status AddFormatAgnosticAttrToSwitchn(const NodePtr &aipp_node); Status GetDataRelatedNode(NodePtr &node, std::map> &data_next_node_map); - Status GetAllAipps(const NodePtr &node, std::vector &aipps); + Status GetAllAipps(const NodePtr &data_node, const NodePtr &node, std::vector &aipps); Status GetInputOutputInfo(NodePtr &data_node, NodePtr &aipp_node, std::string &input, std::string &output); Status SetModelInputDims(NodePtr &data_node, NodePtr &aipp_node); + Status FindMaxSizeNode(const ComputeGraphPtr &graph, const NodePtr &case_node, map &max_sizes, + map &aipp_inputs); + Status UpdateCaseNode(const ComputeGraphPtr &graph, const NodePtr &case_node); }; } // namespace ge diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.cc b/src/ge/graph/preprocess/multi_batch_copy_graph.cc index 8a066b6a..6adcc63e 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.cc +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.cc @@ -30,15 +30,16 @@ #include "framework/omg/omg_inner_types.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" +#include "graph/passes/multi_batch_clone_pass.h" #include "graph/passes/prune_pass.h" +#include "graph/preprocess/multi_batch_options.h" #include "graph/utils/attr_utils.h" #include "graph/utils/graph_utils.h" #include "graph/utils/node_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" -#include "graph/preprocess/multi_batch_options.h" #include "inc/pass_manager.h" -#include "graph/passes/multi_batch_clone_pass.h" +#include "graph/common/local_context.h" using std::set; using std::string; @@ -54,6 +55,9 @@ const int kDataOutIndex = 0; const int kDataInIndex = 0; const int kMergeDataOutIndex = 0; const int kStaticOutput = -1; +const int kDynmaicDims = -1; +const int kDynamicBatchDynamicDimsNum = 1; +const int kDynamicImgSizeDynamciDimsNum = 2; inline bool IsDataLikeType(const std::string &node_type) { return (node_type == DATA) || (node_type == AIPP); } @@ -131,6 +135,8 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) { return nullptr; } + (void)AttrUtils::SetListStr(desc, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, {node->GetName()}); + auto graph = node->GetOwnerComputeGraph(); return graph->AddNode(desc); } @@ -228,6 +234,12 @@ Status MultiBatchGraphCopyer::CopyGraph() { return ret; } + ret = InsertIdentityAfterSwitchN(); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to insert identity nodes after switchn node."); + return INTERNAL_ERROR; + } + GELOGI("Begin to remove useless nodes by prune pass after copy process"); PrunePass prune_pass; ret = prune_pass.Run(graph_); @@ -255,7 +267,10 @@ Status MultiBatchGraphCopyer::Init() { Status MultiBatchGraphCopyer::LabelStatus() { for (const auto &data : origin_data_nodes_) { - origin_nodes_status_[data.get()] = kNodeInBatchBranch; + auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape(); + if (!IsAllDimsPositive(data_shape.GetDims())) { + origin_nodes_status_[data.get()] = kNodeInBatchBranch; + } } bool changed = true; // If anyone of in node is kNodeInBatchBranch, it is also kNodeInBatchBranch @@ -267,8 +282,9 @@ Status MultiBatchGraphCopyer::LabelStatus() { continue; 
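LabelStatus above now seeds kNodeInBatchBranch only from Data nodes whose shape actually carries a dynamic dimension, and UpdateDataToDynamicInfo below leans on the same check. The IsAllDimsPositive helper is called but never defined in this diff; a sketch consistent with its call sites, where a negative dim (kDynmaicDims is -1) marks a dynamic axis:

#include <algorithm>
#include <cstdint>
#include <vector>

// Assumed behaviour of IsAllDimsPositive (not defined in this diff): a shape
// is fully static only if no dim is negative, since negative dims encode
// dynamic axes.
bool IsAllDimsPositive(const std::vector<int64_t> &dims) {
  return std::none_of(dims.begin(), dims.end(), [](int64_t dim) { return dim < 0; });
}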
} for (auto &in_node : node->GetInAllNodes()) { - if (origin_nodes_status_.find(in_node.get()) != origin_nodes_status_.end() && - origin_nodes_status_[in_node.get()] == kNodeInBatchBranch) { + bool is_in_batch = origin_nodes_status_.find(in_node.get()) != origin_nodes_status_.end() && + origin_nodes_status_[in_node.get()] == kNodeInBatchBranch; + if (is_in_batch) { origin_nodes_status_[node.get()] = kNodeInBatchBranch; changed = true; break; @@ -316,6 +332,10 @@ Status MultiBatchGraphCopyer::CreateNewNodes() { switch (branch_status) { case kNodeStartNode: GELOGD("Name: %s, type: %s, status: kNodeStartNode.", node->GetName().c_str(), node->GetType().c_str()); + ret = UpdateDataToDynamicInfo(node); + if (ret != SUCCESS) { + break; + } ret = InsertSwitchNForData(node); if (ret == SUCCESS) { ret = UpdateMaxShapeToData(node); @@ -712,7 +732,57 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &data) { data_nodes_to_switchn_[data.get()] = switchn; return SUCCESS; } - +Status MultiBatchGraphCopyer::UpdateDataToDynamicInfo(const NodePtr &data) { + auto data_desc = NodeUtils::GetOutputDesc(*data, kDataOutIndex); + auto data_shape = data_desc.GetShape(); + auto data_format = data_desc.GetFormat(); + auto data_name = data->GetName(); + if (IsAllDimsPositive(data_shape.GetDims())) { + return SUCCESS; + } + if (data_to_dynamic_info_.find(data_name) == data_to_dynamic_info_.end()) { + auto data_shape_dims = data_shape.GetDims(); + auto dynamic_dims_num = std::count_if(data_shape_dims.begin(), data_shape_dims.end(), + [&data_shape_dims](int64_t dim) { return dim < 0; }); + if (dynamic_type_ == DynamicType::kDynamicBatch) { + if (dynamic_dims_num != kDynamicBatchDynamicDimsNum || data_shape.GetDim(0) != kDynmaicDims) { + GELOGE(INTERNAL_ERROR, "data: %s shape:%s do not satisfy dynamic batch rule", data->GetName().c_str(), + data_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + } else if (dynamic_type_ == DynamicType::kDynamicImageSize) { + int64_t height = 0; + int64_t width = 0; + if (data_format == FORMAT_NCHW) { + height = data_shape.GetDim(NCHW_DIM_H); + width = data_shape.GetDim(NCHW_DIM_W); + } else if (data_format == FORMAT_NHWC) { + height = data_shape.GetDim(NHWC_DIM_H); + width = data_shape.GetDim(NHWC_DIM_W); + } + if (dynamic_dims_num != kDynamicImgSizeDynamciDimsNum || height != kDynmaicDims || width != kDynmaicDims) { + GELOGE(INTERNAL_ERROR, "data: %s shape:%s do not satisfy dynamic image size rule", data->GetName().c_str(), + data_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + } else if (dynamic_type_ == DynamicType::kDynamicDims) { + GELOGE(INTERNAL_ERROR, "data: %s shape:%s must be set in --input_shape", data->GetName().c_str(), + data_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + // handle data whose dynamic dims are not listed in the atc parameter --input_shape + if (data_to_dynamic_info_.empty()) { + vector>> tmp_data_name_and_shape{std::make_pair(data_name, data_shape_dims)}; + auto ret = ParserDataToDynmaicInfo(shapes_, tmp_data_name_and_shape, data_to_dynamic_info_); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "parse data: %s dynamic gear info failed", data_name.c_str()); + return INTERNAL_ERROR; + } + } + data_to_dynamic_info_[data_name] = data_to_dynamic_info_.begin()->second; + } + return SUCCESS; +} Status MultiBatchGraphCopyer::InsertMergeForEdgeNode(const NodePtr &node) { for (auto &in_data_anchor : node->GetAllInDataAnchors()) { auto src_out_anchor = in_data_anchor->GetPeerOutAnchor(); @@ -911,33 +981,77 @@ Status
MultiBatchGraphCopyer::LinkToNodeOutBranch(const NodePtr &node) { return SUCCESS; } -Status ProcessMultiBatch(ComputeGraphPtr &graph) { - const char *multi_batch_with_case = std::getenv("MULTI_BATCH_WITH_CASE"); - if (multi_batch_with_case != nullptr) { - PassManager pass_manager; - GE_CHK_STATUS_RET(pass_manager.AddPass("MultiBatchClonePass", new (std::nothrow) MultiBatchClonePass)); - return pass_manager.Run(graph); +Status MultiBatchGraphCopyer::InsertIdentityAfterSwitchN() { + for (auto &node : graph_->GetAllNodes()) { + if (node->GetType() != SWITCHN) { + continue; + } + auto switchn_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(switchn_desc); + size_t i = 0; + for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { + for (auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { + auto identity_desc = MakeShared(node->GetName() + "_identity_" + std::to_string(i), IDENTITY); + GE_CHECK_NOTNULL(identity_desc); + + auto out_node = in_data_anchor->GetOwnerNode(); + auto op_desc = out_node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + string batch_label; + if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { + if (!AttrUtils::SetStr(identity_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { + GELOGE(FAILED, "Set attr ATTR_NAME_BATCH_LABEL failed, node:%s.", identity_desc->GetName().c_str()); + return FAILED; + } + } + + auto data_desc = switchn_desc->GetOutputDesc(i); + i++; + GE_CHK_STATUS_RET(identity_desc->AddInputDesc("x", data_desc)); + GE_CHK_STATUS_RET(identity_desc->AddOutputDesc("y", data_desc)); + + auto identity_node = graph_->AddNode(identity_desc); + GE_CHECK_NOTNULL(identity_node); + GE_CHK_STATUS_RET(out_data_anchor->LinkTo(identity_node->GetInDataAnchor(0))); + GE_CHECK_NOTNULL(identity_node->GetOutControlAnchor()); + GE_CHK_STATUS_RET(identity_node->GetOutControlAnchor()->LinkTo(out_node->GetInControlAnchor())); + } + } } + return SUCCESS; +} + +Status ProcessMultiBatch(ComputeGraphPtr &graph) { std::vector> shapes; if (!InitDynamicParams(shapes)) { GELOGD("There is no multi-batch options, no need to process multi-batch copy"); return SUCCESS; } map>> data_to_dynamic_info; - if (ParserDataToDynmaicInfo(shapes, data_to_dynamic_info) != SUCCESS) { - GELOGD("Parse each data's own dynamic info failed"); - return SUCCESS; + // parse data dynamic info from atc parameter --input_shape + if (ParserDataToDynmaicInfo(shapes, GetLocalOmgContext().user_input_dims, data_to_dynamic_info) != SUCCESS) { + GELOGE(PARAM_INVALID, "Parse each data's own dynamic info failed"); + return PARAM_INVALID; + } + DynamicType dynamic_type = DynamicType::kDynamicUnknown; + if (!GetLocalOmgContext().dynamic_batch_size.empty()) { + dynamic_type = DynamicType::kDynamicBatch; + } else if (!GetLocalOmgContext().dynamic_image_size.empty()) { + dynamic_type = DynamicType::kDynamicImageSize; + } else if (!GetLocalOmgContext().dynamic_dims.empty()) { + dynamic_type = DynamicType::kDynamicDims; } - std::vector>> user_designate_shape; user_designate_shape = domi::GetContext().user_input_dims; + user_designate_shape = GetLocalOmgContext().user_input_dims; GELOGI("Begin to copy graph for multi-batch"); multibatch::MultiBatchGraphCopyer copyer(graph); for (auto &shape : shapes) { copyer.AddShape(shape); } + copyer.SetDynamicType(dynamic_type); copyer.SetUserDesignateShape(user_designate_shape); copyer.SetDataToDynamicInfo(data_to_dynamic_info); return copyer.CopyGraph(); diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.h b/src/ge/graph/preprocess/multi_batch_copy_graph.h
index a0e61554..062b98d2 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.h +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.h @@ -37,6 +37,13 @@ enum NodeStatus { kNodeNotSupportNode, }; +enum DynamicType { + kDynamicBatch, + kDynamicImageSize, + kDynamicDims, + kDynamicUnknown, +}; + class MultiBatchGraphCopyer { public: explicit MultiBatchGraphCopyer(ComputeGraphPtr &graph) : graph_(graph) {} @@ -52,6 +59,7 @@ class MultiBatchGraphCopyer { void SetDataToDynamicInfo(const map>> &designate_shape) { data_to_dynamic_info_ = designate_shape; } + void SetDynamicType(const DynamicType dynamic_type) { dynamic_type_ = dynamic_type; } Status CopyGraph(); private: @@ -65,6 +73,7 @@ class MultiBatchGraphCopyer { NodePtr InsertShapeDataNode(); Status InsertSwitchNForData(const NodePtr &data); + Status InsertIdentityAfterSwitchN(); Status UpdateMaxShapeToData(const NodePtr &data); Status InsertMergeForEdgeNode(const NodePtr &node); @@ -93,7 +102,7 @@ class MultiBatchGraphCopyer { Status LinkNodeToMerge(const NodePtr &node, int out_index, const NodePtr &merge); Status CopyInDataEdges(const NodePtr &origin_node, int batch_num, const NodePtr &copyed_node); Status CopyInControlEdges(const NodePtr &node, int batch_num, const NodePtr &copyed_node); - + Status UpdateDataToDynamicInfo(const NodePtr &node); bool IsInBatchBranch(const NodePtr &node); NodeStatus GetNodeStatus(const NodePtr &node) { return origin_nodes_status_[node.get()]; }; Status CheckCopyResult(const std::vector &start_nodes); @@ -129,6 +138,9 @@ class MultiBatchGraphCopyer { // each data's own dynamic info map>> data_to_dynamic_info_; + + // dynamic type: dynamic batch, dynamic image size, dynamic dims. + DynamicType dynamic_type_ = DynamicType::kDynamicUnknown; }; } // namespace multibatch } // namespace ge diff --git a/src/ge/graph/preprocess/multi_batch_options.cc b/src/ge/graph/preprocess/multi_batch_options.cc index cbf8206f..005240ca 100644 --- a/src/ge/graph/preprocess/multi_batch_options.cc +++ b/src/ge/graph/preprocess/multi_batch_options.cc @@ -25,6 +25,7 @@ #include "graph/debug/ge_attr_define.h" #include "graph/utils/node_utils.h" #include "graph/ge_context.h" +#include "graph/common/local_context.h" namespace ge { namespace multibatch { @@ -59,9 +60,9 @@ void ParseDynamicSize(string dynamic_size, vector> &shapes) { /// @return true: Configed for Multi batch / false: Not configed for Multi batch.
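The DynamicType enum above is fed by ProcessMultiBatch, which picks the type from whichever ATC option is non-empty, in a fixed order. Distilled into a standalone sketch, with OmgContext reduced to the three string fields consulted there:

#include <string>

// Minimal stand-in for the context returned by GetLocalOmgContext().
struct OmgContext {
  std::string dynamic_batch_size;
  std::string dynamic_image_size;
  std::string dynamic_dims;
};

enum class DynamicType { kDynamicBatch, kDynamicImageSize, kDynamicDims, kDynamicUnknown };

// Same precedence as ProcessMultiBatch: batch first, then image size, then dims.
DynamicType SelectDynamicType(const OmgContext &ctx) {
  if (!ctx.dynamic_batch_size.empty()) return DynamicType::kDynamicBatch;
  if (!ctx.dynamic_image_size.empty()) return DynamicType::kDynamicImageSize;
  if (!ctx.dynamic_dims.empty()) return DynamicType::kDynamicDims;
  return DynamicType::kDynamicUnknown;
}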
/// bool InitDynamicParams(vector> &shapes) { - if (!domi::GetContext().dynamic_batch_size.empty()) { - GELOGD("Found dynamic batch option, value %s", domi::GetContext().dynamic_batch_size.c_str()); - std::vector dims = ge::StringUtils::Split(domi::GetContext().dynamic_batch_size, ','); + if (!GetLocalOmgContext().dynamic_batch_size.empty()) { + GELOGD("Found dynamic batch option, value %s", GetLocalOmgContext().dynamic_batch_size.c_str()); + std::vector dims = ge::StringUtils::Split(GetLocalOmgContext().dynamic_batch_size, ','); for (const auto &dim : dims) { if (dim.empty()) { continue; @@ -71,18 +72,18 @@ bool InitDynamicParams(vector> &shapes) { } } - if (!domi::GetContext().dynamic_image_size.empty()) { - GELOGD("Found dynamic image size option, value %s", domi::GetContext().dynamic_image_size.c_str()); - ParseDynamicSize(domi::GetContext().dynamic_image_size, shapes); + if (!GetLocalOmgContext().dynamic_image_size.empty()) { + GELOGD("Found dynamic image size option, value %s", GetLocalOmgContext().dynamic_image_size.c_str()); + ParseDynamicSize(GetLocalOmgContext().dynamic_image_size, shapes); for (const auto &shape : shapes) { GELOGI("Found dynamic image size, shape %s", formats::JoinToString(shape).c_str()); } } - if (!domi::GetContext().dynamic_dims.empty()) { - GELOGD("Found dynamic dims option, value %s", domi::GetContext().dynamic_dims.c_str()); - ParseDynamicSize(domi::GetContext().dynamic_dims, shapes); + if (!GetLocalOmgContext().dynamic_dims.empty()) { + GELOGD("Found dynamic dims option, value %s", GetLocalOmgContext().dynamic_dims.c_str()); + ParseDynamicSize(GetLocalOmgContext().dynamic_dims, shapes); for (const auto &shape : shapes) { GELOGI("Found dynamic dims, shape %s", formats::JoinToString(shape).c_str()); @@ -99,14 +100,11 @@ bool InitDynamicParams(vector> &shapes) { /// @return true: Configed for Multi batch / false: Not configed for Multi batch. 
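For the dynamic-batch branch of InitDynamicParams above, every comma-separated token of --dynamic_batch_size becomes a one-element gear. A standalone sketch of that parse, substituting std::getline for ge::StringUtils::Split:

#include <cstdint>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

// "1,2,8" -> {{1}, {2}, {8}}, mirroring the dynamic-batch branch above;
// empty tokens are skipped just as in InitDynamicParams.
std::vector<std::vector<int64_t>> ParseBatchGears(const std::string &option) {
  std::vector<std::vector<int64_t>> shapes;
  std::stringstream ss(option);
  std::string dim;
  while (std::getline(ss, dim, ',')) {
    if (dim.empty()) {
      continue;
    }
    shapes.push_back({std::strtol(dim.c_str(), nullptr, 10)});
  }
  return shapes;
}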
/// Status ParserDataToDynmaicInfo(const vector> &shapes, + vector>> &data_name_and_shape, map>> &data_to_dynamic_info) { - if (domi::GetContext().user_input_dims.empty()) { - GELOGD("Get user designed shape failed"); - return FAILED; - } size_t cur_data_index = 0; - for (size_t index = 0; index < domi::GetContext().user_input_dims.size(); ++index) { - auto &cur_item = domi::GetContext().user_input_dims[index]; + for (size_t index = 0; index < data_name_and_shape.size(); ++index) { + auto &cur_item = data_name_and_shape[index]; auto &data_name = cur_item.first; auto &data_shape = cur_item.second; auto dynamic_dims_num = @@ -239,13 +237,13 @@ Status CalcShape(const std::vector &batch_shape, GeShape &data_shape) { Status StampDynamicType(const OpDescPtr &op_desc) { GE_CHECK_NOTNULL(op_desc); int32_t dynamic_type = static_cast(FIXED); - if (!domi::GetContext().dynamic_batch_size.empty()) { + if (!GetLocalOmgContext().dynamic_batch_size.empty()) { dynamic_type = static_cast(DYNAMIC_BATCH); } - if (!domi::GetContext().dynamic_image_size.empty()) { + if (!GetLocalOmgContext().dynamic_image_size.empty()) { dynamic_type = static_cast(DYNAMIC_IMAGE); } - if (!domi::GetContext().dynamic_dims.empty()) { + if (!GetLocalOmgContext().dynamic_dims.empty()) { dynamic_type = static_cast(DYNAMIC_DIMS); } if (!AttrUtils::SetInt(op_desc, ATTR_DYNAMIC_TYPE, dynamic_type)) { diff --git a/src/ge/graph/preprocess/multi_batch_options.h b/src/ge/graph/preprocess/multi_batch_options.h index 650020d9..18f667ae 100644 --- a/src/ge/graph/preprocess/multi_batch_options.h +++ b/src/ge/graph/preprocess/multi_batch_options.h @@ -54,10 +54,13 @@ Status CalcShape(const std::vector &batch_shape, GeShape &data_shape); /// /// @ingroup ge /// @brief parse each data's own dynamic dims. +/// @param [in] vector> &shapes: dynamic batch gears info. +/// @param [in] vector>> data_name_and_shape: eg:{{data:{1,1,-1,2}}}. /// @param [out] map>> &data_to_dynamic_info: key:data_name. value:dynamic dims. /// @return SUCCESS / PARAM_INVALID /// Status ParserDataToDynmaicInfo(const vector> &shapes, + vector>> &data_name_and_shape, map>> &data_to_dynamic_info); /// diff --git a/src/ge/host_kernels/rsqrt_kernel.cc b/src/ge/host_kernels/rsqrt_kernel.cc index f91e3399..5184d885 100644 --- a/src/ge/host_kernels/rsqrt_kernel.cc +++ b/src/ge/host_kernels/rsqrt_kernel.cc @@ -73,14 +73,12 @@ Status RsqrtKernel::RsqrtCompute(ConstGeTensorPtr &input_tensor_ptr, GeTensorPtr auto ptr = const_cast(reinterpret_cast(input_tensor_ptr->GetData().data())); for (size_t i = 0; i < data_count; i++) { if (ZeroCheck(*(ptr + i), data_type) != SUCCESS) { - GELOGE(PARAM_INVALID, "The input data can not be 0. 
"); - return PARAM_INVALID; + GELOGW("Rsqrt: The input data can not less than or equal to zero, rsqrt folding failed."); + return NOT_CHANGED; } switch (data_type) { case DT_FLOAT16: { double val = static_cast(*(reinterpret_cast(input_tensor_ptr->GetData().data()) + i)); - GE_IF_BOOL_EXEC(val < 0, GELOGE(PARAM_INVALID, "The denominator data %lf can not less than 0.", val); - return PARAM_INVALID); double drSqrt = 1.0 / std::sqrt(val); buf[i] = drSqrt; break; diff --git a/src/ge/hybrid/common/npu_memory_allocator.cc b/src/ge/hybrid/common/npu_memory_allocator.cc index 1908725f..cbb556e2 100644 --- a/src/ge/hybrid/common/npu_memory_allocator.cc +++ b/src/ge/hybrid/common/npu_memory_allocator.cc @@ -17,16 +17,17 @@ #include "npu_memory_allocator.h" #include #include "framework/common/debug/log.h" -#include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_caching_allocator.h" +#include "graph/manager/graph_mem_allocator.h" +#include "graph/manager/rdma_pool_allocator.h" namespace ge { namespace hybrid { std::map> NpuMemoryAllocator::allocators_; std::mutex NpuMemoryAllocator::mu_; -AllocationAttr::AllocationAttr(int padding, void *try_reuse_addr) - : padding_(padding), try_reuse_addr_(try_reuse_addr) {} +AllocationAttr::AllocationAttr(int padding, void *try_reuse_addr, MemStorageType mem_type) + : padding_(padding), try_reuse_addr_(try_reuse_addr), mem_type_(mem_type) {} AllocationAttr::AllocationAttr(int padding) : AllocationAttr(padding, nullptr) {} AllocationAttr::AllocationAttr(void *try_reuse_addr) : AllocationAttr(0, try_reuse_addr) {} @@ -46,6 +47,7 @@ NpuMemoryAllocator::NpuMemoryAllocator(uint32_t device_id) : device_id_(device_i void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { void *try_reuse_addr = nullptr; size_t allocate_size = size; + MemStorageType mem_type = HBM; if (attr != nullptr) { try_reuse_addr = attr->try_reuse_addr_; if (attr->padding_ != 0) { @@ -53,10 +55,24 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { allocate_size = (size + 2 * attr->padding_ - 1) / attr->padding_ * attr->padding_; GELOGD("Padding size %ld by %d. final size = %zu.", size, attr->padding_, allocate_size); } + mem_type = attr->mem_type_; } - void *buffer = MemManager::CachingInstance(RT_MEMORY_HBM) - .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); + if (allocate_size == 0) { + GELOGE(MEMALLOC_FAILED, "Memory size is 0, device_id = %u, size = %zu", device_id_, allocate_size); + return nullptr; + } + + void *buffer = nullptr; + if (mem_type == RDMA_HBM) { + buffer = MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Malloc(allocate_size, device_id_); + } else if (mem_type == HOST_DDR) { + buffer = malloc(allocate_size); + } else { + buffer = MemManager::Instance() + .CachingInstance(RT_MEMORY_HBM) + .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); + } if (buffer == nullptr) { GELOGE(MEMALLOC_FAILED, "Failed to malloc memory, device_id = %u, size = %zu", device_id_, allocate_size); return nullptr; @@ -66,11 +82,17 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { return buffer; } -void NpuMemoryAllocator::Deallocate(void *data) { +void NpuMemoryAllocator::Deallocate(void *data, MemStorageType mem_type) { GELOGI("To deallocating buffer, addr = %p", data); if (data != nullptr) { GELOGI("Deallocating buffer successfully. 
addr = %p", data); - MemManager::CachingInstance(RT_MEMORY_HBM).Free(reinterpret_cast(data), device_id_); + if (mem_type == RDMA_HBM) { + MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Free(reinterpret_cast(data), device_id_); + } else if (mem_type == HOST_DDR) { + free(data); + } else { + MemManager::Instance().CachingInstance(RT_MEMORY_HBM).Free(reinterpret_cast(data), device_id_); + } } } diff --git a/src/ge/hybrid/common/npu_memory_allocator.h b/src/ge/hybrid/common/npu_memory_allocator.h index 7aa15578..99c01b34 100644 --- a/src/ge/hybrid/common/npu_memory_allocator.h +++ b/src/ge/hybrid/common/npu_memory_allocator.h @@ -32,7 +32,7 @@ class AllocationAttr { AllocationAttr() = default; explicit AllocationAttr(int padding); explicit AllocationAttr(void *try_reuse_addr); - AllocationAttr(int padding, void *try_reuse_addr); + AllocationAttr(int padding, void *try_reuse_addr, MemStorageType = HBM); ~AllocationAttr() = default; void SetMemType(MemStorageType memType) { mem_type_ = memType; } MemStorageType GetMemType() { return mem_type_; } @@ -56,7 +56,7 @@ class NpuMemoryAllocator { } void *Allocate(std::size_t size, AllocationAttr *attr = nullptr); - void Deallocate(void *data); + void Deallocate(void *data, MemStorageType mem_type = HBM); static constexpr int kDefaultPadding = 32; diff --git a/src/ge/hybrid/common/tensor_value.cc b/src/ge/hybrid/common/tensor_value.cc index 929d3c87..11a96d13 100644 --- a/src/ge/hybrid/common/tensor_value.cc +++ b/src/ge/hybrid/common/tensor_value.cc @@ -21,8 +21,8 @@ namespace ge { namespace hybrid { -TensorBuffer::TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size) - : allocator_(allocator), buffer_(buffer), size_(size) {} +TensorBuffer::TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size, MemStorageType mem_type) + : allocator_(allocator), buffer_(buffer), size_(size), mem_type_(mem_type) {} std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator, size_t size, AllocationAttr *attr) { void *buffer = nullptr; @@ -36,14 +36,18 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator return nullptr; } + MemStorageType mem_type = HBM; + if (attr != nullptr) { + mem_type = attr->GetMemType(); + } buffer = allocator->Allocate(size, attr); if (buffer == nullptr) { GELOGE(MEMALLOC_FAILED, "Failed to allocate memory. size = %zu", size); return nullptr; } - GELOGD("Tensor created. addr = %p, size = %zu", buffer, size); - return std::unique_ptr(new (std::nothrow) TensorBuffer(allocator, buffer, size)); + GELOGD("Tensor created. 
addr = %p, size = %zu, mem_type = %d", buffer, size, static_cast(mem_type)); + return std::unique_ptr(new (std::nothrow) TensorBuffer(allocator, buffer, size, mem_type)); } std::unique_ptr TensorBuffer::Create(void *buffer, size_t size) { @@ -53,7 +57,7 @@ std::unique_ptr TensorBuffer::Create(void *buffer, size_t size) { TensorBuffer::~TensorBuffer() { if (allocator_ != nullptr && buffer_ != nullptr) { - allocator_->Deallocate(buffer_); + allocator_->Deallocate(buffer_, mem_type_); } } diff --git a/src/ge/hybrid/common/tensor_value.h b/src/ge/hybrid/common/tensor_value.h index db8df9e5..d720e0e0 100644 --- a/src/ge/hybrid/common/tensor_value.h +++ b/src/ge/hybrid/common/tensor_value.h @@ -20,6 +20,7 @@ #include #include #include +#include "memory/memory_api.h" namespace ge { namespace hybrid { @@ -33,6 +34,8 @@ class TensorBuffer { static std::unique_ptr Create(void *buffer, size_t size); + TensorBuffer(const TensorBuffer &) = delete; + TensorBuffer &operator=(const TensorBuffer &) = delete; ~TensorBuffer(); void *GetData() { return buffer_; } @@ -40,11 +43,12 @@ class TensorBuffer { size_t GetSize() const { return size_; } private: - TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size); + TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size, MemStorageType mem_type = HBM); NpuMemoryAllocator *allocator_ = nullptr; void *buffer_ = nullptr; size_t size_ = 0; + MemStorageType mem_type_; }; class TensorValue { diff --git a/src/ge/hybrid/executor/worker/execution_engine.cc b/src/ge/hybrid/executor/worker/execution_engine.cc index b19d0849..1eb73e41 100644 --- a/src/ge/hybrid/executor/worker/execution_engine.cc +++ b/src/ge/hybrid/executor/worker/execution_engine.cc @@ -272,8 +272,9 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, TaskContext &task_ if (context.profiling_level > 0) { auto *ctx = &context; const string &name = node_state.GetName(); - task_context.RegisterCallback([ctx, name]() { RECORD_CALLBACK_EVENT(ctx, name.c_str(), "[Compute] Start"); }); + (void)task_context.RegisterCallback([ctx, name]() { RECORD_CALLBACK_EVENT(ctx, name.c_str(), "[Compute] Start"); }); } + RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ExecuteTask] Start"); GE_CHK_STATUS_RET(node_item.node_executor->ExecuteTask(*task, task_context, callback), "[%s] Failed to execute task", node_state.GetName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ExecuteTask] End"); @@ -286,8 +287,18 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const for (auto i = 0; i < task_context.NumInputs(); ++i) { const auto &input_tensor = task_context.GetInput(i); GE_CHECK_NOTNULL(input_tensor); + if (input_tensor->GetData() == nullptr) { + GELOGD("[%s] Skipping null input, index = %d", task_context.GetNodeName(), i); + continue; + } + const auto &tensor_desc = node_state.GetOpDesc()->MutableInputDesc(i); GE_CHECK_NOTNULL(tensor_desc); + if (tensor_desc->GetDataType() == DT_STRING) { + GELOGD("[%s] Skipping DT_STRING input, index = %d", task_context.GetNodeName(), i); + continue; + } + int64_t expected_size; GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, expected_size)); GELOGD("[%s] Input[%d] expects [%ld] bytes.", task_context.GetNodeName(), i, expected_size); diff --git a/src/ge/hybrid/model/hybrid_model_builder.cc b/src/ge/hybrid/model/hybrid_model_builder.cc index 97783711..45fb3a6a 100644 --- a/src/ge/hybrid/model/hybrid_model_builder.cc +++ 
b/src/ge/hybrid/model/hybrid_model_builder.cc @@ -36,20 +36,24 @@ const uint32_t kAlignment = 32; const int kBytes = 8; int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { - int64_t var_size = GetSizeByDataType(desc.GetDataType()); - if (var_size <= 0) { - GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s", - TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); - return -1; - } - auto shape = desc.GetShape(); - auto dim_num = shape.GetDimNum(); - for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) { - var_size *= shape.GetDim(dim_index); + int64_t var_size = 0; + auto data_type = desc.GetDataType(); + if (data_type == DT_STRING) { + (void)TensorUtils::GetSize(desc, var_size); + } else { + var_size = GetSizeByDataType(data_type); + if (var_size <= 0) { + GELOGW("Failed to calc var data size from data type %s", TypeUtils::DataTypeToSerialString(data_type).c_str()); + return -1; + } + auto shape = desc.GetShape(); + auto dim_num = shape.GetDimNum(); + for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) { + var_size *= shape.GetDim(dim_index); + } + // padding up to multiple of kAlignment, and add extra kAlignment + var_size = (var_size + kAlignment * 2 - 1) / kAlignment * kAlignment; } - - // padding up to multiple of kAlignment, and add extra kAlignment - var_size = (var_size + kAlignment * 2 - 1) / kAlignment * kAlignment; return var_size; } } // namespace @@ -614,11 +618,6 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_ } int64_t var_size = CalcVarSizeInBytes(*tensor_desc); - if (var_size < 0) { - GELOGE(INTERNAL_ERROR, "[%s] Invalid var size: %ld", var_name.c_str(), var_size); - return INTERNAL_ERROR; - } - tensor.reset(new (std::nothrow) TensorValue(dev_mem, var_size)); GE_CHECK_NOTNULL(tensor); return SUCCESS; diff --git a/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc b/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc index 6cf7363e..cc140b08 100644 --- a/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc +++ b/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc @@ -28,7 +28,7 @@ namespace hybrid { REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::GE_LOCAL, GeLocalNodeExecutor); const std::unordered_map> RefInputTask::out_ref_input_index_ = { - {DATA, {}}, {AIPPDATA, {}}, {RESHAPE, {}}, {EXPANDDIMS, {}}}; + {DATA, {}}, {AIPPDATA, {}}, {RESHAPE, {}}, {EXPANDDIMS, {}}, {SQUEEZE, {}}, {BROADCASTGRADIENTARGS, {}}}; const std::unordered_set DependInputShapeTask::depend_input_shape_ops_ = {SHAPE, SHAPEN, RANK, SIZE}; diff --git a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc index 57b426d8..f2cd1888 100644 --- a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc +++ b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc @@ -15,15 +15,21 @@ */ #include "hybrid/node_executor/hccl/hccl_node_executor.h" -#include "graph/manager/util/hcom_util.h" -#include "framework/common/debug/ge_log.h" -#include "framework/common/fmk_error_codes.h" #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" +#include "framework/common/debug/ge_log.h" #include "graph/attr_value.h" #include "graph/debug/ge_attr_define.h" +#include "graph/manager/util/hcom_util.h" +#include "graph/runtime_inference_context.h" #include "hccl/hcom.h" +namespace { +const size_t kVarTableDims = 2; +const size_t kVarTableRowCnt = 3; +const size_t kVarTableIdxAddr = 1; +const size_t kVarTableIdxLen = 
2; +} // namespace namespace ge { namespace hybrid { @@ -35,8 +41,8 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GELOGE(FAILED, "hccl handle is nullptr! "); return FAILED; } - auto EnqueueHcomOpertion = (hcclResult_t(*)(HcomOpertion, std::function))dlsym( - context.handle_, "EnqueueHcomOpertion"); + auto EnqueueHcomOpertion = + (HcclResult(*)(HcomOpertion, std::function))dlsym(context.handle_, "EnqueueHcomOpertion"); if (EnqueueHcomOpertion == nullptr) { GELOGE(FAILED, "Failed to invoke EnqueueHcomOpertion hcom unknown node function."); if (dlclose(context.handle_) != 0) { @@ -74,7 +80,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do return PARAM_INVALID; } op_info.dataType = iter->second; - hcclRedOp_t op_type = HCCL_REP_OP_SUM; + HcclReduceOp op_type = HCCL_REDUCE_SUM; if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HCOMREDUCESCATTER || op_desc->GetType() == HVDCALLBACKALLREDUCE) { GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), "GetHcclOperationType failed"); @@ -85,7 +91,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), "GetHcclRootId failed"); } op_info.root = root_id; - auto callback = [this, op_desc](hcclResult_t status) { + auto callback = [this, op_desc](HcclResult status) { if (status != HCCL_SUCCESS) { GELOGE(HCCL_E_INTERNAL, "node %s call EnqueueHcomOpertion failed, ret: 0x%X", op_desc->GetName().c_str(), status); } @@ -94,14 +100,14 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GELOGI("node %s hccl callback success.", op_desc->GetName().c_str()); }; int32_t count = 0; - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast(op_info.dataType), + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast(op_info.dataType), op_desc->GetType() == HCOMALLGATHER, count), "GetHcomCount failed"); GELOGI("[%s] HcclNodeTask::ExecuteAsync hccl_type %s, count %d, data_type %d, op_type %d, root %d.", context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root); op_info.count = count; - hcclResult_t hccl_ret = EnqueueHcomOpertion(op_info, callback); + HcclResult hccl_ret = EnqueueHcomOpertion(op_info, callback); if (hccl_ret != HCCL_SUCCESS) { GELOGE(HCCL_E_INTERNAL, "Call EnqueueHcomOpertion failed, ret: 0x%X", hccl_ret); return HCCL_E_INTERNAL; @@ -116,6 +122,119 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do return SUCCESS; } +Status RdmaNodeTask::UpdateArgs(TaskContext &context) { return SUCCESS; } + +Status RdmaNodeTask::Init(TaskContext &context) { + GELOGI("[%s] RdmaNodeTask::Init in.", context.GetNodeName()); + const NodeItem &node_item = context.GetNodeItem(); + GE_CHECK_NOTNULL(node_item.op_desc); + auto remote_idx = node_item.op_desc->GetInputIndexByName("remote"); + auto in_data_anchor = node_item.node->GetInDataAnchor(remote_idx); + GE_CHECK_NOTNULL(in_data_anchor); + auto out_data_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(out_data_anchor); + auto peer_node = out_data_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(peer_node->GetOpDesc()); + + remote_index_ = {peer_node->GetOpDesc()->GetId(), out_data_anchor->GetIdx()}; + if (node_item.node->GetType() == HCOMREMOTEREAD) { + local_index_ = 0; + } else { + local_index_ = node_item.op_desc->GetInputIndexByName("local"); + } + return SUCCESS; +} + +Status RdmaNodeTask::ExtractTensor(TaskContext &context, 
vector &addr_infos) { + RuntimeInferenceContext *ctx = nullptr; + GE_CHK_STATUS_RET(RuntimeInferenceContext::GetContext(std::to_string(context.GetSessionId()), &ctx)); + + ge::Tensor remote_tensor; + GE_CHK_STATUS_RET(ctx->GetTensor(remote_index_.first, remote_index_.second, remote_tensor)); + auto data = reinterpret_cast(remote_tensor.GetData()); + if (data == nullptr) { + GELOGE(FAILED, "Tensor data is nullptr."); + return FAILED; + } + auto dims = remote_tensor.GetTensorDesc().GetShape().GetDims(); + if (dims.size() != kVarTableDims || dims.back() != kVarTableRowCnt) { + GELOGE(PARAM_INVALID, "Variable table shape check failed"); + return PARAM_INVALID; + } + + size_t remote_size = 0; + for (auto idx = 0; idx < dims.front(); ++idx) { + remote_size += data[idx * kVarTableRowCnt + kVarTableIdxLen]; + } + + if (context.GetNodeItem().NodeType() == HCOMREMOTEREAD) { + auto allocator = NpuMemoryAllocator::GetAllocator(); + GE_CHECK_NOTNULL(allocator); + AllocationAttr attr; + attr.SetMemType(RDMA_HBM); + for (auto i = 0; i < context.NumOutputs(); ++i) { + GELOGD("Allocate rdma memory for node %s, size: %zu", context.GetNodeName(), remote_size); + auto tensor_buffer = TensorBuffer::Create(allocator, remote_size, &attr); + GE_CHK_STATUS_RET(context.SetOutput(i, TensorValue(std::shared_ptr(tensor_buffer.release())))); + } + } + + TensorValue *tv; + if (context.GetNodeItem().NodeType() == HCOMREMOTEREAD) { + tv = context.MutableOutput(0); + } else { + tv = context.MutableInput(local_index_); + } + GE_CHECK_NOTNULL(tv); + auto local_addr = reinterpret_cast(reinterpret_cast(tv->MutableData())); + for (auto idx = 0; idx < dims.front(); ++idx) { + addr_infos.push_back({static_cast(data[idx * kVarTableRowCnt]), + data[idx * kVarTableRowCnt + kVarTableIdxAddr], local_addr, + data[idx * kVarTableRowCnt + kVarTableIdxLen]}); + local_addr += data[idx * kVarTableRowCnt + kVarTableIdxLen]; + } + + return SUCCESS; +} + +Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { + GELOGI("[%s] RdmaNodeTask::ExecuteAsync in.", context.GetNodeName()); + auto EnqueueRemoteAccess = + (HcclResult(*)(const string &, const vector &, + std::function))dlsym(context.handle_, "EnqueueRemoteAccess"); + if (EnqueueRemoteAccess == nullptr) { + GELOGE(FAILED, "Failed to invoke EnqueueRemoteAccess hcom unknown node function."); + if (dlclose(context.handle_) != 0) { + GELOGW("Failed to close handle %s", dlerror()); + } + return FAILED; + } + vector addr_infos; + GE_CHK_STATUS_RET(ExtractTensor(context, addr_infos)); + + auto callback = [this](HcclResult status) { + if (status != HCCL_SUCCESS) { + GELOGE(HCCL_E_INTERNAL, "Call EnqueueRemoteAccess failed, ret: 0x%X", status); + } + std::lock_guard lock(this->hccl_mutex_); + this->cond_.notify_all(); + GELOGI("rdma callback success."); + }; + HcclResult hccl_ret = EnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); + if (hccl_ret != HCCL_SUCCESS) { + GELOGE(HCCL_E_INTERNAL, "Call EnqueueRemoteAccess failed, ret: 0x%X", hccl_ret); + return HCCL_E_INTERNAL; + } + + // pending until hccl finished + std::unique_lock ulock(hccl_mutex_); + cond_.wait(ulock); + + (void)context.RegisterCallback(done_callback); + GELOGI("[%s] RdmaNodeTask::ExecuteAsync success.", context.GetNodeName()); + return SUCCESS; +} + Status HcclNodeTask::UpdateArgs(TaskContext &context) { return SUCCESS; } Status HcclNodeTask::Init(TaskContext &context) { @@ -127,8 +246,10 @@ Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const 
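Note on the variable table consumed by RdmaNodeTask::ExtractTensor above: the remote tensor is a flattened 2-D table whose rows each carry kVarTableRowCnt (3) uint64 entries, [rank, remote address, length], and local destinations are packed contiguously, each row advancing the local address by that row's length. A minimal standalone sketch of that row walk, assuming exactly this layout (the AddrInfo struct is a hypothetical stand-in for the real HcomRemoteAccessAddrInfo):

#include <cstdint>
#include <vector>

struct AddrInfo {  // hypothetical stand-in for HcomRemoteAccessAddrInfo
  uint32_t remote_rank;
  uint64_t remote_addr;
  uint64_t local_addr;
  uint64_t length;
};

// Walks a [row_cnt x 3] table of {rank, remote_addr, length} rows, packing
// local destinations contiguously from local_base, as ExtractTensor does.
std::vector<AddrInfo> ParseVarTable(const uint64_t *table, int64_t row_cnt, uint64_t local_base) {
  constexpr int64_t kRowCnt = 3;  // mirrors kVarTableRowCnt
  std::vector<AddrInfo> infos;
  uint64_t local_addr = local_base;
  for (int64_t row = 0; row < row_cnt; ++row) {
    const uint64_t *entry = table + row * kRowCnt;
    infos.push_back({static_cast<uint32_t>(entry[0]), entry[1], local_addr, entry[2]});
    local_addr += entry[2];  // next output lands right after this chunk
  }
  return infos;
}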
GELOGI("[%s] HcclNodeExecutor::PrepareTask in.", context.GetNodeName()); GE_CHK_STATUS_RET(task.Init(context), "hccl node load hccl so failed."); - // allocate output mem - GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed."); + // allocate output mem, output mem or remote read will be calculated when node execute. + if (context.GetNodeItem().NodeType() != HCOMREMOTEREAD) { + GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed."); + } GE_CHK_STATUS_RET(task.UpdateArgs(context), "hccl node task update args failed."); GELOGI("[%s] HcclNodeExecutor::PrepareTask success.", context.GetNodeName()); @@ -138,8 +259,11 @@ Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const Status HcclNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr &task) const { GELOGI("[%s] HcclNodeExecutor::LoadTask in.", node->GetName().c_str()); GE_CHECK_NOTNULL(node); - - task = MakeShared(); + if (node->GetType() == HCOMREMOTEREAD || node->GetType() == HCOMREMOTEWRITE) { + task = MakeShared(); + } else { + task = MakeShared(); + } GE_CHECK_NOTNULL(task); GELOGI("[%s] HcclNodeExecutor::LoadTask success.", node->GetName().c_str()); return SUCCESS; @@ -169,12 +293,12 @@ Status HcclNodeExecutor::Initialize() { GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror()); return FAILED; } - auto HcomExcutorInitialize = (hcclResult_t(*)())dlsym(handle_, "HcomExcutorInitialize"); + auto HcomExcutorInitialize = (HcclResult(*)())dlsym(handle_, "HcomExcutorInitialize"); if (HcomExcutorInitialize == nullptr) { GELOGE(FAILED, "Failed to invoke HcomExcutorInitialize hcom unknown node function."); return FAILED; } - hcclResult_t hccl_ret = HcomExcutorInitialize(); + HcclResult hccl_ret = HcomExcutorInitialize(); if (hccl_ret == HCCL_E_PTR) { GELOGI("Hccl comm is null, hcom executor initialize is not required."); } else if (hccl_ret == HCCL_SUCCESS) { @@ -187,12 +311,12 @@ Status HcclNodeExecutor::Initialize() { } Status HcclNodeExecutor::Finalize() { - auto HcomExcutorFinalize = (hcclResult_t(*)())dlsym(handle_, "HcomExcutorFinalize"); + auto HcomExcutorFinalize = (HcclResult(*)())dlsym(handle_, "HcomExcutorFinalize"); if (HcomExcutorFinalize == nullptr) { GELOGE(FAILED, "Failed to invoke HcomExcutorFinalize hcom unknown node function."); return FAILED; } - hcclResult_t hccl_ret = HcomExcutorFinalize(); + HcclResult hccl_ret = HcomExcutorFinalize(); if (hccl_ret != HCCL_SUCCESS) { GELOGE(FAILED, "Call HcomExcutorFinalize failed, ret: 0x%X", hccl_ret); return FAILED; diff --git a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h index 8791c4e3..ddf6eb3a 100644 --- a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h +++ b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h @@ -16,9 +16,9 @@ #ifndef HYBRID_HCCL_NODE_EXECUTOR_H_ #define HYBRID_HCCL_NODE_EXECUTOR_H_ -#include "hybrid/node_executor/node_executor.h" -#include "hybrid/model/hybrid_model.h" #include "graph/op_desc.h" +#include "hybrid/model/hybrid_model.h" +#include "hybrid/node_executor/node_executor.h" namespace ge { namespace hybrid { @@ -41,6 +41,24 @@ class HcclNodeTask : public NodeTask { std::condition_variable cond_; }; +class RdmaNodeTask : public NodeTask { + public: + RdmaNodeTask() = default; + + ~RdmaNodeTask() override {} + + Status UpdateArgs(TaskContext &context) override; + Status ExecuteAsync(TaskContext &context, std::function done_callback) override; + Status 
Init(TaskContext &context) override; + + private: + Status ExtractTensor(TaskContext &context, vector &addr_infos); + std::pair remote_index_; + int32_t local_index_ = 0; + std::mutex hccl_mutex_; + std::condition_variable cond_; +}; + class HcclNodeExecutor : public NodeExecutor { public: Status LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr &task) const; diff --git a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc index fbad1fcd..49ff722f 100644 --- a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc +++ b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc @@ -18,7 +18,6 @@ #include "hybrid/node_executor/host_cpu/kernel_factory.h" #include "graph/passes/folding_pass.h" #include "hybrid/model/hybrid_model.h" -#include "inc/kernel_factory.h" #include "ge_local_engine/engine/host_cpu_engine.h" namespace ge { @@ -32,16 +31,8 @@ Status HostNodeTaskBase::UpdateArgs(TaskContext &) { Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGD("[%s] Start execute.", context.GetNodeName()); - - std::vector inputs; - std::vector outputs; - GE_CHK_STATUS_RET(ProcessInputs(context, inputs), "node:%s type:%s, process inputs failed.", node_->GetName().c_str(), - node_->GetType().c_str()); - GE_CHK_STATUS_RET(Execute(context, inputs, outputs), "node:%s type:%s, task execute failed.", - node_->GetName().c_str(), node_->GetType().c_str()); - GE_CHK_STATUS_RET(ProcessOutputs(context, outputs), "node:%s type:%s, process outputs failed.", - node_->GetName().c_str(), node_->GetType().c_str()); - + GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s, task execute failed.", node_->GetName().c_str(), + node_->GetType().c_str()) if (done_callback) { GELOGD("[%s] Start invoke callback.", context.GetNodeName()); done_callback(); @@ -50,98 +41,48 @@ Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function &inputs) { - int32_t input_num = context.NumInputs(); - for (auto i = 0; i < input_num; ++i) { - auto tensor_value = context.GetInput(i); - GE_CHECK_NOTNULL(tensor_value); - GeTensorPtr input_ptr = - MakeShared(node_->GetOpDesc()->GetInputDesc(i), - reinterpret_cast(tensor_value->GetData()), tensor_value->GetSize()); - if (input_ptr == nullptr) { - GELOGE(MEMALLOC_FAILED, "Make shared failed"); - return MEMALLOC_FAILED; - } - inputs.push_back(input_ptr); - } - return SUCCESS; -} +Status CpuKernelNodeTask::Execute(TaskContext &context) { + const auto &op_desc = node_->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); -Status HostNodeTaskBase::ProcessOutputs(TaskContext &context, std::vector &outputs) { - int32_t output_num = context.NumOutputs(); - if (static_cast(output_num) != outputs.size()) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d output, but kernel compute only has %zu output.", - node_->GetName().c_str(), node_->GetType().c_str(), output_num, outputs.size()); - return INTERNAL_ERROR; + std::vector inputs; + for (int32_t i = 0; i < context.NumInputs(); ++i) { + const auto &input_desc = op_desc->GetInputDesc(i); + auto in_tensor = MakeShared(input_desc, reinterpret_cast(context.GetInput(i)->GetData()), + context.GetInput(i)->GetSize()); + GE_CHECK_NOTNULL(in_tensor); + in_tensor->MutableTensorDesc().SetDataType(input_desc.GetDataType()); + in_tensor->MutableTensorDesc().SetShape(input_desc.GetShape()); + inputs.emplace_back(in_tensor); + GELOGI("node:%s allocate input %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, + 
reinterpret_cast(in_tensor->GetData().data()), in_tensor->GetData().size()); } - // alloc output - GE_CHK_STATUS_RET_NOLOG(context.AllocateOutputs()); - - // copy data to output - for (auto i = 0; i < output_num; ++i) { - GeTensorPtr &tensor = outputs[i]; - GE_CHECK_NOTNULL(tensor); - auto tensor_data = tensor->GetData(); - auto tensor_value = context.MutableOutput(i); - GE_CHECK_NOTNULL(tensor_value); - if (tensor_data.GetSize() > tensor_value->GetSize()) { - GELOGE(INTERNAL_ERROR, "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.", - node_->GetName().c_str(), node_->GetType().c_str(), i, tensor_data.GetSize(), tensor_value->GetSize()); - return INTERNAL_ERROR; - } - - GELOGI("node:%s type:%s [%d]th output data=%p, out size=%zu, data size=%zu.", node_->GetName().c_str(), - node_->GetType().c_str(), i, tensor_value->GetData(), tensor_value->GetSize(), tensor_data.GetSize()); - if (tensor_data.GetSize() > 0) { - GE_CHK_RT_RET(rtMemcpy(tensor_value->MutableData(), tensor_value->GetSize(), tensor_data.GetData(), - tensor_data.GetSize(), RT_MEMCPY_HOST_TO_HOST)); + std::vector outputs; + for (int32_t i = 0; i < context.NumOutputs(); ++i) { + const auto &output_desc = op_desc->GetOutputDesc(i); + AllocationAttr attr; + attr.SetMemType(HOST_DDR); + if (context.AllocateOutput(i, output_desc, nullptr, &attr) != SUCCESS) { + GELOGE(FAILED, "node:%s Failed to allocate output %d", context.GetNodeName(), i); + return FAILED; } - GELOGI("node:%s type:%s [%d]th set data success, data size=%zu.", node_->GetName().c_str(), - node_->GetType().c_str(), i, tensor_data.GetSize()); - } - - return SUCCESS; -} - -Status CpuKernelNodeTask::Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) { - std::vector const_inputs; - for (const auto &input : inputs) { - const_inputs.emplace_back(input); - } - return FoldingPass::RunOpKernel(node_, const_inputs, outputs); -} - -Status HostKernelNodeTask::Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) { - auto kernel = KernelFactory::Instance().Create(node_->GetType()); - if (kernel == nullptr) { - GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.", node_->GetName().c_str(), - node_->GetType().c_str()); - return UNSUPPORTED; - } - - std::vector const_inputs; - for (const auto &input : inputs) { - const_inputs.emplace_back(input); - } - Status compute_ret = kernel->Compute(node_->GetOpDesc(), const_inputs, outputs); - if (compute_ret != SUCCESS) { - GELOGE(compute_ret, "node %s type %s compute failed or not imply.", node_->GetName().c_str(), - node_->GetType().c_str()); - return compute_ret; + auto tensor = context.GetOutput(i); + GE_CHECK_NOTNULL(tensor); + auto out_tensor = + MakeShared(output_desc, reinterpret_cast(tensor->GetData()), tensor->GetSize()); + GE_CHECK_NOTNULL(out_tensor); + out_tensor->MutableTensorDesc().SetDataType(output_desc.GetDataType()); + out_tensor->MutableTensorDesc().SetShape(output_desc.GetShape()); + outputs.emplace_back(out_tensor); + GELOGI("node:%s allocate output %d, addr=%p, size=%zu", op_desc->GetName().c_str(), i, + reinterpret_cast(out_tensor->GetData().data()), out_tensor->GetData().size()); } - return SUCCESS; + return HostCpuEngine::GetInstance().Run(node_, inputs, outputs); } -Status HostCpuNodeTask::ProcessInputs(TaskContext &context, std::vector &inputs) { return SUCCESS; } - -Status HostCpuNodeTask::ProcessOutputs(TaskContext &context, std::vector &outputs) { return SUCCESS; } - -Status HostCpuNodeTask::Execute(TaskContext 
&context, const std::vector &inputs, - std::vector &outputs) { +Status HostCpuNodeTask::Execute(TaskContext &context) { RunContext run_context; auto host_kernel = hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node_); if (host_kernel == nullptr) { @@ -175,10 +116,6 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &no GELOGI("create CpuKernelNodeTask for node %s, type %s.", name.c_str(), type.c_str()); task = MakeShared(node); GE_CHECK_NOTNULL(task); - } else if (KernelFactory::Instance().Create(type) != nullptr) { - GELOGI("create HostKernelNodeTask for node %s, type %s.", name.c_str(), type.c_str()); - task = MakeShared(node); - GE_CHECK_NOTNULL(task); } else if (hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node) != nullptr) { GELOGI("create HostCpuNodeTask for node %s, type %s.", name.c_str(), type.c_str()); task = MakeShared(node); diff --git a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h index b27e558b..036a0c60 100644 --- a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h +++ b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h @@ -17,58 +17,41 @@ #ifndef GE_HYBRID_KERNEL_HOST_CPU_NODE_EXECUTOR_H_ #define GE_HYBRID_KERNEL_HOST_CPU_NODE_EXECUTOR_H_ -#include "inc/kernel.h" #include "hybrid/node_executor/node_executor.h" +#include "inc/kernel.h" namespace ge { namespace hybrid { class HostNodeTaskBase : public NodeTask { public: explicit HostNodeTaskBase(const NodePtr &node) : node_(node) {} - ~HostNodeTaskBase() = default; - virtual Status UpdateArgs(TaskContext &context); - virtual Status ExecuteAsync(TaskContext &context, std::function done_callback); + ~HostNodeTaskBase() override = default; + Status UpdateArgs(TaskContext &context) override; + Status ExecuteAsync(TaskContext &context, std::function done_callback) override; protected: NodePtr node_; private: - virtual Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) = 0; - virtual Status ProcessInputs(TaskContext &context, std::vector &inputs); - virtual Status ProcessOutputs(TaskContext &context, std::vector &outputs); + virtual Status Execute(TaskContext &context) = 0; }; class CpuKernelNodeTask : public HostNodeTaskBase { public: explicit CpuKernelNodeTask(const NodePtr &node) : HostNodeTaskBase(node) {} - ~CpuKernelNodeTask() = default; - - private: - Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) override; -}; - -class HostKernelNodeTask : public HostNodeTaskBase { - public: - explicit HostKernelNodeTask(const NodePtr &node) : HostNodeTaskBase(node) {} - ~HostKernelNodeTask() = default; + ~CpuKernelNodeTask() override = default; private: - Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) override; + Status Execute(TaskContext &context) override; }; class HostCpuNodeTask : public HostNodeTaskBase { public: explicit HostCpuNodeTask(const NodePtr &node) : HostNodeTaskBase(node) {} - ~HostCpuNodeTask() = default; + ~HostCpuNodeTask() override = default; private: - Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) override; - Status ProcessInputs(TaskContext &context, std::vector &inputs) override; - Status ProcessOutputs(TaskContext &context, std::vector &outputs) override; + Status Execute(TaskContext &context) override; }; class HostCpuNodeExecutor : public NodeExecutor { diff --git 
a/src/ge/hybrid/node_executor/task_context.cc b/src/ge/hybrid/node_executor/task_context.cc index dd833fe1..e49a2b43 100644 --- a/src/ge/hybrid/node_executor/task_context.cc +++ b/src/ge/hybrid/node_executor/task_context.cc @@ -18,6 +18,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/debug/log.h" #include "graph/utils/tensor_utils.h" +#include "graph/debug/ge_attr_define.h" #include "hybrid/executor/hybrid_execution_context.h" #include "hybrid/executor/subgraph_executor.h" @@ -225,7 +226,15 @@ Status TaskContext::AllocateOutputs(AllocationAttr *attr) { for (int i = 0; i < node_item_->num_outputs; ++i) { const auto &output_desc = node_item_->op_desc->MutableOutputDesc(i); GE_CHECK_NOTNULL(output_desc); - GE_CHK_STATUS_RET_NOLOG(AllocateOutput(i, *output_desc, nullptr, attr)); + uint32_t mem_type = 0; + (void)AttrUtils::GetInt(node_item_->op_desc, ATTR_OUTPUT_MEMORY_TYPE, mem_type); + if (attr == nullptr) { + auto tmp_attr = AllocationAttr(0, nullptr, static_cast(mem_type)); + GE_CHK_STATUS_RET_NOLOG(AllocateOutput(i, *output_desc, nullptr, &tmp_attr)); + } else { + attr->SetMemType(static_cast(mem_type)); + GE_CHK_STATUS_RET_NOLOG(AllocateOutput(i, *output_desc, nullptr, attr)); + } } return SUCCESS; diff --git a/src/ge/init/gelib.cc b/src/ge/init/gelib.cc index 0532321e..d5e745eb 100644 --- a/src/ge/init/gelib.cc +++ b/src/ge/init/gelib.cc @@ -29,15 +29,17 @@ #include "common/profiling/profiling_manager.h" #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" #include "framework/common/util.h" +#include "analyzer/analyzer.h" #include "ge/ge_api_types.h" #include "ge_local_engine/engine/host_cpu_engine.h" +#include "graph/common/ge_call_wrapper.h" #include "graph/ge_context.h" #include "graph/ge_global_options.h" #include "graph/load/new_model_manager/model_manager.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" -#include "graph/common/ge_call_wrapper.h" #include "omm/csa_interact.h" #include "runtime/kernel.h" @@ -142,8 +144,15 @@ Status GELib::InnerInitialize(const map &options) { return initHostCpuEngineStatus; } + GELOGI("Start to init Analyzer!"); + Status init_analyzer_status = ge::Analyzer::GetInstance()->Initialize(); + if (init_analyzer_status != SUCCESS) { + GELOGE(init_analyzer_status, "Failed to initialize Analyzer"); + RollbackInit(); + return init_analyzer_status; + } + init_flag_ = true; - GELOGI("GeLib initial success."); return SUCCESS; } @@ -159,6 +168,11 @@ Status GELib::SystemInitialize(const map &options) { // In train and infer, profiling is always needed. 
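Besides profiling, the SystemInitialize hunk below also threads the new exception-dump switch through ModelManager::EnableExceptionDump. A minimal client-side sketch of enabling it at initialization; the GEInitialize/Session flow is illustrative, and the "1" value convention is an assumption in line with the other ge.exec flags:

#include <map>
#include <string>
#include "ge/ge_api.h"

int EnableExceptionDumpExample() {
  std::map<std::string, std::string> options;
  // key is OPTION_EXEC_ENABLE_EXCEPTION_DUMP from ge_api_types.h
  options["ge.exec.enable_exception_dump"] = "1";
  if (ge::GEInitialize(options) != ge::SUCCESS) {
    return -1;
  }
  ge::Session session(options);  // SystemInitialize -> ModelManager::EnableExceptionDump(options)
  return 0;
}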
InitOptions(options); InitProfiling(this->options_); + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS, + GELOGE(FAILED, "Enable exception dump failed"); + return FAILED); // 1.`is_train_mode_` means case: train // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer // these two case need call `InitSystemWithOptions->rtGetDeviceIndexByPhyId` @@ -278,20 +292,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt CsaInteract::GetInstance().Init(options.device_id, GetContext().TraceId()); Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_RUNNING, JOBSUBSTATE_ENV_INIT); GE_LOGE_IF(ret != SUCCESS, "write job state failed, ret:%u", ret); - options.physical_device_id = options.device_id; - - // The physical ID is transferred to the logical ID. FMK receives physical ID and needs to be converted - uint32_t dev_logic_index = 0; - rtError_t rt_ret = rtGetDeviceIndexByPhyId(static_cast(options.device_id), &dev_logic_index); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, - GELOGE(rt_ret, "rtGetDeviceIndexByPhyId transform index by phyId %d failed", options.device_id); - CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_ENV_INIT); - return FAILED); - options.device_id = static_cast(dev_logic_index); - GELOGI("rtGetDeviceIndexByPhyId physical device id:%d,logical device id:%u", options.device_id, dev_logic_index); - - GetContext().SetCtxDeviceId(dev_logic_index); + // set device id + GELOGI("set logical device id:%u", options.device_id); + GetContext().SetCtxDeviceId(static_cast(options.device_id)); GE_CHK_RT_RET(rtSetDevice(options.device_id)); // In the scenario that the automatic add fusion is set, but there is no cleanaddr operator, @@ -389,6 +393,9 @@ Status GELib::Finalize() { GELOGI("HostCpuEngine finalization."); HostCpuEngine::GetInstance().Finalize(); + GELOGI("Analyzer finalization"); + Analyzer::GetInstance()->Finalize(); + // Shut down profiling ShutDownProfiling(); diff --git a/src/ge/ir_build/atc_ir_common.cc b/src/ge/ir_build/atc_ir_common.cc index dbfe688b..82ed40bd 100644 --- a/src/ge/ir_build/atc_ir_common.cc +++ b/src/ge/ir_build/atc_ir_common.cc @@ -476,13 +476,6 @@ Status CheckDisableReuseMemoryParamValid(const std::string disable_reuse_memory) GELOGE(ge::PARAM_INVALID, "Input parameter[--disable_reuse_memory]'s value must be 1 or 0."); return ge::PARAM_INVALID; } - - const char *env_ge_dump = std::getenv("DUMP_OP"); - const int decimal = 10; - int ge_dump_flag = (env_ge_dump != nullptr) ? 
std::strtol(env_ge_dump, nullptr, decimal) : 0; - if (ge_dump_flag && (disable_reuse_memory == "0")) { - GELOGW("Will dump uncorrect op data with param disable_reuse_memory=0"); - } return ge::SUCCESS; } diff --git a/src/ge/ir_build/ge_ir_build.cc b/src/ge/ir_build/ge_ir_build.cc index 0a60fa11..90f7a8ca 100644 --- a/src/ge/ir_build/ge_ir_build.cc +++ b/src/ge/ir_build/ge_ir_build.cc @@ -31,11 +31,11 @@ #include "graph/compute_graph.h" #include "graph/ge_tensor.h" #include "graph/utils/type_utils.h" +#include "graph/ge_global_options.h" #include "init/gelib.h" #include "ir_build/atc_ir_common.h" #include "model/ge_model.h" -using domi::GetContext; using std::string; using namespace std; @@ -133,25 +133,24 @@ void aclgrphBuildFinalize() { class Impl { public: Impl() { - GetContext().format = domi::DOMI_TENSOR_ND; - GetContext().input_nodes_format_map.clear(); - GetContext().output_formats.clear(); - GetContext().user_input_dims.clear(); - GetContext().input_dims.clear(); - GetContext().op_conf_map.clear(); - GetContext().out_nodes_map.clear(); - GetContext().user_out_nodes.clear(); - GetContext().net_format = domi::DOMI_TENSOR_RESERVED; - GetContext().type = domi::FRAMEWORK_RESERVED; - GetContext().run_mode = ONLY_PRE_CHECK; - GetContext().train_flag = false; - GetContext().fp16_high_precision = HIGH_PRECISION_DEFAULT; - GetContext().output_type.clear(); - GetContext().net_name.clear(); - GetContext().is_dynamic_input = false; - GetContext().dynamic_batch_size.clear(); - GetContext().dynamic_image_size.clear(); - GetContext().dynamic_dims.clear(); + omg_context_ = domi::GetContext(); + omg_context_.format = domi::DOMI_TENSOR_ND; + omg_context_.input_nodes_format_map.clear(); + omg_context_.output_formats.clear(); + omg_context_.user_input_dims.clear(); + omg_context_.input_dims.clear(); + omg_context_.op_conf_map.clear(); + omg_context_.out_nodes_map.clear(); + omg_context_.user_out_nodes.clear(); + omg_context_.net_format = domi::DOMI_TENSOR_RESERVED; + omg_context_.type = domi::FRAMEWORK_RESERVED; + omg_context_.run_mode = ONLY_PRE_CHECK; + omg_context_.train_flag = false; + omg_context_.output_type.clear(); + omg_context_.is_dynamic_input = false; + omg_context_.dynamic_batch_size.clear(); + omg_context_.dynamic_image_size.clear(); + omg_context_.dynamic_dims.clear(); }; ~Impl() { (void)generator_.Finalize(); }; graphStatus CheckOptions(const std::map &options); @@ -161,24 +160,52 @@ class Impl { ModelBufferData &ge_models); graphStatus InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format, bool is_dynamic_input); + void SetRtSocVersion(); public: ge::GeGenerator generator_; std::map options_; bool is_dynamic_input_ = false; + OmgContext omg_context_; }; graphStatus Impl::CheckOptions(const std::map &options) { for (auto &ele : options) { auto it = ge::ir_option::ir_builder_suppported_options.find(ele.first); if (it == ge::ir_option::ir_builder_suppported_options.end()) { - GELOGE(GRAPH_PARAM_INVALID, "input options include unsupported option(%s).Please check!", ele.first.c_str()); - return GRAPH_PARAM_INVALID; + auto it_lx_fusion = ir_builder_supported_options_for_lx_fusion.find(ele.first); + if (it_lx_fusion == ir_builder_supported_options_for_lx_fusion.end()) { + GELOGE(GRAPH_PARAM_INVALID, "input options include unsupported option(%s).Please check!", ele.first.c_str()); + return GRAPH_PARAM_INVALID; + } } options_.insert(ele); } + // Check options build_mode and build_step. 
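The validation that follows admits only whitelisted build_mode/build_step values and, in particular, rejects a tuning build that names no step. For illustration, two option sets; the "ge.buildMode"/"ge.buildStep" key strings and the "after_builder" step value are assumptions about the BUILD_MODE / BUILD_STEP / BUILD_MODE_TUNING constants referenced in this hunk:

#include <map>
#include <string>

// Accepted: tuning mode paired with an explicit step from build_step_options.
const std::map<std::string, std::string> kAccepted = {
    {"ge.buildMode", "tuning"},
    {"ge.buildStep", "after_builder"}};

// Rejected with GRAPH_PARAM_INVALID: tuning mode without any build step.
const std::map<std::string, std::string> kRejected = {{"ge.buildMode", "tuning"}};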
+ std::string build_mode; + auto it = options_.find(BUILD_MODE); + if (it != options_.end() && !(it->second.empty())) { + if (build_mode_options.find(it->second) == build_mode_options.end()) { + GELOGE(GRAPH_PARAM_INVALID, "Build mode:%s is unsupported. Please check!", it->second.c_str()); + return GRAPH_PARAM_INVALID; + } + build_mode = it->second; + } + it = options_.find(BUILD_STEP); + if (it != options_.end() && !(it->second.empty())) { + if (build_step_options.find(it->second) == build_step_options.end()) { + GELOGE(GRAPH_PARAM_INVALID, "Build step:%s is unsupported. Please check!", it->second.c_str()); + return GRAPH_PARAM_INVALID; + } + } else { + if (build_mode == BUILD_MODE_TUNING) { + GELOGE(GRAPH_PARAM_INVALID, "Build mode tuning must specify build step. Please check!"); + return GRAPH_PARAM_INVALID; + } + } return GRAPH_SUCCESS; } + graphStatus Impl::Init(const std::map &options) { // 1. check options graphStatus ret = CheckOptions(options); @@ -186,6 +213,13 @@ graphStatus Impl::Init(const std::map &options) { GELOGE(ret, "User input options are illegal! Please check!"); return ret; } + + GetThreadLocalContext().SetGlobalOption(GetMutableGlobalOptions()); + GetThreadLocalContext().SetGraphOption(options_); + std::string build_mode = (options_.find(BUILD_MODE) == options_.end() || options_[BUILD_MODE] == BUILD_MODE_NORMAL) + ? "" + : options_[BUILD_MODE]; + options_[BUILD_MODE] = build_mode; // set log level std::string log = options_.find(ge::ir_option::LOG_LEVEL) == options_.end() ? IR_OPTION_LOG_LEVEL_DEFAULT : options_[ge::ir_option::LOG_LEVEL]; @@ -212,9 +246,9 @@ graphStatus Impl::Init(const std::map &options) { } GELOGD("User input dynamic_batch_size:%s, dynamic_image_size:%s, dynamic_dims:%s.", dynamic_batch_size.c_str(), dynamic_image_size.c_str(), dynamic_dims.c_str()); - GetContext().dynamic_batch_size = dynamic_batch_size; - GetContext().dynamic_image_size = dynamic_image_size; - GetContext().dynamic_dims = dynamic_dims; + omg_context_.dynamic_batch_size = dynamic_batch_size; + omg_context_.dynamic_image_size = dynamic_image_size; + omg_context_.dynamic_dims = dynamic_dims; // check output_type std::string output_type = options_.find(ge::ir_option::OUTPUT_TYPE) == options_.end() ? "" : options_[ge::ir_option::OUTPUT_TYPE]; @@ -235,8 +269,10 @@ graphStatus Impl::Init(const std::map &options) { // print ge option map ge::PrintOptionMap(options_, "ge option"); + SetRtSocVersion(); + // 3. init generator with options_ - ret = generator_.Initialize(options_); + ret = generator_.Initialize(options_, omg_context_); if (ret != GRAPH_SUCCESS) { GELOGE(ret, "generator Initialize failed!"); return ret; @@ -244,6 +280,20 @@ graphStatus Impl::Init(const std::map &options) { // 4.parse and init Context with input shape format and net format info return this->InitDomiOmgContext(input_shape, input_format, net_format, is_dynamic_input_); } + +void Impl::SetRtSocVersion() { + auto &global_options = GetMutableGlobalOptions(); + auto it = global_options.find(ge::SOC_VERSION); + if (it != global_options.end()) { + const char *soc_version = it->second.c_str(); + rtError_t rt_ret = rtSetSocVersion(soc_version); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("Set soc version %s failed. 
ret:0x%X", soc_version, rt_ret); + } + GELOGI("Set soc version %s success.", soc_version); + } +} + graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector &inputs) { auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); GE_CHECK_NOTNULL(compute_graph); @@ -259,8 +309,8 @@ graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vectorGetName(); GELOGI("Data op name: %s", data_op_name.c_str()); ge::GeShape data_shape; - auto iter = GetContext().input_dims.find(data_op_name); - if (iter != GetContext().input_dims.end()) { + auto iter = omg_context_.input_dims.find(data_op_name); + if (iter != omg_context_.input_dims.end()) { data_shape = ge::GeShape(iter->second); GELOGI("Data op get shape from Context."); } else { @@ -273,7 +323,7 @@ graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector inputs; - if (!GetContext().is_dynamic_input) { // if dynamic input , no need to creat inputs + if (!omg_context_.is_dynamic_input) { // if dynamic input , no need to creat inputs ret = CreateInputsForIRBuild(graph, inputs); if (ret != GRAPH_SUCCESS) { GELOGE(ret, "CreateInputsForIRBuild failed!"); @@ -312,15 +362,15 @@ graphStatus Impl::BuildModel(const Graph &graph, const std::mapsecond; + omg_context_.format = iter->second; } else { GELOGE(GRAPH_PARAM_INVALID, "Input format %s not support , expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.", input_format.c_str()); @@ -332,7 +382,7 @@ graphStatus Impl::InitDomiOmgContext(const string &input_shape, const string &in return GRAPH_SUCCESS; } - if (!ParseInputShape(input_shape, GetContext().input_dims, GetContext().user_input_dims, is_dynamic_input)) { + if (!ParseInputShape(input_shape, omg_context_.input_dims, omg_context_.user_input_dims, is_dynamic_input)) { GELOGE(GRAPH_PARAM_INVALID, "Failed to parse input shape: %s", input_shape.c_str()); return GRAPH_PARAM_INVALID; } diff --git a/src/ge/model/ge_model.cc b/src/ge/model/ge_model.cc index 348f8416..70251876 100644 --- a/src/ge/model/ge_model.cc +++ b/src/ge/model/ge_model.cc @@ -43,6 +43,8 @@ std::shared_ptr GeModel::GetModelTaskDefPtr() const { return const TBEKernelStore &GeModel::GetTBEKernelStore() const { return this->tbe_kernal_store_; } +const CustAICPUKernelStore &GeModel::GetCustAICPUKernelStore() const { return this->cust_aicpu_kernal_store_; } + Buffer GeModel::GetWeight() const { return this->weights_buffer_; } std::string GeModel::GetName() const { return this->name_; } @@ -59,6 +61,10 @@ void GeModel::SetModelTaskDef(const std::shared_ptr &task) { void GeModel::SetTBEKernelStore(const TBEKernelStore &tbe_kernal_store) { this->tbe_kernal_store_ = tbe_kernal_store; } +void GeModel::SetCustAICPUKernelStore(const CustAICPUKernelStore &cust_aicpu_kernal_store) { + this->cust_aicpu_kernal_store_ = cust_aicpu_kernal_store; +} + void GeModel::SetWeight(const Buffer &weights_buffer) { this->weights_buffer_ = weights_buffer; } void GeModel::SetName(const std::string &name) { this->name_ = name; } diff --git a/src/ge/model/ge_model.h b/src/ge/model/ge_model.h index be4b65bc..288b834f 100644 --- a/src/ge/model/ge_model.h +++ b/src/ge/model/ge_model.h @@ -22,6 +22,7 @@ #include #include #include "common/tbe_kernel_store.h" +#include "common/cust_aicpu_kernel_store.h" #include "framework/common/debug/log.h" #include "framework/common/fmk_error_codes.h" #include "graph/buffer.h" @@ -40,6 +41,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder const Graph &GetGraph() const; std::shared_ptr GetModelTaskDefPtr() const; const 
TBEKernelStore &GetTBEKernelStore() const; + const CustAICPUKernelStore &GetCustAICPUKernelStore() const; Buffer GetWeight() const; std::string GetName() const; @@ -50,6 +52,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder void SetGraph(const Graph &graph); void SetModelTaskDef(const std::shared_ptr &task); void SetTBEKernelStore(const TBEKernelStore &tbe_kernal_store); + void SetCustAICPUKernelStore(const CustAICPUKernelStore &cust_aicpu_kernal_store); void SetWeight(const Buffer &weights_buffer); void SetName(const std::string &name); @@ -79,6 +82,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder Graph graph_; std::shared_ptr task_; TBEKernelStore tbe_kernal_store_; + CustAICPUKernelStore cust_aicpu_kernal_store_; Buffer weights_buffer_; std::string name_; diff --git a/src/ge/opskernel_manager/ops_kernel_manager.cc b/src/ge/opskernel_manager/ops_kernel_manager.cc index 24c5a52d..51e8f438 100644 --- a/src/ge/opskernel_manager/ops_kernel_manager.cc +++ b/src/ge/opskernel_manager/ops_kernel_manager.cc @@ -34,6 +34,8 @@ const char *const kInitialize = "Initialize"; const char *const kGetOpsKernelInfoStores = "GetOpsKernelInfoStores"; const char *const kGetGraphOptimizerObjs = "GetGraphOptimizerObjs"; const char *const kFinalize = "Finalize"; + +std::mutex ops_kernel_info_mutex; } // namespace namespace ge { @@ -198,7 +200,7 @@ Status OpsKernelManager::ParsePluginOptions(const map &options, return SUCCESS; } -Status OpsKernelManager::CheckPluginPtr() { +Status OpsKernelManager::CheckPluginPtr() const { for (auto iter = ops_kernel_store_.begin(); iter != ops_kernel_store_.end(); ++iter) { if (iter->second == nullptr) { GELOGE(INTERNAL_ERROR, "CheckPluginPtr OpsKernelInfoStorePtr is null"); @@ -339,6 +341,8 @@ Status OpsKernelManager::Finalize() { } const vector &OpsKernelManager::GetOpsKernelInfo(const string &op_type) { + std::lock_guard lock(ops_kernel_info_mutex); + auto find = ops_kernel_info_.find(op_type); if (find != ops_kernel_info_.end()) { return find->second; @@ -353,7 +357,10 @@ const vector &OpsKernelManager::GetOpsKernelInfo(const string &op_type) } } -const map> &OpsKernelManager::GetAllOpsKernelInfo() const { return ops_kernel_info_; } +const map> &OpsKernelManager::GetAllOpsKernelInfo() const { + std::lock_guard lock(ops_kernel_info_mutex); + return ops_kernel_info_; +} OpsKernelInfoStorePtr OpsKernelManager::GetOpsKernelInfoStore(const std::string &name) const { auto find = ops_kernel_store_.find(name); diff --git a/src/ge/opskernel_manager/ops_kernel_manager.h b/src/ge/opskernel_manager/ops_kernel_manager.h index 43644d0e..a5d4d85c 100644 --- a/src/ge/opskernel_manager/ops_kernel_manager.h +++ b/src/ge/opskernel_manager/ops_kernel_manager.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "common/debug/log.h" #include "common/ge/plugin_manager.h" @@ -74,9 +75,6 @@ class OpsKernelManager { // get enablePluginFlag bool GetEnablePluginFlag() const; - // Finalize other ops kernel resource - Status FinalizeOpsKernel(); - private: OpsKernelManager(); ~OpsKernelManager(); @@ -89,7 +87,7 @@ class OpsKernelManager { Status InitOpKernelInfoStores(const map &options); - Status CheckPluginPtr(); + Status CheckPluginPtr() const; void GetExternalEnginePath(std::string &path, const std::map &options); @@ -105,6 +103,9 @@ class OpsKernelManager { Status InitGraphOptimizerPriority(); + // Finalize other ops kernel resource + Status FinalizeOpsKernel(); + PluginManager plugin_manager_; OpTilingManager 
op_tiling_manager_; // opsKernelInfoStore diff --git a/src/ge/session/inner_session.cc b/src/ge/session/inner_session.cc index a4e77b73..9f1f199f 100644 --- a/src/ge/session/inner_session.cc +++ b/src/ge/session/inner_session.cc @@ -31,28 +31,17 @@ namespace ge { namespace { Status CheckReuseMemoryOption(const std::map &options) { - const int kDecimal = 10; - auto dump_op_env = std::getenv("DUMP_OP"); - int dump_op_flag = (dump_op_env != nullptr) ? std::strtol(dump_op_env, nullptr, kDecimal) : 0; auto iter = options.find(OPTION_EXEC_DISABLE_REUSED_MEMORY); if (iter != options.end()) { if (iter->second == "0") { GELOGD("%s=0, reuse memory is open", OPTION_EXEC_DISABLE_REUSED_MEMORY); - if (dump_op_flag) { - GELOGW("Will dump incorrect op data with ge option %s=0", OPTION_EXEC_DISABLE_REUSED_MEMORY); - } } else if (iter->second == "1") { GELOGD("%s=1, reuse memory is close", OPTION_EXEC_DISABLE_REUSED_MEMORY); } else { GELOGE(PARAM_INVALID, "option %s=%s is invalid", OPTION_EXEC_DISABLE_REUSED_MEMORY, iter->second.c_str()); return FAILED; } - } else { - if (dump_op_flag) { - GELOGW("Will dump incorrect op data with default reuse memory"); - } } - return SUCCESS; } } // namespace @@ -60,7 +49,7 @@ Status CheckReuseMemoryOption(const std::map &options) { static std::mutex mutex_; // BuildGraph and RunGraph use InnerSession::InnerSession(uint64_t session_id, const std::map &options) - : init_flag_(false), session_id_(session_id), options_(options) {} + : init_flag_(false), session_id_(session_id), options_(options), graph_manager_(domi::GetContext()) {} Status InnerSession::Initialize() { if (init_flag_) { diff --git a/src/ge/session/omg.cc b/src/ge/session/omg.cc index 805f8653..bcf42032 100644 --- a/src/ge/session/omg.cc +++ b/src/ge/session/omg.cc @@ -95,7 +95,28 @@ static void ParseAtcParms(const std::map &atc_params, } } -static Status CheckInputShapeNode(const ComputeGraphPtr &graph) { +static Status CheckInputShapeNode(const ComputeGraphPtr &graph, const bool is_dynamic_input) { + if (!is_dynamic_input) { + for (auto node : graph->GetDirectNode()) { + if (node->GetType() == DATA) { + auto data_op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(data_op_desc); + auto tensor_desc = data_op_desc->MutableInputDesc(0); + GE_CHECK_NOTNULL(tensor_desc); + for (auto dim : tensor_desc->GetShape().GetDims()) { + if (dim < 0) { + GELOGE(PARAM_INVALID, + "Input op [%s] shape %ld is negative, maybe you should set input_shape to specify its shape", + node->GetName().c_str(), dim); + const string reason = "maybe you should set input_shape to specify its shape"; + ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, + {node->GetName(), to_string(dim), reason}); + return PARAM_INVALID; + } + } + } + } + } for (auto it : domi::GetContext().user_input_dims) { std::string node_name = it.first; ge::NodePtr node = graph->FindNode(node_name); @@ -758,7 +779,7 @@ FMK_FUNC_HOST_VISIBILITY Status ParseGraph(ge::Graph &graph, const std::map(model.model_data), &model_def); + try { + flag = google::protobuf::TextFormat::ParseFromString(reinterpret_cast(model.model_data), &model_def); + } catch (google::protobuf::FatalException &e) { + free_model_data(&model.model_data); + GELOGE(FAILED, "ParseFromString fail. 
exception message : %s", e.what()); + return FAILED; + } + if (!flag) { free_model_data(&model.model_data); GELOGE(FAILED, "ParseFromString fail."); diff --git a/src/ge/session/session_manager.cc b/src/ge/session/session_manager.cc index bca98d53..35d97c31 100644 --- a/src/ge/session/session_manager.cc +++ b/src/ge/session/session_manager.cc @@ -341,6 +341,13 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vectorRemoveGraph(graph_id); + if (ret != SUCCESS) { + GELOGE(FAILED, "Remove graph failed."); + return FAILED; + } return ret; } diff --git a/src/ge/single_op/single_op.cc b/src/ge/single_op/single_op.cc index 5fa4efcf..aeefe2be 100644 --- a/src/ge/single_op/single_op.cc +++ b/src/ge/single_op/single_op.cc @@ -91,24 +91,12 @@ Status SingleOp::ValidateArgs(const std::vector &inputs, const std:: Status SingleOp::GetArgs(const std::vector &inputs, const std::vector &outputs) { size_t arg_index = 0; - if (use_physical_addr_) { - for (auto &input : inputs) { - auto *addr = reinterpret_cast(input.data); - args_[arg_index++] = reinterpret_cast(addr); - } - - for (auto &output : outputs) { - auto *addr = reinterpret_cast(output.data); - args_[arg_index++] = reinterpret_cast(addr); - } - } else { - for (auto &input : inputs) { - args_[arg_index++] = reinterpret_cast(input.data); - } + for (auto &input : inputs) { + args_[arg_index++] = reinterpret_cast(input.data); + } - for (auto &output : outputs) { - args_[arg_index++] = reinterpret_cast(output.data); - } + for (auto &output : outputs) { + args_[arg_index++] = reinterpret_cast(output.data); } return SUCCESS; } diff --git a/src/ge/single_op/single_op.h b/src/ge/single_op/single_op.h index 71096f35..b7d23d32 100644 --- a/src/ge/single_op/single_op.h +++ b/src/ge/single_op/single_op.h @@ -53,7 +53,6 @@ class SingleOp { std::vector tasks_; std::vector> arg_table_; - bool use_physical_addr_ = false; }; class DynamicSingleOp { diff --git a/src/ge/single_op/single_op_model.cc b/src/ge/single_op/single_op_model.cc index 65f76acc..8c974259 100644 --- a/src/ge/single_op/single_op_model.cc +++ b/src/ge/single_op/single_op_model.cc @@ -85,11 +85,6 @@ void SingleOpModel::ParseOpModelParams(ModelHelper &model_helper, SingleOpModelP Status SingleOpModel::InitModelMem(StreamResource &res) { ParseOpModelParams(model_helper_, model_params_); - if (model_params_.memory_size > ALLOC_MEMORY_MAX_SIZE || model_params_.weight_size > ALLOC_MEMORY_MAX_SIZE) { - GELOGE(PARAM_INVALID, "Can not alloc memory larger than %lu. 
memory size = %lu, weight size = %lu", - ALLOC_MEMORY_MAX_SIZE, model_params_.memory_size, model_params_.weight_size); - return PARAM_INVALID; - } if (model_params_.memory_size > model_params_.zero_copy_mem_size) { const string purpose("malloc feature map memory on model execute."); @@ -203,12 +198,6 @@ Status SingleOpModel::ParseInputsAndOutputs() { } Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) { - // for lhisi - const char *use_physical_address = std::getenv("GE_USE_PHYSICAL_ADDRESS"); - if (use_physical_address != nullptr) { - single_op.use_physical_addr_ = true; - } - int arg_index = 0; for (size_t i = 0; i < input_offset_list_.size(); ++i) { auto *addr = model_params_.mem_base + input_offset_list_[i]; diff --git a/src/ge/single_op/task/op_task.cc b/src/ge/single_op/task/op_task.cc index 8280fff5..f23073bb 100644 --- a/src/ge/single_op/task/op_task.cc +++ b/src/ge/single_op/task/op_task.cc @@ -32,7 +32,7 @@ constexpr int kLaunchRetryTimes = 1000; constexpr int kSleepTime = 10; } // namespace -Status OpTask::OpenDump(void *arg, const OpDescPtr &op_desc, rtStream_t stream) { +Status OpTask::OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream) { if (DumpManager::GetInstance().IsDumpOpen()) { GELOGI("Dump is open in single op,start to set dump info"); std::vector input_addrs; @@ -40,11 +40,11 @@ Status OpTask::OpenDump(void *arg, const OpDescPtr &op_desc, rtStream_t stream) auto input_size = op_desc->GetAllInputsDesc().size(); auto output_size = op_desc->GetOutputsSize(); for (size_t i = 0; i < input_size; i++) { - uint64_t input_addr = *(reinterpret_cast(arg) + i); + uint64_t input_addr = *(reinterpret_cast(arg) + i); input_addrs.emplace_back(input_addr); } for (size_t j = 0; j < output_size; j++) { - uint64_t output_addr = *(reinterpret_cast(arg) + input_size + j); + uint64_t output_addr = *(reinterpret_cast(arg) + input_size + j); output_adds.emplace_back(output_addr); } dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc, input_addrs, output_adds, stream); diff --git a/src/ge/single_op/task/op_task.h b/src/ge/single_op/task/op_task.h index 0401a177..a571bce1 100644 --- a/src/ge/single_op/task/op_task.h +++ b/src/ge/single_op/task/op_task.h @@ -57,7 +57,7 @@ class OpTask { std::vector workspace_sizes_; protected: - Status OpenDump(void *arg, const OpDescPtr &op_desc, rtStream_t stream); + Status OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream); DumpProperties dump_properties_; DumpOp dump_op_; }; diff --git a/src/proto/dump_task.proto b/src/proto/dump_task.proto new file mode 100644 index 00000000..ecdf4792 --- /dev/null +++ b/src/proto/dump_task.proto @@ -0,0 +1,127 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = "proto3"; +package toolkit.dumpdata; + +enum OutputDataType { + DT_UNDEFINED = 0; + DT_FLOAT = 1; + DT_FLOAT16 = 2; + DT_INT8 = 3; + DT_UINT8 = 4; + DT_INT16 = 5; + DT_UINT16 = 6; + DT_INT32 = 7; + DT_INT64 = 8; + DT_UINT32 = 9; + DT_UINT64 = 10; + DT_BOOL = 11; + DT_DOUBLE = 12; + DT_STRING = 13; + DT_DUAL_SUB_INT8 = 14; + DT_DUAL_SUB_UINT8 = 15; + DT_COMPLEX64 = 16; + DT_COMPLEX128 = 17; + DT_QINT8 = 18; + DT_QINT16 = 19; + DT_QINT32 = 20; + DT_QUINT8 = 21; + DT_QUINT16 = 22; + DT_RESOURCE = 23; + DT_STRING_REF = 24; + DT_DUAL = 25; +} + +enum OutputFormat { + FORMAT_NCHW = 0; + FORMAT_NHWC = 1; + FORMAT_ND = 2; + FORMAT_NC1HWC0 = 3; + FORMAT_FRACTAL_Z = 4; + FORMAT_NC1C0HWPAD = 5; + FORMAT_NHWC1C0 = 6; + FORMAT_FSR_NCHW = 7; + FORMAT_FRACTAL_DECONV = 8; + FORMAT_C1HWNC0 = 9; + FORMAT_FRACTAL_DECONV_TRANSPOSE = 10; + FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11; + FORMAT_NC1HWC0_C04 = 12; + FORMAT_FRACTAL_Z_C04 = 13; + FORMAT_CHWN = 14; + FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15; + FORMAT_HWCN = 16; + FORMAT_NC1KHKWHWC0 = 17; + FORMAT_BN_WEIGHT = 18; + FORMAT_FILTER_HWCK = 19; + FORMAT_HASHTABLE_LOOKUP_LOOKUPS=20; + FORMAT_HASHTABLE_LOOKUP_KEYS = 21; + FORMAT_HASHTABLE_LOOKUP_VALUE = 22; + FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23; + FORMAT_HASHTABLE_LOOKUP_HITS=24; + FORMAT_C1HWNCoC0 = 25; + FORMAT_MD = 26; + FORMAT_NDHWC = 27; + FORMAT_FRACTAL_ZZ = 28; + FORMAT_FRACTAL_NZ = 29; + FORMAT_RESERVED = 30; +} + +message OriginalOp { + string name = 1; + uint32 output_index = 2; + OutputDataType data_type = 3; + OutputFormat format = 4; +} + +message Shape { + repeated uint64 dim = 1; +} + +message OpOutput { + OutputDataType data_type = 1; + OutputFormat format = 2; + Shape shape = 3; + OriginalOp original_op = 4; // the original op corresponding to the output + bytes data = 5; + uint64 size = 6; +} + +message OpInput { + OutputDataType data_type = 1; + OutputFormat format = 2; + Shape shape = 3; + bytes data = 4; + uint64 size = 5; +} + +enum BufferType { + L1 = 0; +} + +message OpBuffer { + BufferType buffer_type = 1; + bytes data = 2; + uint64 size = 3; +} + +message DumpData{ + string version = 1; + uint64 dump_time = 2; + repeated OpOutput output = 3; + repeated OpInput input = 4; + repeated OpBuffer buffer = 5; +} diff --git a/src/proto/insert_op.proto b/src/proto/insert_op.proto index fd5bd3ec..a059e122 100644 --- a/src/proto/insert_op.proto +++ b/src/proto/insert_op.proto @@ -40,16 +40,22 @@ message AippOpParams { RAW12 = 12; RAW16 = 13; RAW24 = 14; + RGB16 = 15; + RGB20 = 16; + RGB24 = 17; + RGB8_IR = 18; + RGB16_IR = 19; + RGB24_IR = 20; } - + enum AippMode { undefined = 0; static = 1; dynamic = 2; } - + // AIPPģʽ£¬Çø·Ö¾²Ì¬AIPPºÍ¶¯Ì¬AIPP - AippMode aipp_mode = 1; + AippMode aipp_mode = 1; // related_input_rank²ÎÊýΪ±ØÌÀàÐÍΪÕûÐÍ£¬ÅäÖ÷¶Î§>=0, <=ÊäÈëDataËã×ӵĸöÊý£¬Ä¬ÈÏֵΪ0¡£ // ±êʶ¶ÔÄ£Ð͵ĵڼ¸¸öÊäÈë×öAIPP´¦Àí£¬ÀýÈçÄ£ÐÍÓÐÁ½¸öÊäÈ룬ÐèÒª¶ÔµÚ2¸öÊäÈë×öAIPP£¬ÔòÅäÖÃrelated_input_rankΪ1¡£ @@ -126,6 +132,10 @@ message AippOpParams { repeated int32 input_bias_2 = 44; // [End] ¾²Ì¬AIPP²ÎÊý + + // The n number that is used for raw/rgbir data into f16 transformation. + // The transformation equation is x/(2^n). If set to 0, no transform is performed. 
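+ // For example, with raw_rgbir_to_f16_n = 8, a 12-bit raw sample of 1024
+ // is emitted as 1024 / (2^8) = 4.0 in fp16.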
+ uint32 raw_rgbir_to_f16_n = 45; } message MultiShapeOpParams { diff --git a/third_party/fwkacllib/inc/cce/taskdown_common.hpp b/third_party/fwkacllib/inc/cce/taskdown_common.hpp index 51a8ba11..3ecea523 100644 --- a/third_party/fwkacllib/inc/cce/taskdown_common.hpp +++ b/third_party/fwkacllib/inc/cce/taskdown_common.hpp @@ -34,7 +34,8 @@ typedef enum tagccKernelType { TE_AI_CORE = 4, /* te aicore operator*/ TE_AI_CPU = 5, /* te aicpu operator */ AI_CPU = 6, /* aicpu */ - INVALID = 7, /* unknown kernel type */ + CUST_AI_CPU = 7, /* custom aicpu*/ + INVALID = 8, /* unknown kernel type */ } ccKernelType; typedef struct tagOpContext { diff --git a/third_party/fwkacllib/inc/hccl/base.h b/third_party/fwkacllib/inc/hccl/base.h index 1d83d7bf..00c220f1 100644 --- a/third_party/fwkacllib/inc/hccl/base.h +++ b/third_party/fwkacllib/inc/hccl/base.h @@ -36,76 +36,18 @@ typedef unsigned short u16; typedef unsigned int u32; typedef unsigned long long u64; -/** - * @brief HCOM functions return value definition - */ -typedef enum tagHcclResult { - HCCL_SUCCESS = 0, /**< success */ - HCCL_E_PARA = 1, /**< parameter error */ - HCCL_E_PTR = 2, /**< empty pointer */ - HCCL_E_MEMORY = 3, /**< memory error */ - HCCL_E_INTERNAL = 4, /**< internal error */ - HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ - HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ - HCCL_E_UNAVAIL = 7, /**< resource unavailable */ - HCCL_E_SYSCALL = 8, /**< call system interface error */ - HCCL_E_TIMEOUT = 9, /**< timeout */ - HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ - HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ - HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ - HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ - HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ - HCCL_E_RUNTIME = 15, /**< call runtime api fail */ - HCCL_E_DRV = 16, /**< call driver api fail */ - HCCL_E_PROFILING = 17, /**< call profiling api fail */ - HCCL_E_CCE = 18, /**< call cce api fail */ - HCCL_E_NETWORK = 19, /**< call network api fail */ - HCCL_E_RESERVED /**< reserved */ -} hcclResult_t; - -/* handle to communicator */ -typedef void *hcclComm_t; - -/** - * @brief HCCL Reduction opperation - */ -typedef enum tagHcclRedOp { - HCCL_REP_OP_SUM = 0, /**< sum */ - HCCL_REP_OP_PROD = 1, /**< prod */ - HCCL_REP_OP_MAX = 2, /**< max */ - HCCL_REP_OP_MIN = 3, /**< min */ - HCCL_REP_OP_RESERVED /**< reserved */ -} hcclRedOp_t; - /** * @brief Horovod Reduction opperation */ -typedef enum tagHorovodRedOp { - HOROVOD_REP_OP_AVERAGE = 0, /**< average */ - HOROVOD_REP_OP_SUM = 1, /**< sum */ - HOROVOD_REP_OP_ADASUM = 2, /**< adasum */ - HOROVOD_REP_OP_MIN = 3, /**< min */ - HOROVOD_REP_OP_MAX = 4, /**< max */ - HOROVOD_REP_OP_PROD = 5, /**< proo */ - HOROVOD_REP_OP_RESERVED /**< reserved */ -} horovodRedOp_t; - -/** - * @brief HCCL data type - */ -typedef enum tagHcclDataType { - HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ - HCCL_DATA_TYPE_INT = 1, /**< int32 */ - HCCL_DATA_TYPE_HALF = 2, /**< fp16 */ - HCCL_DATA_TYPE_FLOAT = 3, /**< fp32 */ - HCCL_DATA_TYPE_INT16 = 4, /**< int16 */ - HCCL_DATA_TYPE_RESERVED /**< reserved */ -} hcclDataType_t; - -constexpr u32 HCCL_UNIQUE_ID_BYTES = 2060; // 2060: unique id length -using hcclUniqueId = struct hcclUniqueIdDef { - char internal[HCCL_UNIQUE_ID_BYTES]; -}; +typedef enum { + HOROVOD_REDUCE_AVERAGE = 0, /**< average */ + HOROVOD_REDUCE_SUM = 1, /**< sum */ + HOROVOD_REDUCE_ADASUM = 2, /**< adasum */ + HOROVOD_REDUCE_MIN = 3, /**< min */ + HOROVOD_REDUCE_MAX = 4, /**< max */ + HOROVOD_REDUCE_PROD 
= 5, /**< prod */ + HOROVOD_REDUCE_RESERVED /**< reserved */ +} HorovodReduceOp; const u32 HCCL_MAX_SEGMENT_NUM = 8; // The max number of gradient segments. diff --git a/third_party/fwkacllib/inc/hccl/hccl_types.h b/third_party/fwkacllib/inc/hccl/hccl_types.h new file mode 100755 index 00000000..03f43649 --- /dev/null +++ b/third_party/fwkacllib/inc/hccl/hccl_types.h @@ -0,0 +1,99 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hccl_types.h + * @brief HCCL data type definition + * + */ + +#ifndef HCCL_TYPES_H_ +#define HCCL_TYPES_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * @brief HCCL functions return value definition + */ +typedef enum { + HCCL_SUCCESS = 0, /**< success */ + HCCL_E_PARA = 1, /**< parameter error */ + HCCL_E_PTR = 2, /**< empty pointer */ + HCCL_E_MEMORY = 3, /**< memory error */ + HCCL_E_INTERNAL = 4, /**< internal error */ + HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ + HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ + HCCL_E_UNAVAIL = 7, /**< resource unavailable */ + HCCL_E_SYSCALL = 8, /**< call system interface error */ + HCCL_E_TIMEOUT = 9, /**< timeout */ + HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ + HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ + HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ + HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ + HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ + HCCL_E_RUNTIME = 15, /**< call runtime api fail */ + HCCL_E_DRV = 16, /**< call driver api fail */ + HCCL_E_PROFILING = 17, /**< call profiling api fail */ + HCCL_E_CCE = 18, /**< call cce api fail */ + HCCL_E_NETWORK = 19, /**< call network api fail */ + HCCL_E_RESERVED /**< reserved */ +} HcclResult; + +/** + * @brief handle to HCCL communicator + */ +typedef void *HcclComm; + +/** + * @brief HCCL Reduction operation + */ +typedef enum { + HCCL_REDUCE_SUM = 0, /**< sum */ + HCCL_REDUCE_PROD = 1, /**< prod */ + HCCL_REDUCE_MAX = 2, /**< max */ + HCCL_REDUCE_MIN = 3, /**< min */ + HCCL_REDUCE_RESERVED /**< reserved */ +} HcclReduceOp; + +/** + * @brief HCCL data type + */ +typedef enum { + HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ + HCCL_DATA_TYPE_INT16 = 1, /**< int16 */ + HCCL_DATA_TYPE_INT32 = 2, /**< int32 */ + HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */ + HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */ + HCCL_DATA_TYPE_RESERVED /**< reserved */ +} HcclDataType; + +const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length + +/** + * @brief HCCL root info + */ +typedef struct HcclRootInfoDef { + char internal[HCCL_ROOT_INFO_BYTES]; +} HcclRootInfo; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // HCCL_TYPES_H_ diff --git a/third_party/fwkacllib/inc/hccl/hcom.h b/third_party/fwkacllib/inc/hccl/hcom.h index 19bf4fb3..4399d3a8 100644 --- a/third_party/fwkacllib/inc/hccl/hcom.h +++ b/third_party/fwkacllib/inc/hccl/hcom.h @@ -23,6 +23,7 @@ #define HCOM_H_ #include +#include #ifdef __cplusplus 
extern "C" { @@ -33,37 +34,37 @@ extern "C" { * * @param rank_table A string identifying the rank table file path, include file name. * @param identify A string identifying the identify for the rank. - * @return hcclResult_t + * @return HcclResult * @see hcom_destroy() */ -extern hcclResult_t hcom_init(const char *rank_table, const char *identify); +extern HcclResult hcom_init(const char *rank_table, const char *identify); /** * @brief Destroy HCOM * - * @return hcclResult_t + * @return HcclResult * @see hcom_init() */ -extern hcclResult_t hcom_destroy(void); +extern HcclResult hcom_destroy(void); /** * @brief Bind the model. * * @param model A pointer identifying the model information. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult * @see hcom_unbind_model() */ -extern hcclResult_t hcom_bind_model(rtModel_t model, rtStream_t stream); +extern HcclResult hcom_bind_model(rtModel_t model, rtStream_t stream); /** * @brief Unbind the model. * * @param model An pointer identifying the model information. - * @return hcclResult_t + * @return HcclResult * @see hcom_unbind_model() */ -extern hcclResult_t hcom_unbind_model(rtModel_t model); +extern HcclResult hcom_unbind_model(rtModel_t model); /** * @brief All-gather operator. @@ -75,10 +76,10 @@ extern hcclResult_t hcom_unbind_model(rtModel_t model); * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_all_gather(const char *tag, void *inputPtr, void *outputPtr, u64 inputCount, - hcclDataType_t dataType, const char *group, rtStream_t stream); +extern HcclResult hcom_all_gather(const char *tag, void *inputPtr, void *outputPtr, u64 inputCount, + HcclDataType dataType, const char *group, rtStream_t stream); /** * @brief All-reduce operator. @@ -91,10 +92,10 @@ extern hcclResult_t hcom_all_gather(const char *tag, void *inputPtr, void *outpu * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_all_reduce(const char *tag, void *inputPtr, void *outputPtr, u64 count, - hcclDataType_t dataType, hcclRedOp_t op, const char *group, rtStream_t stream); +extern HcclResult hcom_all_reduce(const char *tag, void *inputPtr, void *outputPtr, u64 count, + HcclDataType dataType, HcclReduceOp op, const char *group, rtStream_t stream); /** * @brief Broadcast operator. @@ -106,9 +107,9 @@ extern hcclResult_t hcom_all_reduce(const char *tag, void *inputPtr, void *outpu * @param root An integer(u32) identifying the the root rank in the operator. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. 
- * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_broadcast(const char *tag, void *ptr, u64 count, hcclDataType_t dataType, u32 root, +extern HcclResult hcom_broadcast(const char *tag, void *ptr, u64 count, HcclDataType dataType, u32 root, const char *group, rtStream_t stream); /** @@ -122,46 +123,46 @@ extern hcclResult_t hcom_broadcast(const char *tag, void *ptr, u64 count, hcclDa * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_reduce_scatter(const char *tag, void *inputPtr, void *outputPtr, u64 count, - hcclDataType_t dataType, hcclRedOp_t op, const char *group, rtStream_t stream); +extern HcclResult hcom_reduce_scatter(const char *tag, void *inputPtr, void *outputPtr, u64 count, + HcclDataType dataType, HcclReduceOp op, const char *group, rtStream_t stream); /** * @brief Get the rank number in the group. * * @param group A string identifying the group name. * @param rankSize A pointer identifying the rank number. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_rank_size(const char *group, u32 *rankSize); +HcclResult hcom_get_rank_size(const char *group, u32 *rankSize); /** * @brief Get the rank number of this rank's server within the group. * * @param group A string identifying the group name. * @param localRankSize A pointer identifying the rank number. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_local_rank_size(const char *group, u32 *localRankSize); +HcclResult hcom_get_local_rank_size(const char *group, u32 *localRankSize); /** * @brief Get the rank id of this rank. * * @param group A string identifying the group name. * @param rankId A pointer identifying the rank id. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_rank_id(const char *group, u32 *rankId); +HcclResult hcom_get_rank_id(const char *group, u32 *rankId); /** * @brief Get the local rank id of this rank's server within the group. * * @param group A string identifying the group name. * @param localRankId A pointer identifying the local rank id. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_local_rank_id(const char *group, u32 *localRankId); +HcclResult hcom_get_local_rank_id(const char *group, u32 *localRankId); /** * @brief Get the world rank id according to the group rank id. @@ -169,9 +170,9 @@ hcclResult_t hcom_get_local_rank_id(const char *group, u32 *localRankId); * @param group A string identifying the group name. * @param groupRank An integer(u32) identifying the group rank id. * @param worldRank A pointer identifying the world rank id. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank); +HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank); /** * @brief Get the group rank id according to the world rank id. @@ -179,9 +180,9 @@ hcclResult_t hcom_get_world_rank_from_group_rank(const char *group, u32 groupRan * @param worldRank An integer(u32) identifying the world rank id. * @param group A string identifying the group name. * @param groupRank A pointer identifying the group rank id. 
- * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank); +HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank); /** * @brief Create group. @@ -189,17 +190,17 @@ hcclResult_t hcom_get_group_rank_from_world_rank(u32 worldRank, const char *grou * @param group A string identifying the group name. * @param rankNum An integer(u32) identifying the number of ranks in the group. * @param rankIds A list identifying the ranks in the group. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_create_group(const char *group, u32 rankNum, u32 *rankIds); +HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds); /** * @brief Destroy group. * * @param group A string identifying the group name. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_destroy_group(const char *group); +HcclResult hcom_destroy_group(const char *group); /** * @brief Send operator. @@ -213,9 +214,9 @@ hcclResult_t hcom_destroy_group(const char *group); * The message will be sent by the receive operator with the same "sr_tag". * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_send(const char *tag, void *inputPtr, u64 count, hcclDataType_t dataType, +HcclResult hcom_send(const char *tag, void *inputPtr, u64 count, HcclDataType dataType, u32 destRank, u32 srTag, const char *group, rtStream_t stream); /** @@ -230,9 +231,9 @@ hcclResult_t hcom_send(const char *tag, void *inputPtr, u64 count, hcclDataType_ * The message will be sent by the send operator with the same "sr_tag". * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_receive(const char *tag, void *outputPtr, u64 count, hcclDataType_t dataType, +HcclResult hcom_receive(const char *tag, void *outputPtr, u64 count, HcclDataType dataType, u32 srcRank, u32 srTag, const char *group, rtStream_t stream); /** @@ -243,9 +244,9 @@ hcclResult_t hcom_receive(const char *tag, void *outputPtr, u64 count, hcclDataT * @param maxSegmentNum An integer(u32) identifying the max segments of gradients. * @param segmentNum A pointer identifying the segments number of gradients. * @param segmentIdx A list identifying the index of end gradient in each segment. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_split_strategy(const char *group, const struct model_feature *feature, u32 maxSegmentNum, +HcclResult hcom_get_split_strategy(const char *group, const struct model_feature *feature, u32 maxSegmentNum, u32 *segmentNum, u32 *segmentIdx, GradSplitForceMode force = FORCE_NONE, OriginalGraphShapeType shapeType = KNOWN_SHAPE); @@ -255,9 +256,9 @@ hcclResult_t hcom_get_split_strategy(const char *group, const struct model_featu * @param group A string identifying the group name. * @param segmentNum An integer(u32) identifying the segments number of gradients. * @param IdxList A list identifying the index of end gradient in each segment.
- * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_set_split_strategy_by_index(const char *group, u32 segmentNum, const u32 *IdxList); +extern HcclResult hcom_set_split_strategy_by_index(const char *group, u32 segmentNum, const u32 *IdxList); /** * @brief Set the gradient split strategy within the group, according to gradient data size. @@ -265,9 +266,9 @@ extern hcclResult_t hcom_set_split_strategy_by_index(const char *group, u32 segm * @param group A string identifying the group name. * @param segmentNum An integer(u32) identifying the segments number of gradients. * @param sizeList A list identifying the percent of each segment. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList); +extern HcclResult hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList); #ifdef __cplusplus } diff --git a/third_party/fwkacllib/inc/ops/aipp.h b/third_party/fwkacllib/inc/ops/aipp.h index 85666223..d11fdc95 100644 --- a/third_party/fwkacllib/inc/ops/aipp.h +++ b/third_party/fwkacllib/inc/ops/aipp.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file aipp.h + * \brief + */ #ifndef GE_OP_AIPP_H #define GE_OP_AIPP_H @@ -41,7 +45,6 @@ REG_OP(Aipp) .OUTPUT(features, TensorType({DT_FLOAT16, DT_UINT8})) .ATTR(aipp_config_path, String, "./aipp.cfg") .OP_END_FACTORY_REG(Aipp) -} // namespace ge /** *@brief This op is for dynamic aipp. If you set aipp-mode to dynamic \n in aipp config file, framework will auto add one input node to graph at last. *@par Third-party framework compatibility *Compatible with the TensorFlow operator AippData. */ -namespace ge { REG_OP(AippData) .INPUT(data, TensorType::ALL()) .OUTPUT(out, TensorType::ALL()) .ATTR(index, Int, 0) .OP_END_FACTORY_REG(AippData) -} +} // namespace ge #endif // GE_OP_AIPP_H diff --git a/third_party/fwkacllib/inc/ops/all_ops.h b/third_party/fwkacllib/inc/ops/all_ops.h index c30bf32b..84ff3d08 100644 --- a/third_party/fwkacllib/inc/ops/all_ops.h +++ b/third_party/fwkacllib/inc/ops/all_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file all_ops.h + * \brief + */ #ifndef BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ #define BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h index 7c6f9b2c..ea82e0fa 100644 --- a/third_party/fwkacllib/inc/ops/array_ops.h +++ b/third_party/fwkacllib/inc/ops/array_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file array_ops.h + * \brief + */ #ifndef GE_OP_ARRAY_OPS_H_ #define GE_OP_ARRAY_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/audio_ops.h b/third_party/fwkacllib/inc/ops/audio_ops.h index 6db181f9..feecd7ae 100644 --- a/third_party/fwkacllib/inc/ops/audio_ops.h +++ b/third_party/fwkacllib/inc/ops/audio_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file audio_ops.h + * \brief + */ #ifndef GE_OP_AUDIO_OPS_H_ #define GE_OP_AUDIO_OPS_H_ @@ -43,11 +47,12 @@ per time slice. *@attention Constraints: \n *Mfcc runs on the Ascend AI CPU, which delivers poor performance. \n - *@par Third-party framework compatibility *Compatible with the TensorFlow operator Mfcc. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/ REG_OP(Mfcc) .INPUT(spectrogram, TensorType({DT_FLOAT})) .INPUT(sample_rate, TensorType({DT_INT32})) @@ -79,6 +84,9 @@ poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator AudioSpectrogram. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(AudioSpectrogram) @@ -110,6 +118,9 @@ Length of audio requested. *@par Third-party framework compatibility *Compatible with the TensorFlow operator DecodeWav. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(DecodeWav) @@ -136,6 +147,9 @@ REG_OP(DecodeWav) *@par Third-party framework compatibility *Compatible with tensorflow Operator EncodeWav. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(EncodeWav) diff --git a/third_party/fwkacllib/inc/ops/batch_ops.h b/third_party/fwkacllib/inc/ops/batch_ops.h index 47c5b06b..dd2efade 100644 --- a/third_party/fwkacllib/inc/ops/batch_ops.h +++ b/third_party/fwkacllib/inc/ops/batch_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file batch_ops.h + * \brief + */ #ifndef GE_OP_BATCH_OPS_H_ #define GE_OP_BATCH_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/bitwise_ops.h b/third_party/fwkacllib/inc/ops/bitwise_ops.h index ccbeb04c..0a6cbe9b 100644 --- a/third_party/fwkacllib/inc/ops/bitwise_ops.h +++ b/third_party/fwkacllib/inc/ops/bitwise_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file bitwise_ops.h + * \brief + */ #ifndef GE_OP_BITWISE_OPS_H_ #define GE_OP_BITWISE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h index 37345833..cded3acd 100644 --- a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h +++ b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file boosted_trees_ops.h + * \brief + */ #ifndef GE_OP_BOOSTED_TREES_OPS_H_ #define GE_OP_BOOSTED_TREES_OPS_H_ @@ -44,8 +48,10 @@ a single feature. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BoostedTreesBucketize. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BoostedTreesBucketize) .DYNAMIC_INPUT(float_values, TensorType({DT_FLOAT})) .DYNAMIC_INPUT(bucket_boundaries, TensorType({DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h index 50178a59..c0109fca 100644 --- a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h +++ b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file candidate_sampling_ops.h + * \brief + */ #ifndef GE_OP_CANDIDATE_SAMPLING_OPS_H_ #define GE_OP_CANDIDATE_SAMPLING_OPS_H_ @@ -60,8 +64,10 @@ which delivers poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ThreadUnsafeUnigramCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -114,8 +120,10 @@ which delivers poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator UniformCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
+*/ REG_OP(UniformCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -180,8 +188,10 @@ which delivers poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator FixedUnigramCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(FixedUnigramCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -239,8 +249,10 @@ poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator LearnedUnigramCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(LearnedUnigramCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -291,8 +303,10 @@ poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator LogUniformCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(LogUniformCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -339,8 +353,10 @@ to occur in a batch of sampled candidates. If "unique" is true, then this is a p *@par Third-party framework compatibility *Compatible with the TensorFlow operator AllCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AllCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -379,8 +395,10 @@ each element is -FLOAT_MAX. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ComputeAccidentalHits. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ComputeAccidentalHits) .INPUT(true_classes, TensorType({ DT_INT64 })) .INPUT(sampled_candidates, TensorType({ DT_INT64 })) diff --git a/third_party/fwkacllib/inc/ops/condtake_ops.h b/third_party/fwkacllib/inc/ops/condtake_ops.h index 37d3b92a..72bf46a0 100644 --- a/third_party/fwkacllib/inc/ops/condtake_ops.h +++ b/third_party/fwkacllib/inc/ops/condtake_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file condtake_ops.h + * \brief + */ #ifndef GE_OP_CONDTAKE_OPS_H_ #define GE_OP_CONDTAKE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/control_flow_ops.h b/third_party/fwkacllib/inc/ops/control_flow_ops.h index fa68d49a..75992103 100644 --- a/third_party/fwkacllib/inc/ops/control_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/control_flow_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file control_flow_ops.h + * \brief + */ #ifndef GE_CONTROL_FLOW_OPS_H_ #define GE_CONTROL_FLOW_OPS_H_ @@ -377,6 +381,27 @@ REG_OP(RefExit) */ REG_OP(ControlTrigger) .OP_END_FACTORY_REG(ControlTrigger) + +/** +*@brief Returns index of shape in the map. + +*@par Inputs: +* Three inputs, including: +*@li x: One-dimensional tensor of type int32, specifying queried shape, max size is 8. +*@li data_seq: One-dimensional tensor of type int32, specifying the mapped table to be queried. +*@li level_index: One-dimensional tensor of type int32, specifying secondary index. + +*@par Outputs: +*@li y: A Tensor with shape [batch, 8], of type int32, specifying index of shape in the map.
+*@par Third-party framework compatibility +* It is a custom operator. It has no corresponding operator in Caffe. +*/ +REG_OP(MapIndex) + .INPUT(x, TensorType({DT_INT32})) + .INPUT(data_seq, TensorType({DT_INT32})) + .OPTIONAL_INPUT(level_index, TensorType({DT_INT32})) + .OUTPUT(y, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(MapIndex) } // namespace ge #endif // GE_CONTROL_FLOW_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/ctc_ops.h b/third_party/fwkacllib/inc/ops/ctc_ops.h index 74b797f3..eaf6f9e9 100644 --- a/third_party/fwkacllib/inc/ops/ctc_ops.h +++ b/third_party/fwkacllib/inc/ops/ctc_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file ctc_ops.h + * \brief + */ #ifndef GE_OP_CTC_OPS_H #define GE_OP_CTC_OPS_H diff --git a/third_party/fwkacllib/inc/ops/data_flow_ops.h b/third_party/fwkacllib/inc/ops/data_flow_ops.h index c766167a..d407c4cd 100644 --- a/third_party/fwkacllib/inc/ops/data_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/data_flow_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file data_flow_ops.h + * \brief + */ #ifndef GE_OP_DATA_FLOW_OPS_H_ #define GE_OP_DATA_FLOW_OPS_H_ @@ -486,8 +490,10 @@ DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING. *@par Third-party framework compatibility *Compatible with the TensorFlow operator DynamicPartition. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(DynamicPartition) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ @@ -521,8 +527,10 @@ DT_QUINT8, DT_QINT8, DT_STRING, DT_COMPLEX64, DT_COMPLEX128. *@par Third-party framework compatibility *Compatible with the TensorFlow operator DynamicStitch. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(DynamicStitch) .DYNAMIC_INPUT(indices, TensorType({DT_INT32})) .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ @@ -1603,8 +1611,10 @@ the given name across multiple sessions. *@par Third-party framework compatibility *Compatible with the TensorFlow operator Barrier. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Barrier) .OUTPUT(handle, TensorType({DT_STRING_REF})) .REQUIRED_ATTR(component_types, ListType) @@ -1635,8 +1645,10 @@ DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierInsertMany. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierInsertMany) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(keys, TensorType({DT_STRING})) @@ -1683,8 +1695,10 @@ DT_RESOURCE, DT_STRING. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierTakeMany. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierTakeMany) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(num_elements, TensorType(DT_INT32)) @@ -1718,8 +1732,10 @@ even if no new key is introduced. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierClose. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
+*/ REG_OP(BarrierClose) .INPUT(handle, TensorType({DT_STRING_REF})) .ATTR(cancel_pending_enqueues, Bool, false) @@ -1740,8 +1756,10 @@ REG_OP(BarrierClose) *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierReadySize. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierReadySize) .INPUT(handle, TensorType({DT_STRING_REF})) .OUTPUT(size, TensorType(DT_INT32)) @@ -1762,8 +1780,10 @@ REG_OP(BarrierReadySize) *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierIncompleteSize. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierIncompleteSize) .INPUT(handle, TensorType({DT_STRING_REF})) .OUTPUT(size, TensorType(DT_INT32)) @@ -1824,8 +1844,10 @@ name across multiple sessions. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ConditionalAccumulator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ConditionalAccumulator) .OUTPUT(handle, TensorType({DT_STRING_REF})) .REQUIRED_ATTR(dtype, Type) @@ -1858,8 +1880,10 @@ which delivers poor performance.\n *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorApplyGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorApplyGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(local_step, TensorType({DT_INT64})) @@ -1884,8 +1908,10 @@ which delivers poor performance.\n *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorNumAccumulated. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorNumAccumulated) .INPUT(handle, TensorType({DT_STRING_REF})) .OUTPUT(y, TensorType({DT_INT32})) @@ -1904,8 +1930,10 @@ REG_OP(AccumulatorNumAccumulated) *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorSetGlobalStep. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorSetGlobalStep) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(new_global_step, TensorType({DT_INT64})) @@ -1935,8 +1963,10 @@ DT_FLOAT16, DT_FLOAT, DT_DOUBLE. *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorTakeGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorTakeGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(num_required, TensorType({DT_INT32})) @@ -1962,8 +1992,10 @@ default is "MEAN". *@par Third-party framework compatibility *Compatible with tensorflow SparseConditionalAccumulator operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(SparseConditionalAccumulator) .OUTPUT(handle, TensorType({DT_STRING_REF})) .REQUIRED_ATTR(shape, ListInt) @@ -1996,8 +2028,10 @@ the type of the accumulator. *@par Third-party framework compatibility *Compatible with tensorflow SparseAccumulatorApplyGradient operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(SparseAccumulatorApplyGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(local_step, TensorType({DT_INT64})) @@ -2030,8 +2064,10 @@ type of the accumulator. 
*@par Third-party framework compatibility *Compatible with tensorflow SparseAccumulatorTakeGradient operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(SparseAccumulatorTakeGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(num_required, TensorType({DT_INT32})) @@ -2062,8 +2098,10 @@ name across multiple sessions. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceConditionalAccumulator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceConditionalAccumulator) .OUTPUT(handle, TensorType({DT_RESOURCE})) .REQUIRED_ATTR(dtype, Type) @@ -2089,8 +2127,10 @@ DT_FLOAT16, DT_FLOAT, DT_DOUBLE *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorApplyGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorApplyGradient) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(local_step, TensorType({DT_INT64})) @@ -2111,8 +2151,10 @@ REG_OP(ResourceAccumulatorApplyGradient) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorNumAccumulated. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorNumAccumulated) .INPUT(handle, TensorType({DT_RESOURCE})) .OUTPUT(num_accumulated, TensorType({DT_INT32})) @@ -2130,8 +2172,10 @@ REG_OP(ResourceAccumulatorNumAccumulated) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorSetGlobalStep. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorSetGlobalStep) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(new_global_step, TensorType({DT_INT64})) @@ -2158,8 +2202,10 @@ DT_FLOAT16, DT_FLOAT, DT_DOUBLE. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorTakeGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorTakeGradient) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(num_required, TensorType({DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index 741a9071..cd42b707 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file elewise_calculation_ops.h + * \brief + */ #ifndef GE_OP_ELEWISE_CALCULATION_OPS_H #define GE_OP_ELEWISE_CALCULATION_OPS_H #include "graph/operator_reg.h" @@ -2910,14 +2914,14 @@ REG_OP(Bias) .OP_END_FACTORY_REG(Bias) /** -*@brief Function multiply gradients calculation. \n +*@brief Function multiply gradients calculation. output0 is the result of input0 dot-multiplied by input1. output1 is the result of input0 dot-multiplied by input1, then reduce-summed. *@par Inputs: *@li input0: A Tensor of input of mul, and dtype supports float16, float32. *@li input1: A Tensor of input of mul and mul_1, and dtype supports float16, float32. -*@li input2: A Tensor of input of mul_1, and dtype supports float16, float32'. +*@li input2: A Tensor of input of mul_1, and dtype supports float16, float32. *@par Attributes: *@li axes: The dimensions to reduce. Default:(), reduce all dimensions.
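Before the next hunk, an editorial aside: the rewritten ConfusionMulGrad brief above is easy to misread, so here is a scalar reference of the two outputs it describes. This is a plain C++ sketch over flat buffers, assuming the default axes=() (reduce over all elements); it is not part of the patch and the function name is hypothetical:

#include <cstddef>

// Reference semantics for the brief above:
//   output0[i] = input0[i] * input1[i]                (the elementwise "mul")
//   output1    = sum over i of input0[i] * input1[i]  (the "mul" result reduce-summed)
void ConfusionMulGradRef(const float *input0, const float *input1,
                         float *output0, float *output1, std::size_t n) {
  float acc = 0.0f;
  for (std::size_t i = 0; i < n; ++i) {
    output0[i] = input0[i] * input1[i];
    acc += output0[i];
  }
  *output1 = acc;
}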
\n @@ -2940,12 +2944,12 @@ REG_OP(ConfusionMulGrad) .OP_END_FACTORY_REG(ConfusionMulGrad) /** -*@brief Function fused multiply l2 loss calculation. \n +*@brief Function fused multiply l2 loss calculation. *@par Inputs: -*@li x1: A Tensor of type float16, float32. -*@li x2: A Tensor of type float16, float32. -*@li x3: A Tensor of type float16, float32. +*@li x1: A Tensor of number type. +*@li x2: A Tensor of number type. +*@li x3: A Tensor of number type. *@par Outputs: *@li y1: A Tensor of shape and dtype of first output, which should have \n @@ -3092,7 +3096,6 @@ REG_OP(Fills) *@brief Add tensor with scale. *@par Inputs: -*Five inputs, including: * @li x1: A Tensor dtype of int32, float16, float32. * @li x2: A Tensor dtype of int32, float16, float32. @@ -3178,6 +3181,27 @@ REG_OP(TensorMove) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8, DT_BOOL})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8, DT_BOOL})) .OP_END_FACTORY_REG(TensorMove) + +/** +*@brief Copy data from x to output_x. + +*@par Inputs: +*One input, including: +* @li x: A Tensor. Must be one of the following types: float16, float32, int8, uint8, int32, bool. + +*@par Outputs: +*output_x: A Tensor. Has the same type as "x". + +*@par Third-party framework compatibility +*/ +REG_OP(TensorRedirect) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8, + DT_INT64, DT_INT16, DT_UINT16, DT_DOUBLE, + DT_COMPLEX64})) + .OUTPUT(output_x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8, + DT_INT64, DT_INT16, DT_UINT16, DT_DOUBLE, + DT_COMPLEX64})) + .OP_END_FACTORY_REG(TensorRedirect) } // namespace ge diff --git a/third_party/fwkacllib/inc/ops/functional_ops.h b/third_party/fwkacllib/inc/ops/functional_ops.h index f4a88661..33dce25d 100644 --- a/third_party/fwkacllib/inc/ops/functional_ops.h +++ b/third_party/fwkacllib/inc/ops/functional_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file functional_ops.h + * \brief + */ #ifndef GE_FUNCTIONAL_OPS_H_ #define GE_FUNCTIONAL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/get_data_ops.h b/third_party/fwkacllib/inc/ops/get_data_ops.h index 0a9b174b..33a64903 100644 --- a/third_party/fwkacllib/inc/ops/get_data_ops.h +++ b/third_party/fwkacllib/inc/ops/get_data_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file get_data_ops.h + * \brief + */ #ifndef GE_OP_GET_DATA_OPS_H_ #define GE_OP_GET_DATA_OPS_H_ @@ -21,12 +25,31 @@ namespace ge { +/** +*@brief Binding dataset and GetNext +*@par Attributes: None +*@par Inputs: Dataset and GetNext operator +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(MakeIterator) .INPUT(x, TensorType::ALL()) .INPUT(x1, TensorType::ALL()) .ATTR(_kernel, String, "dp") .OP_END_FACTORY_REG(MakeIterator) +/** +*@brief Dataset iterator +*@par Attributes: +*output_types: Data type of output +*output_shapes: Shapes of output +*container: Iterator container name +*shared_name: Iterator id +*@par Inputs: None +*@par Outputs: Dataset +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/ REG_OP(IteratorV2) .OUTPUT(y, TensorType::ALL()) .ATTR(output_types, ListInt, {}) @@ -35,6 +58,17 @@ REG_OP(IteratorV2) .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(IteratorV2) +/** +*@brief Dataset GetNext iterator +*@par Attributes: +*output_types: Data type of output +*output_shapes: Shapes of output +*output_num: Num of output +*@par Inputs: Queue data +*@par Outputs: Input of compute graph +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(IteratorGetNext) .INPUT(x, TensorType::ALL()) .DYNAMIC_OUTPUT(y, TensorType::ALL()) @@ -44,6 +78,17 @@ REG_OP(IteratorGetNext) .ATTR(_kernel, String, "dp") .OP_END_FACTORY_REG(IteratorGetNext) +/** +*@brief Device queue data area. +*@par Attributes: +*output_types: Data type of output +*output_shapes: Shapes of output +*channel_name: Channel ID corresponding to TDT +*@par Inputs: None +*@par Outputs: Dataset GetNext iterator +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(DeviceQueueDataset) .OUTPUT(y, TensorType::ALL()) .ATTR(output_types, ListInt, {}) diff --git a/third_party/fwkacllib/inc/ops/hcom_ops.h b/third_party/fwkacllib/inc/ops/hcom_ops.h index bdacebdf..231729ce 100644 --- a/third_party/fwkacllib/inc/ops/hcom_ops.h +++ b/third_party/fwkacllib/inc/ops/hcom_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file hcom_ops.h + * \brief Huawei collective communication library ops. + */ #ifndef GE_OP_HCOM_OPS_H_ #define GE_OP_HCOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/hvd_ops.h b/third_party/fwkacllib/inc/ops/hvd_ops.h index 09748b8e..89282ca5 100644 --- a/third_party/fwkacllib/inc/ops/hvd_ops.h +++ b/third_party/fwkacllib/inc/ops/hvd_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file hvd_ops.h + * \brief Horovod collective communication library ops. + */ #ifndef GE_OP_HVD_OPS_H_ #define GE_OP_HVD_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h index 1ea62fa9..9412112c 100644 --- a/third_party/fwkacllib/inc/ops/image_ops.h +++ b/third_party/fwkacllib/inc/ops/image_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file image_ops.h + * \brief + */ #ifndef GE_OP_MAGE_OPS_H_ #define GE_OP_MAGE_OPS_H_ @@ -144,36 +148,33 @@ REG_OP(CropAndResize) .OP_END_FACTORY_REG(CropAndResize) /** -*@brief Extracts crops from the input image tensor and resizes them. Extracts \n -crops from the input image tensor and resizes them using bilinear sampling or \n -nearest neighbor sampling to a common output size specified by crop_size. +*@brief Extracts crops from the input image tensor and resizes them. +* Extracts crops from the input image tensor and resizes them using bilinear sampling or +* nearest neighbor sampling to a common output size specified by crop_size. *@par Inputs: -*Input images must be a 5HD tensor. Inputs include: \n -*@li images:A Tensor. Must be one of the following types:float. A 5HD tensor of shape \n -[batch, C1, image_height, image_width, C0]. +*Input images must be a 5HD tensor. Inputs include: +*@li x:A Tensor. Must be one of the following types:float16, float. A 5HD tensor of shape +* [batch, C1, image_height, image_width, C0]. *@li boxes: A Tensor of type float. A 2-D tensor of shape [num_boxes, 4]. -*@li box_index: A Tensor of type int32. A 1-D tensor of shape [num_boxes] with \n -int32 values in [0, batch - 1). +*@li box_index: A Tensor of type int32.
A 1-D tensor of shape [num_boxes] with int32 values in [0, batch). *@par Attributes: *@li crop_size: list int. [crop_height, crop_width]. All cropped image patches are resized to this size. -*@li extrapolation_value: An optional float. Defaults to 0. Value used for \n -extrapolation, when applicable. -*@li method: An optional string from: '"bilinear"'. Defaults to \n -"bilinear". +*@li extrapolation_value: An optional float. Defaults to 0. Value used for extrapolation, when applicable. +*@li method: An optional string from: '"bilinear"'. Defaults to "bilinear". *@par Outputs: *y:A Tensor of type float. -*@attention Constraints: \n +*@attention Constraints: *Input images must be a 5HD tensor. *@par Third-party framework compatibility *Compatible with tensorflow CropAndResize operator. */ REG_OP(CropAndResizeD) - .INPUT(x, TensorType({DT_FLOAT})) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(boxes, TensorType({DT_FLOAT})) .INPUT(box_index, TensorType({DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/internal_ops.h b/third_party/fwkacllib/inc/ops/internal_ops.h index 0f9fd12f..014b7a1b 100644 --- a/third_party/fwkacllib/inc/ops/internal_ops.h +++ b/third_party/fwkacllib/inc/ops/internal_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file internal_ops.h + * \brief + */ #ifndef GE_OP_INTERNAL_OPS_H_ #define GE_OP_INTERNAL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/linalg_ops.h b/third_party/fwkacllib/inc/ops/linalg_ops.h index 916c3267..145e021e 100644 --- a/third_party/fwkacllib/inc/ops/linalg_ops.h +++ b/third_party/fwkacllib/inc/ops/linalg_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file linalg_ops.h + * \brief + */ #ifndef GE_OP_LINALG_OPS_H_ #define GE_OP_LINALG_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/logging_ops.h b/third_party/fwkacllib/inc/ops/logging_ops.h index 897fc699..7ca04188 100644 --- a/third_party/fwkacllib/inc/ops/logging_ops.h +++ b/third_party/fwkacllib/inc/ops/logging_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file logging_ops.h + * \brief + */ #ifndef GE_OP_LOGGING_OPS_H #define GE_OP_LOGGING_OPS_H @@ -35,8 +39,10 @@ the graph. *@par Third-party framework compatibility *Compatible with tensorflow Timestamp operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Timestamp) .OUTPUT(y, TensorType({DT_DOUBLE})) .OP_END_FACTORY_REG(Timestamp) @@ -55,8 +61,10 @@ Inputs include: \n *@par Third-party framework compatibility *Compatible with tensorflow Assert operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Assert) .INPUT(input_condition, TensorType{DT_BOOL}) .DYNAMIC_INPUT(input_data, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, @@ -72,6 +80,9 @@ REG_OP(Assert) *x: The tensor to print, it is a dynamic_input. *Compatible with aicpu Print operator. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(Print) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, @@ -91,6 +102,9 @@ to print to. *@par Third-party framework compatibility *Compatible with tensorflow PrintV2 operator. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
*/ REG_OP(PrintV2) .INPUT(x, TensorType({DT_STRING})) diff --git a/third_party/fwkacllib/inc/ops/lookup_ops.h b/third_party/fwkacllib/inc/ops/lookup_ops.h index 4dd87a8e..bd34ab64 100644 --- a/third_party/fwkacllib/inc/ops/lookup_ops.h +++ b/third_party/fwkacllib/inc/ops/lookup_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file lookup_ops.h + * \brief + */ #ifndef GE_OP_LOOKUP_OPS_H_ #define GE_OP_LOOKUP_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index 6d1e2cd2..9ee4f6d4 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file math_ops.h + * \brief + */ #ifndef GE_OP_MATH_OPS_H_ #define GE_OP_MATH_OPS_H_ @@ -630,6 +634,44 @@ REG_OP(NLLLossGrad) .OUTPUT(x_grad, TensorType({DT_FLOAT})) .ATTR(reduction, String, "mean") .OP_END_FACTORY_REG(NLLLossGrad) + +/** +*@brief The IFMR op, which searches for the optimal quantization scale and offset of a feature map. + +*@par Inputs: +*@li data:A Tensor of feature map. +*@li data_min:A Tensor of min value of feature map. +*@li data_max:A Tensor of max value of feature map. +*@li cumsum:A Tensor of cumsum bin of data. + +*@par Attributes: +*min_percentile: min init percentile. +*max_percentile: max init percentile. +*search_range: search range. +*search_step: step size of searching. +*with_offset: whether to use the offset. + +*@par Outputs: +*scale: optimal scale. +*offset: optimal offset. + +*@par Third-party framework compatibility +*Compatible with MindSpore. +*/ + +REG_OP(IFMR) + .INPUT(data, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(data_min, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(data_max, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(cumsum, TensorType({DT_INT32})) + .OUTPUT(scale, TensorType({DT_FLOAT})) + .OUTPUT(offset, TensorType({DT_FLOAT})) + .REQUIRED_ATTR(min_percentile, Float) + .REQUIRED_ATTR(max_percentile, Float) + .REQUIRED_ATTR(search_range, ListFloat) + .REQUIRED_ATTR(search_step, Float) + .REQUIRED_ATTR(with_offset, Bool) + .OP_END_FACTORY_REG(IFMR) } // namespace ge #endif // GE_OP_MATH_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h index 7cb24ee7..de94b58e 100644 --- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file matrix_calculation_ops.h + * \brief + */ #ifndef GE_OP_MATRIX_CALCULATION_OPS_H #define GE_OP_MATRIX_CALCULATION_OPS_H diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h index 296dd63c..a120b31d 100644 --- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_batch_norm_ops.h + * \brief + */ #ifndef GE_OP_NN_BATCH_NORM_OPS_H #define GE_OP_NN_BATCH_NORM_OPS_H @@ -340,6 +344,8 @@ REG_OP(BnHost) *@li mode: An optional attr, not used *@par Outputs:\n *@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x" +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use BNInference instead.
*/ REG_OP(BNInferenceD) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h index b2cf56ad..5b84b1fb 100644 --- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_calculation_ops.h + * \brief + */ #ifndef GE_OP_NN_CALCULATION_OPS_H #define GE_OP_NN_CALCULATION_OPS_H @@ -124,6 +128,10 @@ REG_OP(DepthwiseConv2DBackpropFilter) * @par Third-party framework compatibility * @li Compatible with the TensorFlow operator DepthwiseConv2DBackpropFilter. * @li Compatible with the Caffe operator DepthwiseConv2DBackpropFilter. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use DepthwiseConv2DBackpropFilter +* instead. */ REG_OP(DepthwiseConv2DBackpropFilterD) .INPUT(input, TensorType({float16})) @@ -239,6 +247,10 @@ REG_OP(DepthwiseConv2DBackpropInput) * @par Third-party framework compatibility * @li Compatible with the TensorFlow operator DepthwiseConv2DBackpropInput. * @li Compatible with the Caffe operator DepthwiseConv2DBackpropInput. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use DepthwiseConv2DBackpropInput +* instead. */ REG_OP(DepthwiseConv2DBackpropInputD) .INPUT(filter, TensorType({DT_FLOAT16})) @@ -340,20 +352,30 @@ REG_OP(BiasAddGrad) *@brief Computes the gradients of convolution with respect to the input. *@par Inputs: * Three inputs: - * @li input_size: A Tensor of type int32. An integer vector representing the shape of input, - * where input is a 4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. - * @li filter: A Tensor. Must be one of the following types: float16, float32, float64. - * 4-D with shape [filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as filter. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li input_size: A Tensor of type int32. An integer vector representing the + * shape of input, where input is a 4-D tensor [batch, height, width, channels] + * or [batch, channels, height, width]. + * @li filter: A Tensor. Must be one of the following types: float16, float32, + * float64. 4-D with shape + * [filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_height, filter_width, in_channels] + * or [out_channels, in_channel, filter_height, filter_width]. + * @li out_backprop: A Tensor. Must have the same type as filter. + * 4-D with shape [batch, out_height, out_width, out_channels] + * or [batch, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Attributes: * Five attributes: - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1] - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". 
Specify the data format of the input and output data. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads + * on feature map + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter,and has same format as input_size *@par Third-party framework compatibility @@ -376,23 +398,35 @@ REG_OP(Conv2DBackpropInput) *@par Inputs: * Two inputs: * @li filter: A Tensor. Types is float16. - * 4-D with shape [filter_height, filter_width, in_channels, out_channels] or [out_channels, filter_height, filter_width, in_channels] + * 4-D with shape [filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_height, filter_width, in_channels] * or [out_channels, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as filter. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li out_backprop: A Tensor. Must have the same type as filter. + * 4-D with shape [batch, out_height, out_width, out_channels] + * or [batch, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Attributes: * Six attributes: - * @li input_size A Tensor of type int32. An integer vector representing the shape of input, - * where input is a 4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1] - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. + * @li input_size A Tensor of type int32. An integer vector representing the + * shape of input, where input is a 4-D tensor [batch, height, width, channels] + * or [batch, channels, height, width]. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on + * feature map + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: - * y: A Tensor. Has the same type as filter,4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. + * y: A Tensor. Has the same type as filter,4-D tensor [batch, height, width, + * channels] or [batch, channels, height, width]. 
*@par Third-party framework compatibility * Compatible with Tensorflow's conv2d_backprop_input +*@par Restrictions: + * Warning: THIS FUNCTION IS DEPRECATED. Please use Conv2DBackpropInput instead. */ REG_OP(Conv2DBackpropInputD) .INPUT(filter, TensorType({DT_FLOAT16, DT_INT8})) @@ -431,7 +465,8 @@ REG_OP(Conv2DBackpropInputD) output channels. Defaults to "1". * @li data_format: An optional string from: "NCHW". Defaults to "NCHW". \n Specify the data format of the input and output data. - * @li offset_x: An optional integer for quantized deconvolution. Defaults to "0". + * @li offset_x: An optional integer for quantized deconvolution. + * Defaults to "0". *@par Outputs: * y: A Tensor. 4D tensor with shape [batch, channels, height, width]. * When type of x is float16, the type of y must be float16. @@ -454,20 +489,30 @@ REG_OP(Deconvolution) *@brief Computes the gradients of convolution with respect to the filter *@par Inputs: * Three inputs: - * @li x: A Tensor. Must be one of the following types: float16, float32, float64. - * 4-D with shape [batch, in_height, in_width, in_channels] or [batch, in_channels, in_height, in_width]. - * @li filter_size: A Tensor of type int32. An integer vector representing the tensor shape of filter, - * where filter is a 4-D tensor [filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li x: A Tensor. Must be one of the following types: float16, float32, + * float64.4-D with shape [batch, in_height, in_width, in_channels] or + * [batch, in_channels, in_height, in_width]. + * @li filter_size: A Tensor of type int32. An integer vector representing the + * tensor shape of filter, where filter is a 4-D tensor [filter_height, + * filter_width, in_channels, out_channels] or [out_channels, filter_height, + * filter_width, in_channels] or [out_channels, in_channel, filter_height, + * filter_width]. + * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape + * [batch, out_height, out_width, out_channels] or [batch, out_channels, + * out_height, out_width]. Gradients with respect to the output of the + * convolution. *@par Attributes: * Five attributes: - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map. - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on + * feature map. + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". 
Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as x *@par Third-party framework compatibility @@ -490,23 +535,35 @@ REG_OP(Conv2DBackpropFilter) *@par Inputs: * Two inputs: * @li x: A Tensor. Type is float16. - * 4-D with shape [batch, in_height, in_width, in_channels] or [batch, in_channels, in_height, in_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * 4-D with shape [batch, in_height, in_width, in_channels] or [batch, + * in_channels, in_height, in_width]. + * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape + * [batch, out_height, out_width, out_channels] or [batch, out_channels, + * out_height, out_width]. Gradients with respect to the output of the + * convolution. *@par Attributes: * Six attributes: - * @li filter_size: A Tensor of type integers. An integer vector representing the tensor shape of filter, - * where filter is a 4-D tensor [filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. + * @li filter_size: A Tensor of type integers. An integer vector representing + * the tensor shape of filter, + * where filter is a 4-D tensor [filter_height, filter_width, in_channels, + * out_channels] or [out_channels, filter_height, filter_width, in_channels] + * or [out_channels, in_channel, filter_height, filter_width]. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on + * feature map + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Type is float32 *@par Third-party framework compatibility * Compatible with Tensorflow's conv2d_backprop_filter +*@par Restrictions: + * Warning: THIS FUNCTION IS DEPRECATED. Please use Conv2DBackpropFilter instead. */ REG_OP(Conv2DBackpropFilterD) .INPUT(x, TensorType({DT_FLOAT16})) @@ -638,25 +695,34 @@ REG_OP(Conv2DCompress) /** *@brief Computes a 3D convolution given 5D "x" and "filter" tensors. *@par Inputs: - * @li x: A 5D tensor. Must be one of the following types: float16, (Currently does not support int8). - * The format of x is NCDHW or NDHWC. - * @li filter: A 5D tensor of the same type as "x". The format is NCDHW, NDHWC or DHWCN. + * @li x: A 5D tensor. 
Must be one of the following types: float16, + * (Currently does not support int8). The format of x is NCDHW or NDHWC. + * @li filter: A 5D tensor of the same type as "x". + * (Currently does not support int8). + * The format is NCDHW, NDHWC or DHWCN. *@par Optional input: * @li bias: An optional 1D tensor of the same type as "x". * @li offset_w: An optional 1D tensor for quantized deconvolution. Reserved. *@par Required Attributes: - * @li strides: A list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li strides: A list of 5 integers. Specifies the stride of the sliding window + * for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A list of 6 integers. Supports only padding along the D, H and W dimensions in sequence of head, tail, top, bottom, left and right. + * @li pads: A list of 6 integers. + * Supports only padding along the D, H and W dimensions in sequence of head, + * tail, top, bottom, left and right. *@par Attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. - * @li dilations: A list of 5 integers. Specifies the dilation factor for each dimension of "x". + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A list of 5 integers. Specifies the dilation factor for each + * dimension of "x", now only support [1,1,1,1,1] * The N and C dimensions must be 1. Has the same format as "x". - * @li offset_x: An optional int. Input offset, used for quantized inference. Defaults to 0. + * @li offset_x: An optional int. Input offset, used for quantized inference. + * Defaults to 0. Reserved. *@par Outputs: *y: A Tensor. Has the same type as "x". @@ -687,22 +753,33 @@ REG_OP(Conv3D) *@brief Computes the gradients of convolution 3d with respect to the input. *@par Inputs: * Three inputs: - * @li input_size: A Tensor of type int32, int64. An integer vector representing the shape of input, - * where input is a 5-D tensor [batch, depth, height, width, channels] or [batch, channels, depth, height, width]. - * @li filter: A Tensor. Must be one of the following types: float16, float32, float64. - * @li out_backprop: A Tensor. Must have the same type as filter. 5-D with shape [batch, depth, out_height, out_width, out_channels] - * or [batch, out_channels, depth, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li input_size: A Tensor of type int32, int64. An integer vector representing + * the shape of input, where input is a 5-D tensor + * [batch, depth, height, width, channels] or + * [batch, channels, depth, height, width]. + * @li filter: A Tensor. Must be one of the following types: float16, float32, + * float64. + * @li out_backprop: A Tensor. Must have the same type as filter. + * 5-D with shape [batch, depth, out_height, out_width, out_channels] + * or [batch, out_channels, depth, out_height, out_width]. Gradients with + * respect to the output of the convolution. *@par Required Attributes: - * @li strides: A list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li strides: A list of 5 integers. 
Specifies the stride of the sliding window + * for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A list of 6 integers. Supports only padding along the D, H and W dimensions in sequence of head, tail, top, bottom, left and right. + * @li pads: A list of 6 integers. + * Supports only padding along the D, H and W dimensions in sequence of head, + * tail, top, bottom, left and right. *@par Attributes: * Three attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of the input, now only support [1,1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of the input, now only support [1,1,1,1,1] *@par Outputs: * y: A Tensor. Has the same type as filter, and has the same format as input_size @@ -730,22 +807,31 @@ REG_OP(Conv3DBackpropInput) * @li out_backprop: A Tensor. Must have the same type as filter. *@par Required Attributes: - * @li strides: A list of 5 integers. Specifies the stride of the sliding window for - * each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". + * @li strides: A list of 5 integers. Specifies the stride of the sliding window + * for each dimension of "x". + * The N and C dimensions must be 1. Has the same format as "x". * @li pads: A list of 6 integers. Supports only padding along the D, H and W * dimensions in sequence of head, tail, top, bottom, left and right. - * @li input_size: A tuple/list of type int32, int64. An integer vector representing the shape of input, - * where input is a 5-D tensor [batch, depth, height, width, channels] or [batch, channels, depth, height, width]. + * @li input_size: A tuple/list of type int32, int64. An integer vector + * representing the shape of input, where input is a 5-D tensor + * [batch, depth, height, width, channels] or + * [batch, channels, depth, height, width]. *@par Attributes: * Three attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1] *@par Outputs: * y: A Tensor. Has the same type as filter *@par Third-party framework compatibility * Compatible with Tensorflow's conv3d_backprop_input + +*@par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DBackpropInput instead. 
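+ +*@par Example: +* An illustrative setting (values assumed, not prescribed by the original doc): +* with "x" in NDHWC format, strides = [1, 1, 2, 2, 1] moves the sliding window by +* 2 along H and W only, and pads = [0, 0, 1, 1, 1, 1] pads D by 0/0 (head/tail), +* H by 1/1 (top/bottom) and W by 1/1 (left/right).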
*/ REG_OP(Conv3DBackpropInputD) .INPUT(filter, TensorType({DT_FLOAT16})) @@ -760,7 +846,7 @@ REG_OP(Conv3DBackpropInputD) .OP_END_FACTORY_REG(Conv3DBackpropInputD) /** -*@brief Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.. +*@brief Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. *@par Inputs: * @li x: A Tensor dtype of float16. @@ -806,24 +892,35 @@ REG_OP(LSTM) *@brief Computes the gradients of convolution3D with respect to the filter *@par Inputs: * Three inputs: - * @li x: A Tensor. Must be one of the following types: float16, float32, double. - * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] or [batch, in_depth, in_channels, in_height, in_width]. - * @li filter_size: A Tensor of type int32. An integer vector representing the tensor shape of filter, - * where filter is a 5-D tensor [filter_depth, filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_depth, filter_height, filter_width, in_channels] or [out_channels, filter_depth, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 5-D with shape [batch, out_depth, out_height, out_width, out_channels] - * or [batch, out_depth, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li x: A Tensor. Must be one of the following types: float16, float32, + * double. + * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] + * or [batch, in_depth, in_channels, in_height, in_width]. + * @li filter_size: A Tensor of type int32. An integer vector representing the + * tensor shape of filter, where filter is a 5-D tensor + * [filter_depth, filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_depth, filter_height, filter_width, in_channels] + * or [out_channels, filter_depth, in_channel, filter_height, filter_width]. + * @li out_backprop: A Tensor. Must have the same type as x. + * 5-D with shape [batch, out_depth, out_height, out_width, out_channels] + * or [batch, out_depth, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Required Attributes: - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for - * each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] pads on feature map. + * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding + * window for each dimension of "x". The N and C dimensions must be 1. + * Has the same format as "x". + * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] + * pads on feature map. *@par Attributes: * Three attributes: - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". 
Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as x @@ -847,28 +944,40 @@ REG_OP(Conv3DBackpropFilter) *@par Inputs: * Two inputs: * @li x: A Tensor of type float16. - * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] or [batch, in_depth, in_channels, in_height, in_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 5-D with shape [batch, out_depth, out_height, out_width, out_channels] - * or [batch, out_depth, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] + * or [batch, in_depth, in_channels, in_height, in_width]. + * @li out_backprop: A Tensor. Must have the same type as x. + * 5-D with shape [batch, out_depth, out_height, out_width, out_channels] + * or [batch, out_depth, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Required Attributes: - * @li filter_size: A tuple/list of type integers. An integer vector representing the tensor shape of filter, - * where filter is a 5-D tensor [filter_depth, filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_depth, filter_height, filter_width, in_channels] or [out_channels, filter_depth, in_channel, filter_height, filter_width]. - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li filter_size: A tuple/list of type integers. An integer vector + * representing the tensor shape of filter, where filter is a 5-D tensor + * [filter_depth, filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_depth, filter_height, filter_width, in_channels] + * or [out_channels, filter_depth, in_channel, filter_height, filter_width]. + * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding + * window for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] pads on feature map + * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] + * pads on feature map *@par Attributes: * Three attributes: - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. *@par Outputs: - * y: A Tensor. Has the same type as x + * y: A Tensor of type float32 *@par Third-party framework compatibility * Compatible with Tensorflow's conv3d_backprop_filter +*@par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DBackpropFilter instead. */ @@ -888,27 +997,32 @@ REG_OP(Conv3DBackpropFilterD) *@brief Computes the transpose of convolution 3d with respect to the input. *@par Inputs: * Three inputs: - * @li input_size: A Tensor of type int32. 
An integer vector representing the shape of input + * @li input_size: A Tensor of type int32. An integer vector representing the + * shape of input * @li x: A Tensor of type float16, currently does not support int8 - * @li filter: A Tensor of type float16. + * @li filter: A Tensor of type float16, currently does not support int8 *@par Optional input: * Two optional inputs - * @li bias: An optional 1D tensor of the same type as "x". + * @li bias: An optional 1D tensor of the same type as "x". Reserved. * @li offset_w: An optional 1D tensor for quantized deconvolution. Reserved. *@par Required Attributes: - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding + * window for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". * @li pads: A tuple/list of 6 integers *@par Attributes: * Five attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li dilations: A tuple/list of 5 integers, + * The dilation factor for each dimension of input, now only support [1,1,1,1,1] + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. * @li output_padding: The size will be added in the output shape. - * @li offset_x: Input offset_x value + * @li offset_x: Input offset_x value. Reserved. *@par Outputs: * y: A Tensor. Has the same type as filter */ @@ -931,28 +1045,35 @@ REG_OP(Conv3DTranspose) /** *@brief Computes the transpose of convolution 3d with respect to the input. *@par Inputs: - * @li x: A Tensor of type float16. - * @li filter: A Tensor of type float16. + * @li x: A Tensor of type float16, currently does not support int8 + * @li filter: A Tensor of type float16, currently does not support int8 *@par Optional inputs: - * @li bias: An optional 1D tensor of the same type as "x". + * @li bias: An optional 1D tensor of the same type as "x". Reserved. * @li offset_w: An optional 1D tensor for quantized deconvolution. Reserved. *@par Required Attributes: - * @li input_size: A tuple/list of type int32. An integer vector representing the shape of input - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li input_size: A tuple/list of type int32. + * An integer vector representing the shape of input + * @li strides: A tuple/list of 5 integers. + * Specifies the stride of the sliding window for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". * @li pads: A tuple/list of 6 integers. *@par Attributes: * Five attributes: - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. 
+ * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. * @li output_padding: The size will be added in the output shape. - * @li offset_x: Input offset_x value + * @li offset_x: Input offset_x value. Reserved. *@par Outputs: * y: A Tensor. Has the same type as filter +*@par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DTranspose instead. */ REG_OP(Conv3DTransposeD) .INPUT(x, TensorType({DT_FLOAT16})) @@ -974,23 +1095,29 @@ REG_OP(Conv3DTransposeD) *@brief Computes the transpose of convolution 2d with respect to the input. *@par Inputs: * Five inputs: - * @li input_size: A Tensor of type int32 or int64. An integer vector representing - * the shape of input. - * @li x: A Tensor of type float16, int8. + * @li input_size: A Tensor of type int32 or int64. An integer vector + * representing the shape of input, where input is a 4-D tensor + * [batch, height, width, channels] or [batch, channels, height, width]. + * @li x: A Tensor of type float16, int8. 4-D with shape [batch, out_height, + * out_width, out_channels] or [batch, out_channels, out_height, out_width]. * @li filter: A Tensor of type float16, int8. Must have the same type as "x". - * @li bias: An optional 1D tensor of the same type as "x". + * 4-D with shape [filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_height, filter_width, in_channels] + * or [out_channels, in_channel, filter_height, filter_width]. + * @li bias: An optional 1D tensor of type float16 or int32. Format is "ND". * @li offset_w: An optional 1D tensor for quantized inference. Reserved. *@par Required Attributes: - * @li strides: A required list or tuple. The stride of the sliding window for - * height and width for H/W dimension. - * @li pads: A required list or tuple of int32. Padding added to each dimension - * of the input. + * @li strides: A required tuple/list of 4 integers. The stride of the sliding + * window for H/W dimension. The index of H/W is same as data_format. + * @li pads: A required tuple/list of 4 integers, [top, bottom, left, right] + * pads on feature map. *@par Attributes: * Five attributes: - * @li groups: Number of blocked connections from input channels to output channels. + * @li groups: Number of blocked connections from input channels to output + * channels. * Defaults to "1". - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension - * of input. Must be [1, 1, 1, 1]. + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input. Must be [1, 1, 1, 1]. * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". * Specify the data format of the input and output data. * @li output_padding: The size will be added in the output shape. Defaults @@ -998,7 +1125,8 @@ REG_OP(Conv3DTransposeD) * @li offset_x: An optional int. Input offset, used for quantized inference. * Defaults to "0". *@par Outputs: - * y: A Tensor. Has the same type as "filter". + * y: A Tensor of type float16 or int32, and has the same format as + * input_size. */ REG_OP(Conv2DTranspose) .INPUT(input_size, TensorType({DT_INT32, DT_INT64})) @@ -1045,6 +1173,8 @@ REG_OP(Conv2DTranspose) * Defaults to "0". 
*@par Outputs: * y: A Tensor. Has the same type as "filter". +*@par Restrictions: + * Warning: THIS FUNCTION IS DEPRECATED. Please use Conv2DTranspose instead. */ REG_OP(Conv2DTransposeD) .INPUT(x, TensorType({DT_FLOAT16, DT_INT8})) diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h index 9a17cd0d..38612463 100644 --- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_detect_ops.h + * \brief + */ #ifndef GE_OP_NN_DETECT_OPS_H_ #define GE_OP_NN_DETECT_OPS_H_ @@ -293,6 +297,8 @@ REG_OP(ROIAlign) *@see SSDDetectionOutput() *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use PriorBox instead. */ REG_OP(PriorBoxD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) @@ -314,6 +320,55 @@ REG_OP(ROIAlign) .ATTR(variance, ListFloat, {0.1}) .OP_END_FACTORY_REG(PriorBoxD); +/** +*@brief Performs SSD prior box detection, with the four additional matrices and the "aspect_ratio" attribute removed compared to PriorBox. + +*@par Inputs: +* Three inputs, including: +*@li x: An NC1HWC0 or NCHW feature map of type float32 or float16. +*@li img: Source image. Has the same type and format as "x". +*@li boxes: An ND tensor of type float32 or float16, specifying the prior box information. Same as output "y". + +*@par Attributes: +*@li min_size: A required list of float32 values, specifying the minimum edge length of a square prior box. +*@li max_size: A required list of float32 values, specifying the maximum edge length of a square prior box: sqrt(min_size * max_size) +*@li img_h: An optional int32, specifying the height of the source image. +*@li img_w: An optional int32, specifying the width of the source image. +*@li step_h: An optional float32, specifying the height step for mapping the center point from the feature map to the source image. +*@li step_w: An optional float32, specifying the width step for mapping the center point from the feature map to the source image. +*@li flip: An optional bool. If "True", "aspect_ratio" will be flipped. Defaults to "True". +*@li clip: An optional bool. If "True", a prior box is clipped to within [0, 1]. Defaults to "False". +*@li offset: An optional float32, specifying the offset. Defaults to "0.5". +*@li variance: An optional list of float32 values, specifying the variance of a prior box, with either one or four variances. Defaults to "0.1" (one value). + +*@par Outputs: +*y: An ND tensor of type float32 or float16, specifying the prior box information, including its coordinates and variance. + +*@attention Constraints:\n +* This operator applies only to SSD networks. +*@see SSDDetectionOutput() +*@par Third-party framework compatibility +* It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use PriorBox instead. 
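+ +*@par Example: +* An illustrative setting (values assumed): variance = {0.1, 0.1, 0.2, 0.2} supplies +* one variance per box coordinate, while the default {0.1} applies a single variance +* to all four coordinates.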
+*/ + REG_OP(PriorBoxDV2) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .REQUIRED_ATTR(min_size, ListFloat) + .REQUIRED_ATTR(max_size, ListFloat) + .ATTR(img_h, Int, 0) + .ATTR(img_w, Int, 0) + .ATTR(step_h, Float, 0.0) + .ATTR(step_w, Float, 0.0) + .ATTR(flip, Bool, true) + .ATTR(clip, Bool, false) + .ATTR(offset, Float, 0.5) + .ATTR(variance, ListFloat, {0.1}) + .OP_END_FACTORY_REG(PriorBoxDV2); + /** *@brief Performs Position Sensitive ROI Pooling. @@ -574,6 +629,8 @@ and the actual image height and width. *@see Yolo() *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV2DetectionOutput instead. */ REG_OP(YoloV2DetectionOutputD) .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) @@ -700,6 +757,8 @@ and the actual image height and width. *@see Yolo() *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV3DetectionOutput instead. */ REG_OP(YoloV3DetectionOutputD) .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) @@ -926,12 +985,17 @@ REG_OP(ClipBoxes) /** *@brief Computes ClipBoxesD function. +*@par Attributes: +*img_size: A Tensor of shape [H, W]. + *@par Inputs: -*@li boxes_input: A Tensor. Must be float16. N-D with shape [N, 4]. -*@li img_size: A Tensor. Must be int32. shape [H, W]. +*boxes_input: A Tensor. Must be float16. N-D with shape [N, 4]. *@par Outputs: *boxes_output: A Tensor. Must have the same type as boxes_input. N-D with shape [N, 4]. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(ClipBoxesD) .INPUT(boxes_input, TensorType({DT_FLOAT16})) @@ -1032,6 +1096,11 @@ REG_OP(RpnProposals) * @par Third-party framework compatibility * Compatible with the pytorch operator RPNProposals. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*Warning: THIS FUNCTION IS DEPRECATED. Please use RpnProposals instead. */ REG_OP(RpnProposalsD) .INPUT(rois, TensorType({DT_FLOAT16})) diff --git a/third_party/fwkacllib/inc/ops/nn_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_norm_ops.h index 52e7702c..05470e88 100644 --- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_norm_ops.h + * \brief + */ #ifndef GE_OP_NN_NORM_OPS_H #define GE_OP_NN_NORM_OPS_H @@ -342,9 +346,9 @@ REG_OP(ConfusionSoftmaxGrad) *@brief Function softmax gradients ext. *@par Inputs: -* @li grad: A Tensor dtype of float16. +* @li grad: A Tensor dtype of float16, float32. * @li x1: A Tensor dtype of float16, float32. -* @li x2: A Tensor dtype of float16. +* @li x2: A Tensor dtype of float16, float32. *@par Attributes: *@li axis: An int scalar. The axis for reduce. diff --git a/third_party/fwkacllib/inc/ops/nn_ops.h b/third_party/fwkacllib/inc/ops/nn_ops.h index 7637da07..ea4a5ba3 100644 --- a/third_party/fwkacllib/inc/ops/nn_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file nn_ops.h + * \brief + */ #ifndef GE_OP_NN_OPS_H_ #define GE_OP_NN_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h index a7d4c6e3..4878935f 100644 --- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_pooling_ops.h + * \brief + */ #ifndef GE_OP_NN_POOLING_OPS_H #define GE_OP_NN_POOLING_OPS_H @@ -223,8 +227,7 @@ REG_OP(MaxPool) *@brief Performs max 3d pooling on the input. *@par Inputs: -*x: An NC1HWC0 Tensor. Supported type:float16, float32, double, int8, int16, \n -int32, int64, uint8, uint16, qint8 +*x: An NC1HWC0 Tensor. Supported types: float16, float32, double. *@par Attributes: *@li ksize: A required list of int8, int16, int32, or int64 values, \n specifying the size of the window for each dimension of the input tensor. \n No default value. *@li strides: A required list of int8, int16, int32, or int64 values, \n specifying the stride of the sliding window for each dimension of \n the input tensor. No default value. -*@li padding: A required string. No default value. -*@li pads: A list type of int32. Default value {0, 0, 0, 0, 0, 0}. -*@li dilation: A list type of int32. Default value {0,0,0}. +*@li padding: An optional string. Default value "SAME". +*@li pads: A list type of int32. Default value {0, 0, 0}. +*@li dilation: A list type of int32. Default value {1, 1, 1}. *@li ceil_mode: A ceil mode number of int32. Default value 0. -*@li data_format: An optional string. Defaults to "NHWC". +*@li data_format: An optional string. Defaults to "NDHWC". *@par Outputs: *y: A Tensor. Has the same type and format as input "x". @@ -635,6 +638,9 @@ REG_OP(AvgPoolGrad) * @par Outputs: * @out_grad: A mutable tensor with the same shape and type as "orig_input". +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use AvgPoolGrad instead. */ REG_OP(AvgPoolGradD) .INPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) @@ -956,6 +962,9 @@ REG_OP(AvgPool1D) *@par Third-party framework compatibility *@li compatible with pytorch AvgPool1D operator. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use AvgPool1D instead. */ REG_OP(AvgPool1DD) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) diff --git a/third_party/fwkacllib/inc/ops/nn_training_ops.h b/third_party/fwkacllib/inc/ops/nn_training_ops.h index 0ecaf9a3..0dab8606 100644 --- a/third_party/fwkacllib/inc/ops/nn_training_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_training_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_training_ops.h + * \brief + */ #ifndef GE_OP_TRAINING_OPS_H #define GE_OP_TRAINING_OPS_H @@ -178,6 +182,9 @@ REG_OP(SparseApplyAdagrad) *@par Third-party framework compatibility * Compatible with the TensorFlow operator SparseApplyAdagrad. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyAdagrad instead. */ REG_OP(SparseApplyAdagradD) .INPUT(var, TensorType({DT_FLOAT})) @@ -247,6 +254,9 @@ REG_OP(SparseApplyAdagradV2) *@par Third-party framework compatibility *Compatible with the TensorFlow operator SparseApplyAdagradV2. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyAdagradV2 instead. 
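+ +*@par Example: +* A sketch of the underlying update, assuming TensorFlow-compatible AdagradV2 +* semantics: for each row i referenced by "indices", +* accum[i] += grad[i] * grad[i] and +* var[i] -= lr * grad[i] / (sqrt(accum[i]) + epsilon).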
*/ REG_OP(SparseApplyAdagradV2D) .INPUT(var, TensorType({DT_FLOAT})) @@ -440,6 +450,8 @@ REG_OP(ApplyKerasMomentum) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ResourceApplyKerasMomentum. * +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyKerasMomentum instead. */ REG_OP(ApplyKerasMomentumD) .INPUT(var, TensorType::NumberType()) @@ -500,6 +512,9 @@ REG_OP(ApplyKerasMomentumD) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ResourceApplyKerasMomentum. * +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdamWithAmsgrad instead. +* */ REG_OP(ApplyAdamWithAmsgradD) .INPUT(var, TensorType::NumberType()) @@ -1113,6 +1128,8 @@ REG_OP(ApplyAdagradV2) * @par Third-party framework compatibility * Compatible with the TensorFlow operator ApplyAdagrad. * +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdagradV2 instead. */ REG_OP(ApplyAdagradV2D) .INPUT(var, TensorType::NumberType()) @@ -1389,6 +1406,9 @@ REG_OP(ApplyRMSProp) * * @par Third-party framework compatibility * @li Compatible with the TensorFlow operator ApplyRMSProp. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyRMSProp instead. */ REG_OP(ApplyRMSPropD) .INPUT(var, TensorType::NumberType()) @@ -2184,6 +2204,9 @@ REG_OP(SparseApplyFtrl) * @par Third-party framework compatibility * Compatible with the TensorFlow operator SparseApplyFtrl. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyFtrl instead. */ REG_OP(SparseApplyFtrlD) .INPUT(var, TensorType({DT_FLOAT})) @@ -2281,6 +2304,9 @@ REG_OP(SparseApplyFtrlV2) * @par Third-party framework compatibility * Compatible with the TensorFlow operator SparseApplyFtrlV2D. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyFtrlV2 instead. */ REG_OP(SparseApplyFtrlV2D) .INPUT(var, TensorType({DT_FLOAT})) @@ -2389,6 +2415,9 @@ REG_OP(SparseApplyRMSProp) * @li Note that in this sparse implementation, "ms" and "mom" will not update * in iterations during which "grad" is 0. * @li The input tensors "var", "ms" and "mom" must have the same shape. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyRMSProp instead. */ REG_OP(SparseApplyRMSPropD) .INPUT(var, TensorType::NumberType()) @@ -2492,6 +2521,9 @@ REG_OP(SparseApplyAdadelta) * @li Note that in this sparse implementation, "accum" and "accum_update" will not update * in iterations during which "grad" is 0. * @li The input tensors "var", "accum" and "accum_update" must have the same shape. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyAdadelta instead. */ REG_OP(SparseApplyAdadeltaD) .INPUT(var, TensorType::NumberType()) diff --git a/third_party/fwkacllib/inc/ops/no_op.h b/third_party/fwkacllib/inc/ops/no_op.h index 61e187c4..9cde8a0f 100644 --- a/third_party/fwkacllib/inc/ops/no_op.h +++ b/third_party/fwkacllib/inc/ops/no_op.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file no_op.h + * \brief + */ #ifndef GE_NO_OP_H_ #define GE_NO_OP_H_ diff --git a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h index 310325c8..d265d4e5 100644 --- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h +++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file nonlinear_fuc_ops.h + * \brief + */ #ifndef GE_OP_NONLINEAR_FUC_OPS_H #define GE_OP_NONLINEAR_FUC_OPS_H @@ -58,6 +62,43 @@ REG_OP(GeluGrad) .OUTPUT(z, TensorType({DT_FLOAT16, DT_FLOAT})) .OP_END_FACTORY_REG(GeluGrad) +/** +*@brief Computes the fast_gelu of "x". + +*@par Inputs: +*One input, including: +* @li x: A Tensor. Must be one of the following types: float16, float32 + +*@par Outputs: +*y: A Tensor. Has the same type as "x". +*@par Third-party framework compatibility +*Compatible with the TensorFlow operator FastGelu +*/ +REG_OP(FastGelu) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .OP_END_FACTORY_REG(FastGelu) + +/** +*@brief Computes the gradient for the fast_gelu of "x". + +*@par Inputs: +*Two inputs, including: +* @li dy: A Tensor. Must be one of the following types: float16, float32 +* @li x: A Tensor of the same type as "dy". + +*@par Outputs: +*z: A Tensor. Has the same type as "dy". +*@par Third-party framework compatibility +*Compatible with the TensorFlow operator FastGeluGrad +*/ +REG_OP(FastGeluGrad) + .INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(z, TensorType({DT_FLOAT16, DT_FLOAT})) + .OP_END_FACTORY_REG(FastGeluGrad) + + /** *@brief Computes the gradient for the tanh of "x". @@ -153,6 +194,9 @@ REG_OP(Relu6) * @par Third-party framework compatibility * Compatible with the TensorFlow operator Relu6. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Relu6 instead. */ REG_OP(Relu6D) .INPUT(x, TensorType::RealNumberType()) @@ -535,14 +579,17 @@ REG_OP(LeakyReluGrad) *@brief Computes the threshold gradient of each element of the input Tensor. *@par Inputs: -* @li gradients: A Tensor shape and dtype of input gradients. Support float16, float32, int8, uint8, int32. -* @li features: A Tensor shape and dtype of input features. Support float16, float32, int8, uint8, int32. +* @li gradients: A Tensor shape and dtype of input gradients. Support float16, int32. +* @li features: A Tensor shape and dtype of input features. Support float16, int32. *@par Attributes: *threshold: A float32 scalar value to threshold at. *@par Outputs: *backprops: A Tensor of shape and dtype of output backprops, should be same shape and type as inputs. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(ThresholdGradV2D) .INPUT(gradients, TensorType({DT_INT32, DT_FLOAT16})) @@ -555,7 +602,7 @@ REG_OP(ThresholdGradV2D) *@brief Thresholds each element of the input Tensor y = (x > threshold) ? x : value. *@par Inputs: -*x: A Tensor dtype of float16, float32, int8, uint8, int32. +*x: A Tensor dtype of real number. *@par Attributes: *@li threshold: A float32 scalar value to threshold at. @@ -563,6 +610,9 @@ REG_OP(ThresholdGradV2D) *@par Outputs: *y: A Tensor of shape and dtype of output, should be same shape and type as input. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(ThresholdV2D) .INPUT(x, TensorType::RealNumberType()) diff --git a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h index 8e9e1638..91aff6ba 100644 --- a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h +++ b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h @@ -14,6 +14,11 @@ * limitations under the License. */ +/*! 
+ * \file npu_loss_scale_ops.h + * \brief + */ + #ifndef GE_OP_NN_LOSS_SCALE_OPS_H #define GE_OP_NN_LOSS_SCALE_OPS_H #include "graph/operator_reg.h" diff --git a/third_party/fwkacllib/inc/ops/outfeed_ops.h b/third_party/fwkacllib/inc/ops/outfeed_ops.h index af27140a..139e4880 100644 --- a/third_party/fwkacllib/inc/ops/outfeed_ops.h +++ b/third_party/fwkacllib/inc/ops/outfeed_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file outfeed_ops.h + * \brief + */ #ifndef GE_OP_OUTFEED_OPS_H #define GE_OP_OUTFEED_OPS_H diff --git a/third_party/fwkacllib/inc/ops/pad_ops.h b/third_party/fwkacllib/inc/ops/pad_ops.h index f7153936..6a0492f6 100644 --- a/third_party/fwkacllib/inc/ops/pad_ops.h +++ b/third_party/fwkacllib/inc/ops/pad_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file pad_ops.h + * \brief + */ #ifndef GE_OP_PAD_OPS_H #define GE_OP_PAD_OPS_H diff --git a/third_party/fwkacllib/inc/ops/parsing_ops.h b/third_party/fwkacllib/inc/ops/parsing_ops.h index a8a3e7a1..e73a69fe 100644 --- a/third_party/fwkacllib/inc/ops/parsing_ops.h +++ b/third_party/fwkacllib/inc/ops/parsing_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file parsing_ops.h + * \brief + */ #ifndef GE_OP_PARSING_OPS_H #define GE_OP_PARSING_OPS_H diff --git a/third_party/fwkacllib/inc/ops/quantize_ops.h b/third_party/fwkacllib/inc/ops/quantize_ops.h index 4cb80cea..772f9edb 100644 --- a/third_party/fwkacllib/inc/ops/quantize_ops.h +++ b/third_party/fwkacllib/inc/ops/quantize_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file quantize_ops.h + * \brief + */ #ifndef GE_OP_QUANTIZE_OPS_H #define GE_OP_QUANTIZE_OPS_H #include "graph/operator_reg.h" @@ -143,14 +147,14 @@ REG_OP(AscendAntiQuant) *@par Inputs: *@li x0: An NC1HWC0 tensor of type int32, specifying the input. -*@li deq_scale: An NC1HWC0 tensor of type float16 or uint64, specifying the scaling ratio. +*@li deq_scale: An NC1HWC0 tensor of type uint64, specifying the scaling ratio. *@li x1: An NC1HWC0 tensor of type int16, specifying the input. *@par Attributes: *relu_flag: An optional bool, specifying whether to perform ReLU, either "True" or "False". Defaults to "False". *@par Outputs: -*y: The dequantized output tensor of type float16 or float32 and with format NC1HWC0. +*y: The dequantized output tensor of type int16 and with format NC1HWC0. *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. diff --git a/third_party/fwkacllib/inc/ops/ragged_array_ops.h b/third_party/fwkacllib/inc/ops/ragged_array_ops.h index 2b8bba5f..d0f2b1c5 100644 --- a/third_party/fwkacllib/inc/ops/ragged_array_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_array_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file ragged_array_ops.h + * \brief + */ #ifndef GE_OP_RAGGED_ARRAY_OPS_H #define GE_OP_RAGGED_ARRAY_OPS_H diff --git a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h index 82fd84b7..a95884a8 100644 --- a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file ragged_conversion_ops.h + * \brief + */ #ifndef GE_OP_RAGGED_CONVERSION_OPS_H #define GE_OP_RAGGED_CONVERSION_OPS_H #include "graph/operator_reg.h" diff --git a/third_party/fwkacllib/inc/ops/ragged_math_ops.h b/third_party/fwkacllib/inc/ops/ragged_math_ops.h index e56c35a5..5acdb7f6 100644 --- a/third_party/fwkacllib/inc/ops/ragged_math_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_math_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file ragged_math_ops.h + * \brief + */ #ifndef GE_OP_RAGGED_MATH_OPS_H #define GE_OP_RAGGED_MATH_OPS_H diff --git a/third_party/fwkacllib/inc/ops/random_ops.h b/third_party/fwkacllib/inc/ops/random_ops.h index a35e8b3a..8c95ea64 100644 --- a/third_party/fwkacllib/inc/ops/random_ops.h +++ b/third_party/fwkacllib/inc/ops/random_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file random_ops.h + * \brief + */ #ifndef GE_OP_RANDOM_OPS_H_ #define GE_OP_RANDOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h index d3dfefe1..a7f8a178 100644 --- a/third_party/fwkacllib/inc/ops/reduce_ops.h +++ b/third_party/fwkacllib/inc/ops/reduce_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file reduce_ops.h + * \brief + */ #ifndef GE_OP_REDUCE_OPS_H #define GE_OP_REDUCE_OPS_H @@ -208,7 +212,7 @@ REG_OP(BNTrainingUpdateV2) /** *@brief Performs reduced batch normalization v3. For some scenes that don't contain -assignmoving average. +assign moving average. *@par Inputs: * Five inputs, including: (NC1HWC0 supported) @@ -222,7 +226,6 @@ assignmoving average. *epsilon: A required float32, specifying the small value added to variance to avoid dividing by zero. *@par Outputs: -* Three outputs, including: (NC1HWC0 supported) *@li y: A 5D Tensor of type float16 or float32, for normalized "x". *@li batch_mean: A 5D Tensor of type float32, for the mean of "x". *@li batch_variance: A 5D Tensor of type float32, for the variance of "x". diff --git a/third_party/fwkacllib/inc/ops/resource_variable_ops.h b/third_party/fwkacllib/inc/ops/resource_variable_ops.h index 04aadf40..a4d54088 100644 --- a/third_party/fwkacllib/inc/ops/resource_variable_ops.h +++ b/third_party/fwkacllib/inc/ops/resource_variable_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file resource_variable_ops.h + * \brief + */ #ifndef GE_OP_RESOURCE_VARIABLE_OPS_H #define GE_OP_RESOURCE_VARIABLE_OPS_H diff --git a/third_party/fwkacllib/inc/ops/rnn.h b/third_party/fwkacllib/inc/ops/rnn.h index ebc59a34..ee19865f 100644 --- a/third_party/fwkacllib/inc/ops/rnn.h +++ b/third_party/fwkacllib/inc/ops/rnn.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file rnn.h + * \brief + */ #ifndef GE_OP_RNN_H #define GE_OP_RNN_H @@ -85,6 +89,76 @@ REG_OP(DynamicLSTM) .OUTPUT(output_h, TensorType({DT_FLOAT32})) .OP_END_FACTORY_REG(DynamicLSTM) +/** +*@brief: DynamicRNN calculation. +*@par Inputs: +*ten inputs: \n +*@li x:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li w:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li b:A 1D Tensor. Must be one of the following types: float16, float32. The format must be ND. +*@li seq_length:A 1D Tensor. Must be one of the following types: int32. The format must be ND. +*@li init_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li init_c:A 4D Tensor. 
Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li wci:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li wcf:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li wco:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li mask:A 1D Tensor. Must be one of the following types: uint8. The format must be ND. + +*@par Attributes: +*@li cell_type:A string identifying the cell type in the op. Defaults to "LSTM". Only LSTM is currently supported. +*@li direction:A string identifying the direction in the op. Defaults to "UNIDIRECTIONAL". Only UNIDIRECTIONAL is currently supported. +*@li cell_depth:An integer identifying the cell depth in the op. Defaults to 1. +*@li use_peephole:A bool identifying whether to use peephole in the op. Defaults to false. +*@li keep_prob:A float identifying the keep prob in the op. Defaults to 1. +*@li cell_clip:A float identifying the cell clip in the op. Defaults to -1. +*@li num_proj:An integer identifying the num projection in the op. Defaults to 0. +*@li time_major:A bool identifying the time major in the op. Defaults to false. +*@li activation:A string identifying the type of activation function in the op. Defaults to "tanh". Only tanh is currently supported. +*@li forget_bias:A float identifying the forget bias in the op. Defaults to 0. +*@li is_training:A bool identifying whether the op is in training mode. Defaults to true. + +*@par Outputs: +*eight outputs: \n +*@li y:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li output_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li output_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li i:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li j:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li f:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li o:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li tanhc:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. 
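+ +*@par Example: +* A minimal construction sketch, assuming the generated ge::op::DynamicRNN wrapper +* and illustrative tensor handles x, w and b: +* auto rnn = ge::op::DynamicRNN("dynamic_rnn"); +* rnn.set_input_x(x).set_input_w(w).set_input_b(b); +* rnn.set_attr_time_major(true);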
+*/ +REG_OP(DynamicRNN) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(seq_length, TensorType({DT_UINT32})) + .OPTIONAL_INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(init_c, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(wci, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(wcf, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(wco, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(mask, TensorType({DT_UINT8})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(output_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(output_c, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(i, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(j, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(f, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(o, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(tanhc, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(cell_type, String, "LSTM") + .ATTR(direction, String, "UNIDIRECTIONAL") + .ATTR(cell_depth, Int, 1) + .ATTR(use_peephole, Bool, false) + .ATTR(keep_prob, Float, 1.0) + .ATTR(cell_clip, Float, -1.0) + .ATTR(num_proj, Int, 0) + .ATTR(time_major, Bool, false) + .ATTR(forget_bias, Float, 0.0) + .ATTR(is_training, Bool, true) + .OP_END_FACTORY_REG(DynamicRNN) + /** *@brief: Basic LSTM Cell backward calculation. Calculates the gradient of input and hidden state. *@par Inputs: diff --git a/third_party/fwkacllib/inc/ops/rpn_ops.h b/third_party/fwkacllib/inc/ops/rpn_ops.h index 252bfdb0..1484e95e 100644 --- a/third_party/fwkacllib/inc/ops/rpn_ops.h +++ b/third_party/fwkacllib/inc/ops/rpn_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file rpn_ops.h + * \brief + */ #ifndef GE_OP_RPN_OPS_H #define GE_OP_RPN_OPS_H diff --git a/third_party/fwkacllib/inc/ops/save_ops.h b/third_party/fwkacllib/inc/ops/save_ops.h index a3b9d397..7fd853d3 100644 --- a/third_party/fwkacllib/inc/ops/save_ops.h +++ b/third_party/fwkacllib/inc/ops/save_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file save_ops.h + * \brief + */ #ifndef GE_OP_SAVE_OPS_H_ #define GE_OP_SAVE_OPS_H_ @@ -21,6 +25,13 @@ namespace ge { +/** +*@brief Mark which tensors need to be saved to the ckpt file. +*@par Inputs: +*tensors: A list of input tensors. +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Save) .DYNAMIC_INPUT(tensors, TensorType::ALL()) .OP_END_FACTORY_REG(Save) @@ -28,4 +39,4 @@ REG_OP(Save) } // namespace ge -#endif // GE_OP_SAVE_OPS_H_ \ No newline at end of file +#endif // GE_OP_SAVE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/sdca_ops.h b/third_party/fwkacllib/inc/ops/sdca_ops.h index 2cbafc3c..712fc1fc 100644 --- a/third_party/fwkacllib/inc/ops/sdca_ops.h +++ b/third_party/fwkacllib/inc/ops/sdca_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file sdca_ops.h + * \brief + */ #ifndef GE_OP_SDCA_OPS_H #define GE_OP_SDCA_OPS_H diff --git a/third_party/fwkacllib/inc/ops/selection_ops.h b/third_party/fwkacllib/inc/ops/selection_ops.h index 47cf4a47..1328ae52 100644 --- a/third_party/fwkacllib/inc/ops/selection_ops.h +++ b/third_party/fwkacllib/inc/ops/selection_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file selection_ops.h + * \brief + */ #ifndef GE_OP_SELECTION_OPS_H #define GE_OP_SELECTION_OPS_H #include "graph/operator_reg.h" @@ -125,6 +129,8 @@ REG_OP(Tile) *@par Third-party framework compatibility *Compatible with the TensorFlow operator Tile. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Tile instead. */ REG_OP(TileD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) @@ -540,6 +546,8 @@ REG_OP(ReverseV2) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ReverseV2. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ReverseV2 instead. */ REG_OP(ReverseV2D) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, @@ -755,6 +763,8 @@ REG_OP(Slice) *@par Outputs: *y: A Tensor. Has the same type as "x". The slice extracted from the tensor. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Slice instead. */ REG_OP(SliceD) .INPUT(x, TensorType::BasicType()) @@ -915,6 +925,9 @@ REG_OP(ScatterNdD) * @par Third-party framework compatibility * Compatible with the TensorFlow operator InTopK. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use InTopK instead. */ REG_OP(InTopKD) .INPUT(x1, TensorType({DT_FLOAT})) @@ -1027,6 +1040,9 @@ REG_OP(StridedSliceAssign) * "value" shape must be exactly the shape produced by the slice of "var". * @see StridedSlice() +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use StridedSliceAssign instead. */ REG_OP(StridedSliceAssignD) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT16})) @@ -1407,7 +1423,10 @@ REG_OP(UnsortedSegmentMin) * @par Outputs: * y: A Tensor. Must have the same type as input "x". -* @see UnsortedSegmentProdD(), +* @see UnsortedSegmentProdD(), UnsortedSegmentSumD(), +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentMin instead. */ REG_OP(UnsortedSegmentMinD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16})) @@ -1457,6 +1476,9 @@ REG_OP(UnsortedSegmentMax) * y: A Tensor. Must have the same type as input "x". * @see UnsortedSegmentProdD(), +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentMax instead. */ REG_OP(UnsortedSegmentMaxD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16})) @@ -1505,6 +1527,9 @@ REG_OP(UnsortedSegmentProd) * y: A Tensor. Must have the same type as input "x". * @see UnsortedSegmentMinD() +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentProd instead. */ REG_OP(UnsortedSegmentProdD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16})) @@ -1580,6 +1605,8 @@ REG_OP(UnsortedSegmentProdD) *@li actual_rois_num: A Tensor with shape [batch, 8], of type int32, specifying the number of BBoxes output per batch. *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Proposal instead. */ REG_OP(ProposalD) .INPUT(cls_prob, TensorType({DT_FLOAT16, DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/set_ops.h b/third_party/fwkacllib/inc/ops/set_ops.h index d9478380..f4d5c4ba 100644 --- a/third_party/fwkacllib/inc/ops/set_ops.h +++ b/third_party/fwkacllib/inc/ops/set_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file set_ops.h + * \brief + */ #ifndef GE_OP_SET_OPS_H_ #define GE_OP_SET_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/sparse_ops.h b/third_party/fwkacllib/inc/ops/sparse_ops.h index 6b5600f7..eb3629a4 100644 --- a/third_party/fwkacllib/inc/ops/sparse_ops.h +++ b/third_party/fwkacllib/inc/ops/sparse_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file sparse_ops.h + * \brief + */ #ifndef GE_OP_SPARSE_OPS_H_ #define GE_OP_SPARSE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/spectral_ops.h b/third_party/fwkacllib/inc/ops/spectral_ops.h index 53b3e848..4c412a1f 100644 --- a/third_party/fwkacllib/inc/ops/spectral_ops.h +++ b/third_party/fwkacllib/inc/ops/spectral_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file spectral_ops.h + * \brief + */ #ifndef GE_OP_SPECTRAL_OPS_H #define GE_OP_SPECTRAL_OPS_H diff --git a/third_party/fwkacllib/inc/ops/split_combination_ops.h b/third_party/fwkacllib/inc/ops/split_combination_ops.h index 7e4428d0..de7300d2 100644 --- a/third_party/fwkacllib/inc/ops/split_combination_ops.h +++ b/third_party/fwkacllib/inc/ops/split_combination_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file split_combination_ops.h + * \brief + */ #ifndef GE_OP_SPLIT_COMBINATION_OPS_H #define GE_OP_SPLIT_COMBINATION_OPS_H #include "graph/operator_reg.h" @@ -197,6 +201,8 @@ REG_OP(ParallelConcat) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ConcatV2. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ConcatV2 instead. */ REG_OP(ConcatV2D) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_INT64, DT_UINT64, DT_UINT32, DT_INT16, DT_UINT16, DT_UINT8})) @@ -254,6 +260,8 @@ REG_OP(ConcatV2) *@par Third-party framework compatibility * Compatible with the TensorFlow operator Concat. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Concat instead. */ REG_OP(ConcatD) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) @@ -360,6 +368,8 @@ REG_OP(ConcatOffset) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ConcatOffset. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ConcatOffset instead. */ REG_OP(ConcatOffsetD) .DYNAMIC_INPUT(x, TensorType({DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/state_ops.h b/third_party/fwkacllib/inc/ops/state_ops.h index 4e759688..2261cd3e 100644 --- a/third_party/fwkacllib/inc/ops/state_ops.h +++ b/third_party/fwkacllib/inc/ops/state_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file state_ops.h + * \brief + */ #ifndef GE_OP_STATE_OPS_H_ #define GE_OP_STATE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/stateful_random_ops.h b/third_party/fwkacllib/inc/ops/stateful_random_ops.h index eb3db1cc..0bcb87cd 100644 --- a/third_party/fwkacllib/inc/ops/stateful_random_ops.h +++ b/third_party/fwkacllib/inc/ops/stateful_random_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file stateful_random_ops.h + * \brief + */ #ifndef GE_OP_STATEFUL_RANDOM_OPS_H #define GE_OP_STATEFUL_RANDOM_OPS_H diff --git a/third_party/fwkacllib/inc/ops/stateless_random_ops.h b/third_party/fwkacllib/inc/ops/stateless_random_ops.h index 03fc824a..ddfda47d 100644 --- a/third_party/fwkacllib/inc/ops/stateless_random_ops.h +++ b/third_party/fwkacllib/inc/ops/stateless_random_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file stateless_random_ops.h + * \brief + */ #ifndef GE_OP_STATELESS_RANDOM_OPS_H #define GE_OP_STATELESS_RANDOM_OPS_H diff --git a/third_party/fwkacllib/inc/ops/string_ops.h b/third_party/fwkacllib/inc/ops/string_ops.h index d085a868..8b4b7250 100644 --- a/third_party/fwkacllib/inc/ops/string_ops.h +++ b/third_party/fwkacllib/inc/ops/string_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file string_ops.h + * \brief + */ #ifndef GE_OP_STRING_OPS_H_ #define GE_OP_STRING_OPS_H_ @@ -44,6 +48,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringSplit op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringSplit) .INPUT(input, TensorType({DT_STRING})) @@ -76,6 +83,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringSplitV2 op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringSplitV2) .INPUT(input, TensorType({DT_STRING})) @@ -108,6 +118,9 @@ include: \n *@par Third-party framework compatibility *compatible with UnicodeScript op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(UnicodeScript) .INPUT(x, TensorType({DT_INT32})) @@ -139,6 +152,9 @@ include: \n *@par Third-party framework compatibility *compatible with Substr op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(Substr) .INPUT(input, TensorType({DT_STRING})) @@ -169,6 +185,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringToHashBucketFast op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringToHashBucketFast) .INPUT(x, TensorType({DT_STRING})) @@ -203,6 +222,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringToHashBucketStrong op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringToHashBucketStrong) .INPUT(x, TensorType({DT_STRING})) @@ -228,6 +250,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringToHashBucket op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringToHashBucket) .INPUT(string_tensor, TensorType({DT_STRING})) @@ -249,6 +274,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringStrip op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringStrip) .INPUT(x, TensorType({DT_STRING})) @@ -277,6 +305,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringLength op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
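+ +*@par Example: +* A sketch of the expected behaviour, assuming TensorFlow-compatible byte counting: +* for input ["hello", ""], the output is [5, 0].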
*/ REG_OP(StringLength) .INPUT(x, TensorType({DT_STRING})) @@ -309,6 +340,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringJoin op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringJoin) .DYNAMIC_INPUT(x, TensorType({DT_STRING})) @@ -341,6 +375,9 @@ include: \n *@par Third-party framework compatibility * compatible with StringFormat op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringFormat) .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ @@ -372,6 +409,9 @@ include: \n *@par Third-party framework compatibility *compatible with RegexFullMatch op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(RegexFullMatch) .INPUT(x, TensorType({DT_STRING})) @@ -404,6 +444,9 @@ include: \n *@par Third-party framework compatibility *compatible with RegexReplace op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(RegexReplace) .INPUT(x, TensorType({DT_STRING})) @@ -439,6 +482,9 @@ include: \n *@par Third-party framework compatibility *compatible with AsString op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(AsString) .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \ @@ -477,6 +523,9 @@ include: \n *@par Third-party framework compatibility *compatible with EncodeBase64 op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(EncodeBase64) .INPUT(x, TensorType({DT_STRING})) @@ -500,6 +549,9 @@ include: \n *@par Third-party framework compatibility *compatible with DecodeBase64 op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(DecodeBase64) .INPUT(x, TensorType({DT_STRING})) diff --git a/third_party/fwkacllib/inc/ops/swap_co_ops.h b/third_party/fwkacllib/inc/ops/swap_co_ops.h index 02f1451b..a6c0f9ca 100644 --- a/third_party/fwkacllib/inc/ops/swap_co_ops.h +++ b/third_party/fwkacllib/inc/ops/swap_co_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file swap_co_ops.h + * \brief + */ #ifndef GE_OP_SWAP_CO_OPS_H_ #define GE_OP_SWAP_CO_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h index ddbb1b4d..a7c33ab5 100644 --- a/third_party/fwkacllib/inc/ops/transformation_ops.h +++ b/third_party/fwkacllib/inc/ops/transformation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file transformation_ops.h + * \brief + */ #ifndef GE_OP_TRANSFORMATION_OPS_H #define GE_OP_TRANSFORMATION_OPS_H @@ -93,6 +97,8 @@ REG_OP(DepthwiseWeight6DTo4D) *@par Outputs: *y: A Tensor. Has the same type as "x". +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Transpose instead. */ REG_OP(TransposeD) .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, @@ -124,17 +130,17 @@ REG_OP(Transpose) .OP_END_FACTORY_REG(Transpose) /** -*@brief Doing format_transfer for various data format only \n -support NHWC/NCHW to NC1HWC0 and NC1HWC0 to NHWC/NCHW \n -NCHW to FRACTAL_Zn or FRACTAL_Zn to NCHW \n -HWCN to FRACTAL_Zn or FRACTAL_Zn to HWCN. 
+*@brief Performs format transfer for various data formats. Only
+supports "NHWC/NCHW" to "NC1HWC0" and "NC1HWC0" to "NHWC/NCHW",
+"NCHW" to "FRACTAL_Zn" or "FRACTAL_Zn" to "NCHW", and
+"HWCN" to "FRACTAL_Zn" or "FRACTAL_Zn" to "HWCN".
 
 *@par Inputs:
 *src: A Tensor dtype of all types.
 
 *@par Attributes:
-*@li src_format: A string source data format, can be NHWC, NCHW, FRACTAL_Zn etc.
-*@li expose_hidden: A string target data format, can be NC1HWC0, NCHW, FRACTAL_Zn etc.
+*@li src_format: A string; the source data format, such as "NHWC", "NCHW" or "FRACTAL_Zn".
+*@li dst_format: A string; the target data format, such as "NC1HWC0", "NCHW" or "FRACTAL_Zn".
 
 *@par Outputs:
 *dst: A Tensor dtype of all types.
diff --git a/third_party/fwkacllib/inc/ops/warp_perspective_ops.h b/third_party/fwkacllib/inc/ops/warp_perspective_ops.h
index 7da49c1e..bf8ecd53 100644
--- a/third_party/fwkacllib/inc/ops/warp_perspective_ops.h
+++ b/third_party/fwkacllib/inc/ops/warp_perspective_ops.h
@@ -14,6 +14,10 @@
  * limitations under the License.
  */
 
+/*!
+ * \file warp_perspective_ops.h
+ * \brief
+ */
 #ifndef GE_OP_WARP_PERSPECTIVE_OPS_H_
 #define GE_OP_WARP_PERSPECTIVE_OPS_H_
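Editor's note: the transformation_ops.h hunk above deprecates TransposeD in favor of Transpose, which takes the permutation as a tensor input rather than a compile-time attribute. A minimal migration sketch follows; it assumes the generated ge::op classes and their set_input_*/set_attr_* accessors produced by REG_OP, and the include path may differ per SDK layout.

// Illustrative only: moving off the deprecated TransposeD. The perm becomes
// a Const input to Transpose instead of an attribute on the op itself.
#include "all_ops.h"       // generated operator headers (path is an assumption)
#include "graph/tensor.h"

ge::op::Transpose BuildTranspose(ge::op::Data &x) {
  // Before (deprecated):
  //   auto t = ge::op::TransposeD("t").set_input_x(x).set_attr_perm({0, 3, 1, 2});

  // After: supply perm as a DT_INT32 Const tensor input.
  int32_t perm_value[4] = {0, 3, 1, 2};
  ge::TensorDesc perm_desc(ge::Shape({4}), ge::FORMAT_ND, ge::DT_INT32);
  ge::Tensor perm_tensor(perm_desc, reinterpret_cast<uint8_t *>(perm_value), sizeof(perm_value));
  auto perm = ge::op::Const("perm").set_attr_value(perm_tensor);
  return ge::op::Transpose("transpose").set_input_x(x).set_input_perm(perm);
}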
diff --git a/third_party/fwkacllib/inc/register/host_cpu_context.h b/third_party/fwkacllib/inc/register/host_cpu_context.h
new file mode 100644
index 00000000..f7d4f52f
--- /dev/null
+++ b/third_party/fwkacllib/inc/register/host_cpu_context.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_REGISTER_HOST_CPU_CONTEXT_H_
+#define INC_REGISTER_HOST_CPU_CONTEXT_H_
+
+#include "external/ge/ge_api_error_codes.h"
+#include "register/register_types.h"
+
+namespace ge {
+class HostCpuContext {
+ public:
+  HostCpuContext() = default;
+  ~HostCpuContext() = default;
+ private:
+  class Impl;
+  Impl *impl_;
+};
+}  // namespace ge
+
+extern "C" {
+// Unified entry point for registering host_cpu_kernel_wrapper when the shared object is opened
+FMK_FUNC_HOST_VISIBILITY ge::Status Initialize(const ge::HostCpuContext &ctx);
+}
+
+#endif  // INC_REGISTER_HOST_CPU_CONTEXT_H_
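Editor's note: the new header above defines the extern "C" hook that a host-CPU kernel wrapper library must export. As a rough sketch, a wrapper shared object would implement it along these lines; only the hook name and signature come from host_cpu_context.h, the body is an assumption and not the actual host_cpu_kernel_wrapper code.

// Hypothetical wrapper implementation of the registration hook.
#include "register/host_cpu_context.h"

extern "C" FMK_FUNC_HOST_VISIBILITY ge::Status Initialize(const ge::HostCpuContext &ctx) {
  (void)ctx;  // Impl is opaque to the wrapper; nothing to inspect here
  // A real wrapper would register its host-CPU kernels with the framework
  // here, then report success so the framework keeps the .so loaded.
  return ge::SUCCESS;
}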
diff --git a/third_party/fwkacllib/inc/register/op_registry.h b/third_party/fwkacllib/inc/register/op_registry.h
index 1dc14b8b..3feea0df 100644
--- a/third_party/fwkacllib/inc/register/op_registry.h
+++ b/third_party/fwkacllib/inc/register/op_registry.h
@@ -65,6 +65,9 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistry {
   domi::FusionParseParamFunc GetFusionParseParamFunc(const std::string &op_type, const std::string &ori_type);
 
+  domi::FusionParseParamByOpFunc GetFusionParseParamByOpFunc(const std::string &op_type,
+                                                             const std::string &ori_type);
+
   domi::ParseSubgraphFunc GetParseSubgraphPostFunc(const std::string &op_type);
 
   domi::ImplyType GetImplyTypeByOriOpType(const std::string &ori_optype);
@@ -78,6 +81,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistry {
   std::unordered_map<std::string, ParseParamFunc> op_parse_params_fn_map_;
   std::unordered_map<std::string, ParseParamByOpFunc> parse_params_by_op_func_map_;
   std::unordered_map<std::string, FusionParseParamFunc> fusion_op_parse_params_fn_map_;
+  std::unordered_map<std::string, FusionParseParamByOpFunc> fusion_parse_params_by_op_fn_map_;
   std::unordered_map<std::string, ParseSubgraphFunc> op_types_to_parse_subgraph_post_func_;
   std::unordered_map<std::string, std::vector<RemoveInputConfigure>> remove_input_configure_map_;
   std::unordered_map<std::string, std::string> origin_type_to_om_type_;
diff --git a/third_party/fwkacllib/inc/register/register.h b/third_party/fwkacllib/inc/register/register.h
index 27da0b0b..d98edaa4 100644
--- a/third_party/fwkacllib/inc/register/register.h
+++ b/third_party/fwkacllib/inc/register/register.h
@@ -18,6 +18,7 @@
 #define INC_REGISTER_REGISTRY_H_
 
 #include "external/register/register.h"
+#include "external/ge/ge_api_error_codes.h"
 
 namespace ge {
 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOp {
diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h
index 572053f6..2ab522fa 100644
--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@@ -201,6 +201,7 @@ typedef enum tagRtError {
     RT_ERROR_FEATURE_NOT_SUPPROT,
     RT_ERROR_MEMORY_ALLOCATION,
     RT_ERROR_MEMORY_FREE,
+    RT_ERROR_INVALID_MEMORY_TYPE,
 
     RT_ERROR_DEBUG_BASE = 0x07120000,
     RT_ERROR_DEBUG_NULL,
diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h
index bf2ce447..048be69a 100644
--- a/third_party/fwkacllib/inc/runtime/dev.h
+++ b/third_party/fwkacllib/inc/runtime/dev.h
@@ -164,7 +164,7 @@ RTS_API rtError_t rtGetDevicePhyIdByIndex(uint32_t devIndex, uint32_t *phyId);
  * @return RT_ERROR_NONE for ok
  * @return RT_ERROR_INVALID_VALUE for error input
  */
-RTS_API rtError_t rtEnableP2P(uint32_t devIdDes, uint32_t phyIdSrc);
+RTS_API rtError_t rtEnableP2P(uint32_t devIdDes, uint32_t phyIdSrc, uint32_t flag);
 
 /**
  * @ingroup dvrt_dev
@@ -176,6 +176,17 @@ RTS_API rtError_t rtEnableP2P(uint32_t devIdDes, uint32_t phyIdSrc);
  */
 RTS_API rtError_t rtDisableP2P(uint32_t devIdDes, uint32_t phyIdSrc);
 
+/**
+ * @ingroup dvrt_dev
+ * @brief get capability of P2P memory copy between device and peer device.
+ * @param [in] device the logical device id
+ * @param [in] peerDevice the physical device id
+ * @param [out] *canAccessPeer 1:enable 0:disable
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for error input
+ */
+RTS_API rtError_t rtDeviceCanAccessPeer(int32_t* canAccessPeer, uint32_t device, uint32_t peerDevice);
+
 /**
  * @ingroup dvrt_dev
  * @brief get status
diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h
index aec290da..956e033b 100644
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -177,6 +177,7 @@ typedef void (*rtCallback_t)(void *fnData);
 #define RT_KERNEL_CONVERT (0x01)
 #define RT_KERNEL_DUMPFLAG (0x02)
 #define RT_FUSION_KERNEL_DUMPFLAG (0x04)
+#define RT_KERNEL_CUSTOM_AICPU (0x08)
 
 /**
  * @ingroup rt_kernel
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index 3280f3c6..8e159dd7 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -46,6 +46,15 @@ extern "C" {
 #define RT_MEMORY_L1 ((uint32_t)0x1<<16)
 #define RT_MEMORY_L2 ((uint32_t)0x1<<17)
 
+/**
+ * @ingroup dvrt_mem
+ * @brief memory info type
+ */
+#define RT_MEM_INFO_TYPE_DDR_SIZE ((uint32_t)0x1)
+#define RT_MEM_INFO_TYPE_HBM_SIZE ((uint32_t)0x2)
+#define RT_MEM_INFO_TYPE_DDR_P2P_SIZE ((uint32_t)0x3)
+#define RT_MEM_INFO_TYPE_HBM_P2P_SIZE ((uint32_t)0x4)
+
 /**
  * @ingroup dvrt_mem
  * @brief memory Policy
@@ -54,6 +63,9 @@ extern "C" {
 #define RT_MEMORY_POLICY_HUGE_PAGE_FIRST ((uint32_t)0x1 << 10)  // Malloc mem prior hage page, then default page
 #define RT_MEMORY_POLICY_HUGE_PAGE_ONLY ((uint32_t)0x1 << 11)  // Malloc mem only use hage page
 #define RT_MEMORY_POLICY_DEFAULT_PAGE_ONLY ((uint32_t)0x1 << 12)  // Malloc mem only use default page
+#define RT_MEMORY_POLICY_HUGE_PAGE_FIRST_P2P ((uint32_t)0x1 << 13)  // Malloc memory with huge page first, then default page, used for P2P
+#define RT_MEMORY_POLICY_HUGE_PAGE_ONLY_P2P ((uint32_t)0x1 << 14)  // Malloc memory with huge page only, used for P2P
+#define RT_MEMORY_POLICY_DEFAULT_PAGE_ONLY_P2P ((uint32_t)0x1 << 15)  // Malloc memory with default page only, used for P2P
 
 #define MEM_ALLOC_TYPE_BIT ((uint32_t)0x3FF)  // mem type bit in <0, 9>
@@ -88,6 +100,19 @@ typedef enum tagRtMemcpyKind {
     RT_MEMCPY_RESERVED,
 } rtMemcpyKind_t;
 
+typedef enum tagRtMemInfoType {
+    RT_MEMORYINFO_DDR,
+    RT_MEMORYINFO_HBM,
+    RT_MEMORYINFO_DDR_HUGE,        // Hugepage memory of DDR
+    RT_MEMORYINFO_DDR_NORMAL,      // Normal memory of DDR
+    RT_MEMORYINFO_HBM_HUGE,        // Hugepage memory of HBM
+    RT_MEMORYINFO_HBM_NORMAL,      // Normal memory of HBM
+    RT_MEMORYINFO_DDR_P2P_HUGE,    // Hugepage P2P memory of DDR
+    RT_MEMORYINFO_DDR_P2P_NORMAL,  // Normal P2P memory of DDR
+    RT_MEMORYINFO_HBM_P2P_HUGE,    // Hugepage P2P memory of HBM
+    RT_MEMORYINFO_HBM_P2P_NORMAL,  // Normal P2P memory of HBM
+} rtMemInfoType_t;
+
 typedef enum tagRtRecudeKind {
     RT_MEMCPY_SDMA_AUTOMATIC_ADD = 10,  // D2D, SDMA inline reduce, include 1P, and P2P
     RT_RECUDE_KIND_END
@@ -350,6 +375,16 @@ RTS_API rtError_t rtMemsetAsync(void *ptr, uint64_t destMax, uint32_t value, uin
  */
 RTS_API rtError_t rtMemGetInfo(size_t *free, size_t *total);
 
+/**
+ * @ingroup dvrt_mem
+ * @brief get current device memory total and free
+ * @param [in] memInfoType
+ * @param [out] free
+ * @param [out] total
+ * @return RT_ERROR_NONE for ok, errno for failed
+ */
+RTS_API rtError_t rtMemGetInfoEx(rtMemInfoType_t memInfoType, size_t *free, size_t *total);
+
 /**
  * @ingroup dvrt_mem
  * @brief set memory with uint32_t value
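Editor's note: the dev.h and mem.h additions above fit together for P2P setups: query capability, enable the link, then inspect the P2P memory pool. A hedged sketch follows; passing 0 as the new rtEnableP2P flag and using logical device ids for both arguments are assumptions, not documented behavior, and only APIs declared in this patch are used.

// Illustrative P2P bring-up using the new runtime declarations above.
#include <cstdio>
#include "runtime/dev.h"
#include "runtime/mem.h"

bool EnablePeerAccess(uint32_t dev, uint32_t peerDev) {
  int32_t canAccessPeer = 0;
  // 1:enable 0:disable, per the rtDeviceCanAccessPeer doc comment.
  if (rtDeviceCanAccessPeer(&canAccessPeer, dev, peerDev) != RT_ERROR_NONE ||
      canAccessPeer != 1) {
    return false;  // P2P copy not supported between this device pair
  }
  if (rtEnableP2P(dev, peerDev, 0U) != RT_ERROR_NONE) {  // flag: assumed default 0
    return false;
  }
  size_t freeMem = 0;
  size_t totalMem = 0;
  // Query the HBM P2P pool specifically, rather than the overall rtMemGetInfo.
  if (rtMemGetInfoEx(RT_MEMORYINFO_HBM_P2P_NORMAL, &freeMem, &totalMem) == RT_ERROR_NONE) {
    std::printf("HBM P2P memory: %zu free of %zu bytes\n", freeMem, totalMem);
  }
  return true;
}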