diff --git a/inc/external/acl/acl_op_compiler.h b/inc/external/acl/acl_op_compiler.h index a0a3f786..faf5e649 100644 --- a/inc/external/acl/acl_op_compiler.h +++ b/inc/external/acl/acl_op_compiler.h @@ -36,7 +36,8 @@ typedef enum { ACL_DEBUG_DIR, ACL_OP_COMPILER_CACHE_MODE, ACL_OP_COMPILER_CACHE_DIR, - ACL_OP_PERFORMANCE_MODE + ACL_OP_PERFORMANCE_MODE, + ACL_OP_JIT_COMPILE } aclCompileOpt; typedef enum aclCompileFlag { ACL_OP_COMPILE_DEFAULT, ACL_OP_COMPILE_FUZZ } aclOpCompileFlag; diff --git a/inc/external/acl/acl_prof.h b/inc/external/acl/acl_prof.h index 4a9a5be9..a6f0c832 100644 --- a/inc/external/acl/acl_prof.h +++ b/inc/external/acl/acl_prof.h @@ -128,8 +128,8 @@ MSVP_PROF_API aclError aclprofStart(const aclprofConfig *profilerConfig); * @see aclprofDestroyConfig */ MSVP_PROF_API aclprofConfig *aclprofCreateConfig(uint32_t *deviceIdList, uint32_t deviceNums, - aclprofAicoreMetrics aicoreMetrics, aclprofAicoreEvents *aicoreEvents, - uint64_t dataTypeConfig); + aclprofAicoreMetrics aicoreMetrics, + const aclprofAicoreEvents *aicoreEvents, uint64_t dataTypeConfig); /** * @ingroup AscendCL diff --git a/inc/external/acl/error_codes/rt_error_codes.h b/inc/external/acl/error_codes/rt_error_codes.h index cc607b74..56c56438 100644 --- a/inc/external/acl/error_codes/rt_error_codes.h +++ b/inc/external/acl/error_codes/rt_error_codes.h @@ -113,6 +113,7 @@ static const int32_t ACL_ERROR_RT_AICORE_TRAP_READ_OVERFLOW = 507042; // static const int32_t ACL_ERROR_RT_AICORE_TRAP_WRITE_OVERFLOW = 507043; // aic trap write overflow static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_READ_OVERFLOW = 507044; // aiv trap read overflow static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_WRITE_OVERFLOW = 507045; // aiv trap write overflow +static const int32_t ACL_ERROR_RT_STREAM_SYNC_TIMEOUT = 507046; // stream sync time out static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h index b77bebfc..42c6f980 100644 --- a/inc/external/ge/ge_api_types.h +++ b/inc/external/ge/ge_api_types.h @@ -30,6 +30,8 @@ namespace ge { // Option key: graph run mode const char_t *const OPTION_GRAPH_RUN_MODE = "ge.graphRunMode"; const char_t *const OPTION_DEVICE_TYPE = "ge.deviceType"; +// Option key: topo sorting mode +const char *const OPTION_TOPO_SORTING_MODE = "ge.topoSortingMode"; // Option key: ome init const char_t *const OPTION_EXEC_SESSION_ID = "ge.exec.sessionId"; @@ -129,6 +131,7 @@ const char_t *const MODIFY_MIXLIST = "ge.exec.modify_mixlist"; const char_t *const OP_PRECISION_MODE = "ge.exec.op_precision_mode"; const char_t *const CUSTOMIZE_DTYPES = "ge.customizeDtypes"; const char_t *const COMPRESSION_OPTIMIZE_CONF = "ge.compressionOptimizeConf"; +const char_t *const BUILD_GRAPH_ALREADY_INITIALIZED = "build_graph_already_initialized"; } // namespace configure_option // Configure stream num by Session constructor options param, // its value should be int32_t type, default value is "1" @@ -293,6 +296,9 @@ const std::string FUSION_SWITCH_FILE = "ge.fusionSwitchFile"; // Configure compression optimize file path const std::string COMPRESSION_OPTIMIZE_CONF = "ge.compressionOptimizeConf"; +// Configure for amct +const std::string BUILD_GRAPH_ALREADY_INITIALIZED = "build_graph_already_initialized"; + // Configure customize dtypes path const std::string CUSTOMIZE_DTYPES = "ge.customizeDtypes"; @@ -394,7 +400,8 @@ const char_t *const 
GRAPH_PARALLEL_OPTION_PATH = "ge.graphParallelOptionPath"; const char_t *const EVALUATE_GRAPH_RESOURCE_MODE = "ge.evaluateGraphResourceMode"; // Graph run mode enum GraphRunMode { PREDICTION = 0, TRAIN }; - +// Topo sorting mode +enum class TopoSortingMode { BFS = 0, DFS = 1 }; // Input/Output tensor info struct InputTensorInfo { uint32_t data_type; // data type @@ -478,6 +485,8 @@ static const char_t *const MODIFY_MIXLIST = ge::MODIFY_MIXLIST.c_str(); static const char_t *const OP_PRECISION_MODE = ge::OP_PRECISION_MODE.c_str(); static const char_t *const CUSTOMIZE_DTYPES = "ge.customizeDtypes"; static const char_t *const COMPRESSION_OPTIMIZE_CONF = "ge.compressionOptimizeConf"; +static const char_t *const BUILD_GRAPH_ALREADY_INITIALIZED = "build_graph_already_initialized"; +static const char_t *const INPUT_DATA_NAMES = "input_data_names"; // for interface: aclgrphBuildModel #ifdef __GNUC__ @@ -514,8 +523,8 @@ const std::set ir_builder_suppported_options = {INPUT_FORMAT, // for interface: aclgrphParse const std::set ir_parser_suppported_options = { - INPUT_FP16_NODES, IS_INPUT_ADJUST_HW_LAYOUT, IS_OUTPUT_ADJUST_HW_LAYOUT, OUTPUT, - OUT_NODES, ENABLE_SCOPE_FUSION_PASSES}; + INPUT_FP16_NODES, IS_INPUT_ADJUST_HW_LAYOUT, IS_OUTPUT_ADJUST_HW_LAYOUT, OUTPUT, OUT_NODES, + ENABLE_SCOPE_FUSION_PASSES, INPUT_DATA_NAMES}; // for interface: aclgrphBuildInitialize const std::set global_options = {CORE_TYPE, @@ -540,7 +549,8 @@ const std::set global_options = {CORE_TYPE, OP_COMPILER_CACHE_DIR, OP_COMPILER_CACHE_MODE, MODIFY_MIXLIST, - COMPRESSION_OPTIMIZE_CONF}; + COMPRESSION_OPTIMIZE_CONF, + BUILD_GRAPH_ALREADY_INITIALIZED}; #endif } // namespace ir_option } // namespace ge diff --git a/inc/external/runtime/rt_error_codes.h b/inc/external/runtime/rt_error_codes.h index cc607b74..56c56438 100644 --- a/inc/external/runtime/rt_error_codes.h +++ b/inc/external/runtime/rt_error_codes.h @@ -113,6 +113,7 @@ static const int32_t ACL_ERROR_RT_AICORE_TRAP_READ_OVERFLOW = 507042; // static const int32_t ACL_ERROR_RT_AICORE_TRAP_WRITE_OVERFLOW = 507043; // aic trap write overflow static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_READ_OVERFLOW = 507044; // aiv trap read overflow static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_WRITE_OVERFLOW = 507045; // aiv trap write overflow +static const int32_t ACL_ERROR_RT_STREAM_SYNC_TIMEOUT = 507046; // stream sync time out static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error diff --git a/inc/framework/common/aicpu_op.h b/inc/framework/common/aicpu_op.h deleted file mode 100644 index 773d42fd..00000000 --- a/inc/framework/common/aicpu_op.h +++ /dev/null @@ -1,22 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef INC_FRAMEWORK_COMMON_AICPU_OP_H_ -#define INC_FRAMEWORK_COMMON_AICPU_OP_H_ - -#include "cce/customize.h" - -#endif // INC_FRAMEWORK_COMMON_AICPU_OP_H_ diff --git a/inc/framework/common/file_constant_util.h b/inc/framework/common/file_constant_util.h deleted file mode 100644 index a53c5a24..00000000 --- a/inc/framework/common/file_constant_util.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef INC_FRAMEWORK_COMMON_FILE_CONSTANT_UTIL_H -#define INC_FRAMEWORK_COMMON_FILE_CONSTANT_UTIL_H - -#include -#include -#include -#include "ge/ge_api_error_codes.h" -#include "nlohmann/json.hpp" -#include "graph/op_desc.h" -#include "graph/ge_tensor.h" - -namespace ge { -struct FileConstantInfo { - std::string value_bin_file_id; - std::string value_bin_file_path; -}; - -struct OptionInfo { - std::vector info; -}; - -void from_json(const nlohmann::json &j, FileConstantInfo &info); - -void from_json(const nlohmann::json &j, OptionInfo &option_info); - -Status GetFilePathFromOption(std::map &file_id_and_path_map); - -Status CopyOneWeightFromFile(const void *const curr_dev_ptr, const std::string &value, const size_t file_constant_size, - size_t &left_size); - -Status GetFilePath(const OpDescPtr &op_desc, const std::map &file_id_and_path_map, - std::string &file_path); -} // namespace ge - -#endif // INC_FRAMEWORK_COMMON_FILE_CONSTANT_UTIL_H diff --git a/inc/framework/common/fmk_error_codes.h b/inc/framework/common/fmk_error_codes.h index dd1bd678..0ef4f412 100644 --- a/inc/framework/common/fmk_error_codes.h +++ b/inc/framework/common/fmk_error_codes.h @@ -19,18 +19,14 @@ #if defined(_MSC_VER) #ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY _declspec(dllexport) #define GE_OBJECT_VISIBILITY #else -#define GE_FUNC_VISIBILITY #define GE_OBJECT_VISIBILITY #endif #else #ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY __attribute__((visibility("default"))) #define GE_OBJECT_VISIBILITY #else -#define GE_FUNC_VISIBILITY #define GE_OBJECT_VISIBILITY __attribute__((visibility("hidden"))) #endif #endif @@ -40,6 +36,7 @@ #include "framework/common/fmk_types.h" #include "register/register_error_codes.h" +#include "external/ge/ge_error_codes.h" // Each module uses the following four macros to define error codes: #define DECLARE_ERRORNO_OMG(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OMG, name, value) diff --git a/inc/framework/common/gflags_util.h b/inc/framework/common/gflags_util.h deleted file mode 100644 index 5d374261..00000000 --- a/inc/framework/common/gflags_util.h +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef INC_FRAMEWORK_COMMON_GFLAGS_UTIL_H_ -#define INC_FRAMEWORK_COMMON_GFLAGS_UTIL_H_ - -#if defined(_MSC_VER) -#ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY _declspec(dllexport) -#else -#define GE_FUNC_VISIBILITY -#endif -#else -#ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY __attribute__((visibility("default"))) -#else -#define GE_FUNC_VISIBILITY -#endif -#endif - -#include -#include - -namespace ge { -class GE_FUNC_VISIBILITY GflagsUtils { - public: - static bool IsSetCommandTrue(const char *name) { - std::string out; - return gflags::GetCommandLineOption(name, &out) && out == "true"; - } - - /// - /// @brief Determines whether the parameter is empty - /// @param name name parameter name - /// @return true if empty otherwise false - /// - static bool IsSetCommandNotEmpty(const char *name) { - std::string out; - return gflags::GetCommandLineOption(name, &out) && !out.empty(); - } - - /// - /// @brief Determines whether the parameter is not default - /// @param flag_name name parameter name - /// @return true if not default otherwise false - /// - static bool IsCommandLineNotDefault(const char *flag_name) { - google::CommandLineFlagInfo info; - return GetCommandLineFlagInfo(flag_name, &info) && !info.is_default; - } - - /// - /// @brief Modify gflags to print help information - /// @param flags_h Pass in the self-defined help parameter, it is recommended to be FLAGS_h - /// @return void - /// - static void ChangeHelpFlags(bool flags_h) { - if (flags_h || IsSetCommandTrue("help") || IsSetCommandTrue("helpfull") || IsSetCommandNotEmpty("helpon") || - IsSetCommandNotEmpty("helpmatch") || IsSetCommandTrue("helppackage") || IsSetCommandTrue("helpxml")) { - gflags::SetCommandLineOption("help", "false"); - gflags::SetCommandLineOption("helpfull", "false"); - gflags::SetCommandLineOption("helpon", ""); - gflags::SetCommandLineOption("helpmatch", ""); - gflags::SetCommandLineOption("helppackage", "false"); - gflags::SetCommandLineOption("helpxml", "false"); - gflags::SetCommandLineOption("helpshort", "true"); - } - } -}; -} // namespace ge - -#endif // INC_FRAMEWORK_COMMON_GFLAGS_UTIL_H_ diff --git a/inc/framework/common/helper/model_helper.h b/inc/framework/common/helper/model_helper.h index e4141a4b..5fc8f1f8 100644 --- a/inc/framework/common/helper/model_helper.h +++ b/inc/framework/common/helper/model_helper.h @@ -34,6 +34,8 @@ class GE_FUNC_VISIBILITY ModelHelper { Status SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param, const std::string &output_file, ge::ModelBufferData &model) const; + Status GenerateGeModel(const OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, const size_t mode_index, + const bool is_dyn_root) const; Status SaveToOmRootModel(const GeRootModelPtr &ge_root_model, const SaveParam &save_param, const std::string &output_file, ModelBufferData &model, const bool is_unknown_shape) const; Status SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::string &output_file) const; @@ -67,8 +69,6 @@ class GE_FUNC_VISIBILITY ModelHelper { bool IsPartitionedGraph(const GeModelPtr &cur_model) const; - Status 
GenerateGeModel(const OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, const size_t mode_index, - const bool is_dyn_root) const; Status GenerateGeRootModel(const OmFileLoadHelper &om_load_helper); Status LoadModelData(const OmFileLoadHelper &om_load_helper, const GeModelPtr &cur_model, diff --git a/inc/framework/common/op/attr_value_util.h b/inc/framework/common/op/attr_value_util.h deleted file mode 100644 index 5a41de05..00000000 --- a/inc/framework/common/op/attr_value_util.h +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef INC_FRAMEWORK_COMMON_OP_ATTR_VALUE_UTIL_H_ -#define INC_FRAMEWORK_COMMON_OP_ATTR_VALUE_UTIL_H_ - -#if defined(_MSC_VER) -#ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY _declspec(dllexport) -#else -#define GE_FUNC_VISIBILITY -#endif -#else -#ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY __attribute__((visibility("default"))) -#else -#define GE_FUNC_VISIBILITY -#endif -#endif - -#include -#include -#include -#include "graph/debug/ge_attr_define.h" -#include "proto/om.pb.h" - -namespace ge { -GE_FUNC_VISIBILITY void SetAttrDef(const std::string &value, domi::AttrDef *const out); -} -#endif // INC_FRAMEWORK_COMMON_OP_ATTR_VALUE_UTIL_H_ \ No newline at end of file diff --git a/inc/framework/common/string_util.h b/inc/framework/common/string_util.h index c369d04f..40c0f5cc 100644 --- a/inc/framework/common/string_util.h +++ b/inc/framework/common/string_util.h @@ -17,20 +17,6 @@ #ifndef INC_FRAMEWORK_COMMON_STRING_UTIL_H_ #define INC_FRAMEWORK_COMMON_STRING_UTIL_H_ -#if defined(_MSC_VER) -#ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY _declspec(dllexport) -#else -#define GE_FUNC_VISIBILITY -#endif -#else -#ifdef FUNC_VISIBILITY -#define GE_FUNC_VISIBILITY __attribute__((visibility("default"))) -#else -#define GE_FUNC_VISIBILITY -#endif -#endif - #include #include @@ -40,6 +26,7 @@ #include #include #include "graph/types.h" +#include "external/ge/ge_error_codes.h" namespace ge { class GE_FUNC_VISIBILITY StringUtils { diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h index f8529fa2..c3d04ee2 100644 --- a/inc/framework/common/types.h +++ b/inc/framework/common/types.h @@ -45,7 +45,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string PROFIL FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASKS; FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASK_GEN_BASE_ADDR; FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASK_GEN_HOST_BASE_ADDR; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASK_GEN_HOST_SVM_BASE_ADDR; FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_HOST_MEMORY_SIZE; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_HOST_SVM_SIZE; 
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASK_GEN_WEIGHT_ADDR; FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_FUSION_MODEL_DEF; FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint64_t ALLOC_MEMORY_MAX_SIZE; // Max size of 8 GB. diff --git a/inc/framework/executor/ge_executor.h b/inc/framework/executor/ge_executor.h index 1c2a52e6..8951697f 100644 --- a/inc/framework/executor/ge_executor.h +++ b/inc/framework/executor/ge_executor.h @@ -323,7 +323,7 @@ class GE_FUNC_VISIBILITY GeExecutor { static Status ReleaseSingleOpResource(void *const stream); - static Status ClearCustomAicpuSo(); + static Status ClearCustomAicpuSo(const uint32_t &device_id); static Status GetDeviceIdByModelId(const uint32_t model_id, uint32_t &device_id); diff --git a/inc/framework/memory/memory_api.h b/inc/framework/memory/memory_api.h index d771bddc..d7afd933 100644 --- a/inc/framework/memory/memory_api.h +++ b/inc/framework/memory/memory_api.h @@ -25,6 +25,7 @@ enum MemStorageType { HBM = 0, RDMA_HBM, HOST_DDR, + HOST_SVM, }; struct HostVarInfo { @@ -38,27 +39,23 @@ struct TensorInfo { DataType data_type; }; -/// /// \param size [in] rdma pool memory size to be allocated. /// \param mem_type [in] memory type for rdma pool. /// \return Status result of function GE_FUNC_VISIBILITY Status InitRdmaPool(size_t size, rtMemType_t mem_type = RT_MEMORY_HBM); -/// /// \param var_info [in] host variable addr infos. /// \param mem_type [in] memory type for rdma pool. /// \return Status result of function GE_FUNC_VISIBILITY Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t mem_type = RT_MEMORY_HBM); -/// /// \param tensor_info [in] description for tensor stored shared memory. /// \param dev_addr [out] malloced shared memory addr. /// \param memory_size [out] malloced shared memory size. /// \return Status result of function GE_FUNC_VISIBILITY Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size); -/// /// \param var_name [in] var_name name of host variable. /// \param base_addr [out] base_addr vase addr of host variable. /// \param var_size [out] var_size memory_size of host variable. diff --git a/inc/framework/omg/model_tool.h b/inc/framework/omg/model_tool.h deleted file mode 100644 index 24554e65..00000000 --- a/inc/framework/omg/model_tool.h +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef INC_FRAMEWORK_OMG_MODEL_TOOL_H_ -#define INC_FRAMEWORK_OMG_MODEL_TOOL_H_ - -#include -#include - -#include "framework/common/debug/ge_log.h" -#include "proto/ge_ir.pb.h" - -namespace ge { -class GE_FUNC_VISIBILITY ModelTool { - public: - static Status GetModelInfoFromOm(const char *model_file, ge::proto::ModelDef &model_def, uint32_t &modeldef_size); - - static Status GetModelInfoFromPbtxt(const char *model_file, ge::proto::ModelDef &model_def); -}; -} // namespace ge - -#endif // INC_FRAMEWORK_OMG_MODEL_TOOL_H_ diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h index 1addd326..5931d60c 100644 --- a/inc/framework/omg/omg_inner_types.h +++ b/inc/framework/omg/omg_inner_types.h @@ -99,6 +99,7 @@ struct OmgContext { std::string atc_cmdline; bool user_attr_index_valid = false; bool is_online_model = false; + bool is_subgraph_multi_batch = false; }; } // namespace ge diff --git a/inc/framework/runtime/model_v2_executor.h b/inc/framework/runtime/model_v2_executor.h index dad4f4e4..45655795 100644 --- a/inc/framework/runtime/model_v2_executor.h +++ b/inc/framework/runtime/model_v2_executor.h @@ -28,10 +28,10 @@ #include "subscriber/executor_subscribers_scheduler.h" namespace gert { +enum class ExecutorState { kInit, kLoaded }; enum SubExeGraphType { kInitExeGraph, kMainExeGraph, kDeInitExeGraph, kSubExeGraphTypeEnd }; -static constexpr char *kSubExeGraphTypeStrs[kSubExeGraphTypeEnd] = { - const_cast("Init"), const_cast("Main"), const_cast("DeInit")}; inline const char *GetSubExeGraphTypeStr(SubExeGraphType type) { + constexpr const char *kSubExeGraphTypeStrs[kSubExeGraphTypeEnd] = {"Init", "Main", "DeInit"}; return kSubExeGraphTypeStrs[type]; } @@ -74,6 +74,7 @@ class VISIBILITY_EXPORT ModelV2Executor { ModelDesc *model_desc_ = nullptr; rtStream_t default_stream_ = nullptr; ExecutorSubscribersScheduler subscribers_; + ExecutorState state_ = ExecutorState::kInit; }; } // namespace gert diff --git a/inc/framework/runtime/subscriber/built_in_subscriber_definitions.h b/inc/framework/runtime/subscriber/built_in_subscriber_definitions.h index 4ee2aff2..0dcfbf69 100644 --- a/inc/framework/runtime/subscriber/built_in_subscriber_definitions.h +++ b/inc/framework/runtime/subscriber/built_in_subscriber_definitions.h @@ -32,8 +32,7 @@ enum class BuiltInSubscriberType { kProfiling, kDumper, kNum }; enum class ProfilingType { kHost, // 打开Host侧调度的profiling kDevice, - kGeHost, // 打开GE Host侧调度的profiling - kSingleOpReg, // 单算子需要打开此开关开始register node name和kernel type + kGeHost, // 打开GE Host侧调度的profiling kNum, kAll = kNum }; diff --git a/inc/framework/runtime/subscriber/executor_subscribers_scheduler.h b/inc/framework/runtime/subscriber/executor_subscribers_scheduler.h index 1a9936f8..f697d578 100644 --- a/inc/framework/runtime/subscriber/executor_subscribers_scheduler.h +++ b/inc/framework/runtime/subscriber/executor_subscribers_scheduler.h @@ -23,9 +23,6 @@ #include "global_profiling.h" #include "framework/common/ge_visibility.h" namespace gert { -namespace { -constexpr size_t kInitSubscriberSize = 1UL; -} class ModelV2Executor; class VISIBILITY_EXPORT ExecutorSubscribersScheduler { public: @@ -78,7 +75,7 @@ class VISIBILITY_EXPORT ExecutorSubscribersScheduler { if (ins == nullptr) { return nullptr; } - + constexpr size_t kInitSubscriberSize = 1UL; // profiler exists when ess init if (subscribers_.size() == kInitSubscriberSize) { enabled_ = true; diff --git a/inc/framework/runtime/subscriber/global_profiling.h b/inc/framework/runtime/subscriber/global_profiling.h 
index 6b84214e..afdbcef1 100644 --- a/inc/framework/runtime/subscriber/global_profiling.h +++ b/inc/framework/runtime/subscriber/global_profiling.h @@ -54,6 +54,11 @@ class GlobalProfiler { class VISIBILITY_EXPORT GlobalProfilingWrapper { public: + GlobalProfilingWrapper(const GlobalProfilingWrapper &) = delete; + GlobalProfilingWrapper(GlobalProfilingWrapper &&) = delete; + GlobalProfilingWrapper &operator=(const GlobalProfilingWrapper &) = delete; + GlobalProfilingWrapper &operator=(GlobalProfilingWrapper &&) = delete; + static GlobalProfilingWrapper *GetInstance() { static GlobalProfilingWrapper global_prof_wrapper; return &global_prof_wrapper; diff --git a/metadef b/metadef index 62c14e1c..03482feb 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit 62c14e1cde161dccf6967f151ece9509f778c416 +Subproject commit 03482feb52fd7cc8544231f32891c86db3bc91a2 diff --git a/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h index 8c0c1847..3332cb34 100644 --- a/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h +++ b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h @@ -1,5 +1,5 @@ /** - * Copyright 2019-2020 Huawei Technologies Co., Ltd + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef AICPU_ENGINE_STRUCT_H__ -#define AICPU_ENGINE_STRUCT_H__ +#ifndef AICPU_ENGINE_STRUCT_H +#define AICPU_ENGINE_STRUCT_H #include "fwk_adpt_struct.h" @@ -53,4 +53,4 @@ struct SessionInfo { #ifdef __cplusplus } #endif -#endif // AICPU_ENGINE_STRUCT_H__ +#endif // AICPU_ENGINE_STRUCT_H diff --git a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h index ec92a036..c5873a1b 100644 --- a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h +++ b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h @@ -1,5 +1,5 @@ /** - * Copyright 2019-2020 Huawei Technologies Co., Ltd + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef FWK_ADPT_STRUCT_H__ -#define FWK_ADPT_STRUCT_H__ +#ifndef FWK_ADPT_STRUCT_H +#define FWK_ADPT_STRUCT_H #include @@ -112,7 +112,7 @@ struct StrFWKKernel { }; #pragma pack(pop) -typedef StrFWKKernel FWKOperateParam; +using FWKOperateParam = StrFWKKernel; // Extent info ShapeAndType const uint32_t kMaxShapeDims = 8U; @@ -154,4 +154,4 @@ struct AsyncWait { } // end namespace FWKAdapter } // namespace aicpu -#endif // FWK_ADPT_STRUCT_H__ +#endif // FWK_ADPT_STRUCT_H diff --git a/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h b/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h index a5a2642c..4ee0eac5 100644 --- a/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h +++ b/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h @@ -107,6 +107,7 @@ static const int32_t ACL_ERROR_RT_AICORE_TRAP_READ_OVERFLOW = 507042; // a static const int32_t ACL_ERROR_RT_AICORE_TRAP_WRITE_OVERFLOW = 507043; // aic trap write overflow static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_READ_OVERFLOW = 507044; // aiv trap read overflow static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_WRITE_OVERFLOW = 507045; // aiv trap write overflow +static const int32_t ACL_ERROR_RT_STREAM_SYNC_TIMEOUT = 507046; // stream sync time out static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error diff --git a/third_party/fwkacllib/inc/ops/OWNERS b/third_party/fwkacllib/inc/ops/OWNERS index f95df23c..60662397 100755 --- a/third_party/fwkacllib/inc/ops/OWNERS +++ b/third_party/fwkacllib/inc/ops/OWNERS @@ -21,7 +21,6 @@ reviewers: - luanma_bl - LDLD0524 - wywismygod2020 -- lipeiyang3699 - koala-zhang - zhu-jingjing - zhaozhihui5 @@ -41,18 +40,14 @@ reviewers: - djh602 - wangjiangben_hw - li1jie -- clinglai - liujun2014 - soupkey - wu-shengji - cimeng -- ccl_ligang -- xiaozhedeng - granpad7 - tc1qaz - Ronnie_zheng - xiexianhu -- zhouyujoe - zhaoping12 - tanshengshun - fanqirui @@ -62,4 +57,4 @@ reviewers: - gegenhua - qiaohairong options: - no_parent_owners: true \ No newline at end of file + no_parent_owners: true diff --git a/third_party/fwkacllib/inc/ops/cluster.h b/third_party/fwkacllib/inc/ops/cluster.h index d2ee7f09..81cee365 100644 --- a/third_party/fwkacllib/inc/ops/cluster.h +++ b/third_party/fwkacllib/inc/ops/cluster.h @@ -1,5 +1,5 @@ /** - * Copyright 2021 Huawei Technologies Co., Ltd + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2022. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index 4e8fb312..782c8796 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -125,7 +125,7 @@ REG_OP(MinimumGrad) *@par Inputs: *One input: *x:A Tensor. Must be one of the following types: bool, float16, float, int8, int32, uint32, uint8, - int64, uint64, int16, uint16, double, complex64, complex128, qint8, quint8, qint16, quint16, qint32. + int64, uint64, int16, uint16, double, complex64, complex128, qint8, quint8, qint16, quint16, qint32, uint1. For float32 type, the actual calculation on the chip is based on float16. 
\n *@par Attributes: @@ -137,7 +137,7 @@ REG_OP(MinimumGrad) REG_OP(Cast) .INPUT(x, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT32, DT_UINT8, DT_INT64, DT_UINT64, DT_INT16, DT_UINT16, DT_DOUBLE, DT_COMPLEX64, - DT_COMPLEX128, DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32, DT_BF16})) + DT_COMPLEX128, DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32, DT_BF16, DT_UINT1})) .OUTPUT(y, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT32, DT_UINT8, DT_INT64, DT_UINT64, DT_INT16, DT_UINT16, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32, DT_BF16})) diff --git a/third_party/fwkacllib/inc/ops/experiment_ops.h b/third_party/fwkacllib/inc/ops/experiment_ops.h index 769b5a0b..82965c39 100644 --- a/third_party/fwkacllib/inc/ops/experiment_ops.h +++ b/third_party/fwkacllib/inc/ops/experiment_ops.h @@ -77,6 +77,49 @@ REG_OP(ApplyAdamW) .ATTR(maximize, Bool, false) .OP_END_FACTORY_REG(ApplyAdamW) +/** +* @brief Calculate SQ distance. \n +* +* @par Inputs: +* @li ivf: A Tensor, dtype is uint8. +* @li query: A Tensor, dtype is float16 or float32. +* @li bucket_list: A Tensor, dtype is int32 or int64. +* @li bucket_limits: A Tensor, dtype is int32 or int64. +* @li bucket_offsets: A Tensor, dtype is int32 or int64. +* @li vmin: A Tensor, dtype is float16 or float32. +* @li vdiff: A Tensor, dtype is float16 or float32. \n +* +* @par Outputs: +* @li actual_count: A Tensor, dtype is int32 or int64, the actual number of sq_distance. +* @li sq_distance: A Tensor, dtype is float16 or float32. +* @li grouped_extreme_distance: A Tensor, dtype is float16 or float32, the extremum in each group of sq_distance. +* @li sq_ivf: A Tensor, dtype is int32 or int64. +* @li sq_index: A Tensor, dtype is int32 or int64. \n +* +* @par Attributes: +* @li total_limit: A Int, indicates the max length of the output sq_distance. +* @li group_size: A Int, indicates the group size of the extremum. +* @li extreme_mode: A Int, indicates the type of extremum, 0 means minimum, and 1 means maximum. \n +* +*/ +REG_OP(ScanSQCodes) + .INPUT(ivf, TensorType({DT_UINT8})) + .INPUT(query, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(bucket_list, TensorType({DT_INT32, DT_INT64})) + .INPUT(bucket_limits, TensorType({DT_INT32, DT_INT64})) + .INPUT(bucket_offsets, TensorType({DT_INT32, DT_INT64})) + .INPUT(vmin, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(vdiff, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(actual_count, TensorType({DT_INT32, DT_INT64})) + .OUTPUT(sq_distance, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(grouped_extreme_distance, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(sq_ivf, TensorType({DT_INT32, DT_INT64})) + .OUTPUT(sq_index, TensorType({DT_INT32, DT_INT64})) + .REQUIRED_ATTR(total_limit, Int) + .ATTR(group_size, Int, 64) + .ATTR(extreme_mode, Int, 0) + .OP_END_FACTORY_REG(ScanSQCodes) + /** * @brief Multiplies matrix "a" by matrix "b", producing "a * b". \n * @par Inputs: @@ -153,6 +196,103 @@ REG_OP(RotatedNMS) .OUTPUT(keep_indices, TensorType({DT_INT32, DT_INT64})) .REQUIRED_ATTR(iou_threshold, Float) .OP_END_FACTORY_REG(RotatedNMS) + +/** +* @brief Performs average pooling on the input. Used in the combination of conv + avgpoolupdate to replace avgpool +* @par Inputs: +* x1: Output of upstream Conv2d. A tensor of type float16, float32. +* x2: Input feature map of upstream Conv2d. A tensor of type int8, float16, float32. 
+ +* @par Attributes: +* @li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window, +* where N = C = 1, and H and W are positive integers within the range [1, 255]. +* @li strides: A required list of 4 ints, specifying the stride of the sliding window. +* The strides of the N and C dimensions are 1. +* The strides of the H and W dimensions are positive integers within the range [1, 63]. +* @li padding_mode: A required string, specifying the padding algorithm, +* either "VALID", "SAME" or "CALCULATED". +* "SAME" means that the outputs will have the same spatial dimensions as the inputs. +* "VALID" means no padding. +* @li pads: Pad value when padding_mode is "CALCULATED". +* @li data_format: An optional string, specifying the data format of "ksize" and "strides", +* either "NCHW" or "NHWC" (default). +* @li ceil_mode: Use ceil or floor to calculate the output size when padding_mode is "CALCULATED". +* @li exclusive: Whether to ignore the padding area when calculating the average. + +* @par Outputs: +* y: The average pooled output tensor. Has the same type and format as input "x1". + +* @attention Constraints: +* @li Only single input and single output are supported. +* @li "ksize_H" and "ksize_W" are positive integers within the range [1, 255], and ksize_H * ksize_W < 256. +* @li Due to instruction restrictions, +* the values of "strides_h" and "strides_w" are positive integers within the range [1, 63]. +* @par Third-party framework compatibility +* Compatible with the TensorFlow/PyTorch/ONNX operator AvgPoolV2. +*/ +REG_OP(AvgPoolUpdate) + .INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(x2, TensorType({DA_INT4, DT_INT8, DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .REQUIRED_ATTR(ksize, ListInt) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(padding_mode, String, "CALCULATED") + .ATTR(pads, ListInt, {0, 0, 0, 0}) + .ATTR(data_format, String, "NHWC") + .ATTR(ceil_mode, Bool, false) + .ATTR(exclusive, Bool, true) + .OP_END_FACTORY_REG(AvgPoolUpdate) + +/** +* @brief Batches input data by time. +* @par Inputs: +* x: A list of input tensors. It's a dynamic input. + +* @par Attributes: +* @li window: time window, in the range [-1, int64_max]. If -1, batches by the input data flag; +* otherwise, batches by the input timestamp and data flag. +* @li batch_dim: in the range [-1, input_shape_range). If -1, input shape [x, ..., x] gives output shape [-1, x, ..., x]; +* otherwise the output shape is [x, ..., -1(batch_dim), ..., x]. +* @li drop_remainder: a bool flag that takes effect when window > -1; +* if true, data whose batch window is smaller than "window" is dropped. + +* @par Outputs: +* y: A list of output tensors. It's a dynamic output, the same size as "x". + +* @attention Constraints: +* @li Only supported in helper udf. +*/ +REG_OP(TimeBatch) + .DYNAMIC_INPUT(x, TensorType::RealNumberType()) + .DYNAMIC_OUTPUT(y, TensorType::RealNumberType()) + .REQUIRED_ATTR(window, Int) + .ATTR(batch_dim, Int, -1) + .ATTR(drop_remainder, Bool, false) + .OP_END_FACTORY_REG(TimeBatch) + +/** +* @brief Auto Batch process. \n + +* @par Inputs: +* @li x: A list of input tensor objects. It's a dynamic input. \n + +* @par Outputs: +* @li y: A list of output tensor objects. It's a dynamic output. \n + +* @par Attributes: +* @li batch_size: auto batch size. +* @li timeout: auto batch wait timeout (unit: ms). +* @li padding: whether to pad when the batch is insufficient. +* @li slide_stride: sliding window step.
+*/ +REG_OP(AutoBatch) + .DYNAMIC_INPUT(x, TensorType::RealNumberType()) + .DYNAMIC_OUTPUT(y, TensorType::RealNumberType()) + .REQUIRED_ATTR(batch_size, Int) + .ATTR(timeout, Int, 0) + .ATTR(padding, Bool, false) + .ATTR(slide_stride, Int, 0) + .OP_END_FACTORY_REG(AutoBatch) } // namespace ge #endif // OPS_BUILT_IN_OP_PROTO_INC_EXPERIMENT_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h index 3db3cb84..0446aff8 100644 --- a/third_party/fwkacllib/inc/ops/image_ops.h +++ b/third_party/fwkacllib/inc/ops/image_ops.h @@ -2039,24 +2039,24 @@ REG_OP(IMGWarpOffsets) .OP_END_FACTORY_REG(IMGWarpOffsets) /** -*@brief This operation samples 3d input x by using interpolation based on flow field grid, - which is usually gennerated by affine_grid. +* @brief This operation samples 3d input x by using interpolation based on flow field grid, + which is usually gennerated by affine_grid. -*@par Inputs: -*@li x: 5-D Tensor with shape `[batch, channels, depth, height, width]`. -*@li grid: flow field grid, 5-D Tensor with shape `[batch, depth, height, width, 2]`. +* @par Inputs: +* @li x: 5-D Tensor with shape `[batch, channels, depth, height, width]`. +* @li grid: flow field grid, 5-D Tensor with shape `[batch, depth, height, width, 2]`. -*@par Attributes: -*@li interpolation_mode: An optional string specifying the interpolation method. -*@li padding_mode: An optional string specifying the pad method. -*@li align_corners: An optional bool. If "true", the centers of the corner - pixels of the input and output tensors are aligned. Defaults to "false" . +* @par Attributes: +* @li interpolation_mode: An optional string specifying the interpolation method. +* @li padding_mode: An optional string specifying the pad method. +* @li align_corners: An optional bool. If "true", the centers of the corner + pixels of the input and output tensors are aligned. Defaults to "false" . -*@par Outputs: -*y: Returns 5-D Tensor with the same dtype as `x`. +* @par Outputs: +* y: Returns 5-D Tensor with the same dtype as `x`. -*@par Third-party framework compatibility -*Compatible with pytorch GridSampler3D operator. +* @par Third-party framework compatibility +* Compatible with pytorch GridSampler3D operator. */ REG_OP(GridSampler3D) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) diff --git a/third_party/fwkacllib/inc/ops/map_ops.h b/third_party/fwkacllib/inc/ops/map_ops.h index 6ac15bf6..91dd665b 100644 --- a/third_party/fwkacllib/inc/ops/map_ops.h +++ b/third_party/fwkacllib/inc/ops/map_ops.h @@ -88,7 +88,7 @@ REG_OP(TensorMapInsert) * @par Inputs: * @li input_handle: The input map. Must be type: DT_VARIANT. -* @li key: A Tensor,the key to be looked up. Must be one of +* @li key: A Tensor, the key to be looked up. Must be one of the following types: int32,int64,string . \n * @par Attributes: diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index 3d162d3a..f4c5d2b4 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -674,7 +674,7 @@ REG_OP(Conj) *@par Inputs: *The input x and weight must have the same type. Inputs include: *@li x: A Tensor dtype of float32. -*@li target: A Tensor dtype of int32. +*@li target: A Tensor dtype of int32 or int64. *@li weight: A Tensor dtype of float32 . 
\n *@par Attributes: @@ -690,7 +690,7 @@ REG_OP(Conj) */ REG_OP(NLLLoss) .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(target, TensorType({DT_INT32})) + .INPUT(target, TensorType({DT_INT32, DT_INT64})) .OPTIONAL_INPUT(weight, TensorType({DT_FLOAT})) .OUTPUT(y, TensorType({DT_FLOAT})) .OUTPUT(total_weight, TensorType({DT_FLOAT})) @@ -704,7 +704,7 @@ REG_OP(NLLLoss) *@par Inputs: *@li x:A Tensor dtype of float32. *@li y_grad:A Tensor dtype of float32. -*@li target:A Tensor dtype of int32. +*@li target:A Tensor dtype of int32, int64. *@li weight:A Tensor dtype of float32. *@li total_weight:A Tensor dtype of float32 . \n @@ -721,7 +721,7 @@ REG_OP(NLLLoss) REG_OP(NLLLossGrad) .INPUT(x, TensorType({DT_FLOAT})) .INPUT(y_grad, TensorType({DT_FLOAT})) - .INPUT(target, TensorType({DT_INT32})) + .INPUT(target, TensorType({DT_INT32, DT_INT64})) .INPUT(weight, TensorType({DT_FLOAT})) .INPUT(total_weight, TensorType({DT_FLOAT})) .OUTPUT(x_grad, TensorType({DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h index 272d4021..57724273 100644 --- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h @@ -210,10 +210,10 @@ REG_OP(SwinTransformerLnQKV) * float32, int32. Has format [ND, NHWC]. \n * @par Attributes: -* @li transpose_x1: A bool. If True, changes the shape of "x1" from [M, K] to -* [K, M]. -* @li transpose_x2: A bool. If True, changes the shape of "x2" from [M, K] to -* [K, M]. \n +* @li transpose_x1: A bool. If True, changes the shape of "x1" from [K, M] to +* [M, K] before multiplication. +* @li transpose_x2: A bool. If True, changes the shape of "x2" from [N, K] to +* [K, N] before multiplication. \n * @par Outputs: * y: The result matrix Tensor. 2D. Must be one of the following types: float16, @@ -246,9 +246,9 @@ REG_OP(MatMul) * @par Attributes: * @li transpose_x1: A bool. If True, changes the shape of "x1" from [K, M] to -* [M, K]. +* [M, K] before multiplication. * @li transpose_x2: A bool. If True, changes the shape of "x2" from [N, K] to -* [K, N]. +* [K, N] before multiplication. * @li offset_x: An optional integer for quantized MatMulV2. * The negative offset added to the input x1 for int8 type. Ensure offset_x * within the effective range of int8 [-128, 127]. Defaults to "0". \n @@ -289,9 +289,9 @@ REG_OP(MatMulV2) * @par Attributes: * @li transpose_x1: A bool. If True, changes the shape of "x1" from [K, M] to -* [M, K]. +* [M, K] before multiplication. * @li transpose_x2: A bool. If True, changes the shape of "x2" from [N, K] to -* [K, N]. +* [K, N] before multiplication. * @li offset_x: An optional integer for quantized MatMulV2Compress. * The negative offset added to the input x1 for int8 type. Ensure offset_x * within the effective range of int8 [-128, 127]. Defaults to "0". \n @@ -333,29 +333,20 @@ REG_OP(MatMulV2Compress) * @li c: A matrix Tensor. Must be one of the following types:float32, float16, * int8, int32. Has format ND. * @li alpha: A 1D Tensor. The shape of alpha is [1].Must be one of the -* following types: float16, int32, float32, int8. Has format ND. +* following types: float32, float16, int8, int32. Has format ND. * @li beta: A 1D Tensor. The shape of beta is [1]. Must be one of the following -* types: float16, int32, float32, int8. 
Has format ND.\n -* The format of a, b, c has restriction:\n -* When type of a is int8 and type of c is int32, the format of a, b, c should -* all be ND.\n -* When type of a is int8 and type of c is float32, the format of a, b, c -* should all be ND.\n -* When type of a is float16 and type of c is float16, the format of a, b, c -* should all be ND.\n -* When type of a is float16 and type of c is float32, the format of a, b, c -* should all be ND. \n +* types: float32, float16, int8, int32. Has format ND. \n * @par Attributes: * Two attributes, including: * @li transpose_a: Optional. A bool. If True, changes the shape of "a" from -* [M, K] to [K, M]. +* [K, M] to [M, K] before multiplication. * @li transpose_b: Optional. A bool. If True, changes the shape of "b" from -* [K, N] to [N, K]. \n +* [N, K] to [K, N] before multiplication. \n * @par Outputs: -* y: The result matrix Tensor. Must be one of the following types: float16, -* float32, int32, int8. Has format [ND], the format should be equal to a. +* y: The result matrix Tensor. Must be one of the following types: float32, +* float16, int8, int32. Has format [ND], the format should be equal to a. */ REG_OP(GEMM) @@ -379,10 +370,10 @@ REG_OP(GEMM) * float32, int32. 2D or higher. Has format [ND, NHWC]. \n * @par Attributes: -* @li adj_x1: A bool. If True, changes the shape of "x1" from [B, M, K] -* to [B, K, M]. -* @li adj_x2: A bool. If True, changes the shape of "x2" from [B, M, K] -* to [B, K, M]. \n +* @li adj_x1: A bool. If True, changes the shape of "x1" from [B, K, M] +* to [B, M, K] before multiplication. +* @li adj_x2: A bool. If True, changes the shape of "x2" from [B, N, K] +* to [B, K, N] before multiplication. \n * @par Outputs: * y: The result matrix Tensor. 2D or higher. Must be one of the following @@ -418,10 +409,10 @@ REG_OP(BatchMatMul) * int8, int4. Has format [ND, NHWC]. \n * @par Attributes: -* @li adj_x1: A bool. If True, changes the shape of "x1" from [B, M, K] to -* [B, K, M]. -* @li adj_x2: A bool. If True, changes the shape of "x2" from [B, M, K] to -* [B, K, M]. \n +* @li adj_x1: A bool. If True, changes the shape of "x1" from [B, K, M] to +* [B, M, K] before multiplication. +* @li adj_x2: A bool. If True, changes the shape of "x2" from [B, N, K] to +* [B, K, N] before multiplication. \n * @par Outputs: * y: The result matrix Tensor. 2D or higher. Must be one of the following @@ -784,7 +775,8 @@ REG_OP(TensorScatterUpdate) * @par Attributes: * @li axis: An optional attribute. Defaults to 0. -* @li reduction: An optional attribute. Defaults to string "none" and can be "add" or "mul". +* @li reduction: An optional attribute. Defaults to string "none" and can be +* "add" or "mul". \n * @par Outputs: * y: A Tensor. Has the same type and format as input "data" . \n @@ -1147,7 +1139,7 @@ REG_OP(DiagPart) * with a set of learned weights, and (optionally) adds biases. \n * @par Inputs: * Four inputs, including: -* @li x: A Tensor of type float16, int8, int4, float32. +* @li x: A Tensor of type float16, int8, int4. * @li w: A weight matrix of type float16, int8, int4, float32. * @li b: An optional Tensor of type float16, int32, float32. * @li offset_w: An optional Tensor of type int8, int4. 
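For the transpose semantics documented above, a shape check may help: with transpose_x1 = false and transpose_x2 = true, "x1" of shape [M, K] and "x2" of shape [N, K] produce "y" of shape [M, N], i.e. y = x1 * x2^T. The minimal graph-construction sketch below is illustrative only and not part of this patch; it assumes the generated ge::op::MatMul wrapper with set_input_/set_attr_ accessors, the sample include path "all_ops.h", and a hypothetical helper name BuildMatMulGraph.

    #include <vector>
    #include "graph/graph.h"
    #include "all_ops.h"  // generated operator wrappers; actual path may differ per package

    ge::Graph BuildMatMulGraph() {
      auto x1 = ge::op::Data("x1");  // expected shape [M, K]
      auto x2 = ge::op::Data("x2");  // expected shape [N, K]
      auto mm = ge::op::MatMul("matmul")
                    .set_input_x1(x1)
                    .set_input_x2(x2)
                    .set_attr_transpose_x1(false)
                    .set_attr_transpose_x2(true);  // x2 is read as [K, N], so y = x1 * x2^T -> [M, N]
      ge::Graph graph("matmul_graph");
      std::vector<ge::Operator> inputs{x1, x2};
      std::vector<ge::Operator> outputs{mm};
      graph.SetInputs(inputs).SetOutputs(outputs);
      return graph;
    }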
@@ -1175,11 +1167,11 @@ REG_OP(DiagPart) * Yes */ REG_OP(FullyConnection) - .INPUT(x, TensorType({DT_FLOAT16, DT_INT8, DT_INT4, DT_FLOAT32, DT_BF16})) - .INPUT(w, TensorType({DT_FLOAT16, DT_INT8, DT_INT4, DT_FLOAT32, DT_BF16})) - .OPTIONAL_INPUT(b, TensorType({DT_FLOAT16, DT_INT32,DT_FLOAT32, DT_BF16})) + .INPUT(x, TensorType({DT_FLOAT16, DT_INT8, DT_INT4, DT_FLOAT, DT_BF16})) + .INPUT(w, TensorType({DT_FLOAT16, DT_INT8, DT_INT4, DT_FLOAT, DT_BF16})) + .OPTIONAL_INPUT(b, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT, DT_BF16})) .OPTIONAL_INPUT(offset_w, TensorType({DT_INT8, DT_INT4})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32,DT_FLOAT32, DT_BF16})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT, DT_BF16})) .REQUIRED_ATTR(num_output, Int) .ATTR(transpose, Bool, false) .ATTR(axis, Int, 1) @@ -1643,7 +1635,7 @@ REG_OP(Tril) * @par Inputs: * @li x: A list of Tensors. Must be one of the following types: int32, * float16, float32. Tensors to be concatenated. All must have size 1 in -* the first dimension and same shape.It's a dynamic input. \n +* the first dimension and same shape. It's a dynamic input. \n * @par Attributes: * @li equation: The subscripts for the Einstein summation. \n @@ -1658,7 +1650,7 @@ REG_OP(Tril) * Input N must be Int. \n * @par Third-party framework compatibility -* Compatible with Pytorch einsum operator. +* Compatible with Tensorflow 2.x einsum operator. */ REG_OP(Einsum) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h index 96213764..2bb8c2ec 100644 --- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h @@ -99,8 +99,8 @@ Specifies the variance used for inference. Must be "None" if the operation is used for training . \n *@par Attributes: -*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. -Defaults to "0.0001". +*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. +* Defaults to "0.0001". *@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC". *@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h index c6244a81..aca8e217 100644 --- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h @@ -31,7 +31,7 @@ namespace ge { * Three inputs include: * @li input: 4D origin shape of input tensor [N, C, H, W] or [N, H, W, C], * support float16. -* @li filter_size: A 4D tensor of type int32, int64, with shape [H, W, C, K] +* @li filter_size: A 4D tensor of type int32. * @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C]. * Must be one of the following types: float16. \n @@ -52,9 +52,9 @@ namespace ge { * @par Outputs: * filter_grad: Gradient of the deep convolution relative to the filter with -* shape [H, W, C, K]. Must be one of the following types: float16. \n +* shape [H, W, C, K]. Must be one of the following types: float32. 
\n -* @attention Constraints:\n +* @attention Constraints: * The feature map is 4D with shape [N, C, Hi, Wi] or [N, Hi, Wi, C], but * the data is 5D with shape [N, C1, Hi, Wi, C0], where C0 is 16.\n * The filter is 4D with shape [Hf, Wf, C, K], but the data is 6D with shape @@ -90,7 +90,7 @@ REG_OP(DepthwiseConv2DBackpropFilter) * Two inputs include: \n * @li input: 4D tensor with shape [N, C, H, W] or [N, H, W, C], of type float16 * @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C], -* of type float16 +* of type float16. * @par Attributes: * @li filter_size: A required list or tuple. Shape of filter. @@ -133,8 +133,8 @@ REG_OP(DepthwiseConv2DBackpropFilter) * instead. */ REG_OP(DepthwiseConv2DBackpropFilterD) - .INPUT(input, TensorType({DT_FLOAT16, DT_FLOAT32, DT_BF16})) - .INPUT(out_backprop, TensorType({DT_FLOAT16, DT_FLOAT32, DT_BF16})) + .INPUT(input, TensorType({DT_FLOAT16, DT_FLOAT, DT_BF16})) + .INPUT(out_backprop, TensorType({DT_FLOAT16, DT_FLOAT, DT_BF16})) .OUTPUT(filter_grad, TensorType({DT_FLOAT32})) .REQUIRED_ATTR(filter_size, ListInt) .REQUIRED_ATTR(strides, ListInt) @@ -147,9 +147,9 @@ REG_OP(DepthwiseConv2DBackpropFilterD) * @brief Computes the gradients of depthwise convolution with respect to the * input. \n * @par Inputs: -* Three inputs include: \n +* Three inputs include: * @li input_size: 4D shape of input tensor [N, C, H, W] or [N, H, W, C], -* support int32, int64. +* support int32. * @li filter: 4D filter tensor with shape of [H, W, C, K], support float16. * @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C]. * Must be one of the following types: float16 . \n @@ -172,7 +172,7 @@ REG_OP(DepthwiseConv2DBackpropFilterD) * @par Outputs: * input_grad: Gradient of the deep convolution relative to the input with shape * [N, C, H, W] or [N, H, W, C] Must be one of the following types: -* float16, float32. \n +* float16. \n * @attention Constraints:\n * The feature map is 4D with shape [N, C, Hi, Wi] or [N, Hi, Wi, C], but @@ -184,7 +184,7 @@ REG_OP(DepthwiseConv2DBackpropFilterD) * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n * Limited by Tiling: max_h_in_l1 >= C0, where max_h_in_l1 = (l1_size - Hf * -* Wf * C0 * C0 * 2) / (2 * Wo *C0).\n +* Wf * C0 * C0 * 2) / (2 * Wo *C0). \n * @par Third-party framework compatibility * @li Compatible with the TensorFlow operator DepthwiseConv2DBackpropInput. @@ -194,7 +194,7 @@ REG_OP(DepthwiseConv2DBackpropInput) .INPUT(input_size, TensorType({DT_INT32, DT_INT64})) .INPUT(filter, TensorType({DT_FLOAT16})) .INPUT(out_backprop, TensorType({DT_FLOAT16})) - .OUTPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT32})) + .OUTPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT})) .REQUIRED_ATTR(strides, ListInt) .ATTR(dilations, ListInt, {1, 1, 1, 1}) .REQUIRED_ATTR(pads, ListInt) @@ -355,9 +355,8 @@ REG_OP(BiasAddGrad) * data tensor. An integer vector representing the shape of input, where * input is a 4-D tensor [batch, height, width, channels] * or [batch, channels, height, width]. - * @li filter: A Tensor. Must be one of the following types: float16, float32, - * float64. 4-D with shape - * [filter_height, filter_width, in_channels, out_channels] + * @li filter: A Tensor. Must be one of the following types: float16. + * 4-D with shape [filter_height, filter_width, in_channels, out_channels] * or [out_channels, filter_height, filter_width, in_channels] * or [out_channels, in_channel, filter_height, filter_width]. * @li out_backprop: A Tensor. 
Must have the same type as filter. @@ -372,14 +371,9 @@ REG_OP(BiasAddGrad) | Tensor | out_bckprop | filter | y |\n |-----------|-------------|---------|--------|\n | Data Type | float16 | float16 | float16|\n - | | float32 | float32 | float32|\n - | | float64 | float64 | float64|\n | Format | NCHW | NCHW | NCHW |\n | | NHWC | HWCN | NHWC |\n *\n - * For float32 and float64 type, the actual calculation on the chip is based - * on float16. - *\n * *@par Attributes: * Five attributes: @@ -400,13 +394,13 @@ REG_OP(BiasAddGrad) *\n | Name | Field | Scope |\n |------------------|----------|--------------|\n - | input_size | H | [1, 200000] |\n + | input_size | H | [1, 4096] |\n | | W | [1, 4096] |\n | Filter | H | [1, 255] |\n | | W | [1, 255] |\n - | out_backprop | H*strideH| [1, 200000] |\n + | out_backprop | H*strideH| [1, 4096] |\n | | W*strideW| [1, 4096] |\n - | y(fmap) | H | [1, 200000] |\n + | y(fmap) | H | [1, 4096] |\n | | W | [1, 4096] |\n | Stride | H | [1, 63] |\n | | W | [1, 63] |\n @@ -455,7 +449,7 @@ REG_OP(Conv2DBackpropInput) *@brief Computes the gradients of convolution with respect to the input. * @par Inputs: * Two inputs: - * @li filter: A Tensor. Types is float16. + * @li filter: A Tensor. Types is float16 or int8. * 4-D with shape [filter_height, filter_width, in_channels, out_channels] * or [out_channels, filter_height, filter_width, in_channels] * or [out_channels, in_channel, filter_height, filter_width]. @@ -479,8 +473,8 @@ REG_OP(Conv2DBackpropInput) * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to * "NHWC". Specify the data format of the input and output data. *@par Outputs: - * y: A Tensor. Has the same type as filter,4-D tensor [batch, height, width, - * channels] or [batch, channels, height, width]. + * y: A Tensor. with the type of: float16, float32, int32, 4-D tensor + * [batch, height, width, channels] or [batch, channels, height, width]. * @par Third-party framework compatibility * Compatible with Tensorflow's conv2d_backprop_input *@par Restrictions: @@ -547,11 +541,11 @@ REG_OP(Conv2DBackpropInputD) *\n | Name | Field | Scope |\n |------------------|----------|--------------|\n - | x (out_backprop) | H*strideH| [1, 200000] |\n + | x (out_backprop) | H*strideH| [1, 4096] |\n | | W*strideW| [1, 4096] |\n | Filter | H | [1, 255] |\n | | W | [1, 255] |\n - | y (fmap) | H | [1, 200000] |\n + | y (fmap) | H | [1, 4096] |\n | | W | [1, 4096] |\n | Stride | H | [1, 63] |\n | | W | [1, 63] |\n @@ -602,8 +596,8 @@ REG_OP(Deconvolution) *@brief Computes the gradients of convolution with respect to the filter *@par Inputs: * Three inputs: - * @li x: A Tensor. Must be one of the following types: float16, float32, - * float64. 4-D with shape [batch, in_height, in_width, in_channels] or + * @li x: A Tensor. Must be one of the following types: float16. + * 4-D with shape [batch, in_height, in_width, in_channels] or * [batch, in_channels, in_height, in_width]. * @li filter_size: A const Tensor of type int32. Currently does not support * data tensor. 
An integer vector representing the tensor shape of filter, @@ -621,9 +615,7 @@ REG_OP(Deconvolution) *\n | Tensor | x | out_backprop | y |\n |-----------|---------|--------------|---------|\n - | Data Type | float16 | float16 | float16 |\n - | | float32 | float32 | float32 |\n - | | float64 | float64 | float64 |\n + | Data Type | float16 | float16 | float32 |\n | Format | NCHW | NCHW | NCHW |\n | | NHWC | NHWC | HWCN |\n *\n @@ -650,13 +642,13 @@ REG_OP(Deconvolution) *\n | Name | Field | Scope |\n |------------------|----------|--------------|\n - | x(fmap) | H | [1, 200000] |\n + | x(fmap) | H | [1, 4096] |\n | | W | [1, 4096] |\n | Filter Size | H | [1, 255] |\n | | W | [1, 255] |\n - | out_backprop | H | [1, 200000] |\n + | out_backprop | H | [1, 4096] |\n | | W | [1, 4096] |\n - | y | H | [1, 200000] |\n + | y | H | [1, 4096] |\n | | W | [1, 4096] |\n | Stride | H | [1, 63] |\n | | W | [1, 63] |\n @@ -1015,13 +1007,12 @@ REG_OP(DeformableConv2D) /** *@brief Computes a 3D convolution given 5D "x" and "filter" tensors. *@par Inputs: - * @li x: A 5D tensor. Must be one of the following types: float16, - * (Currently does not support int8). The format of x is NCDHW or NDHWC. + * @li x: A 5D tensor. Must be one of the following types: float16, int8. + * The format of x is NCDHW or NDHWC. * @li filter: A 5D tensor of the same type as "x". - * (Currently does not support int8). * The format is NCDHW, NDHWC or DHWCN. * @li bias: Optional. An 1D tensor of the same type as "x". - * @li offset_w: Optional. An 1D tensor for quantized deconvolution. Reserved. \n + * @li offset_w: Optional. An 1D tensor for quantized deconvolution. \n *@par Attributes: * @li strides: Required. A list of 5 integers. Specifies the stride of the @@ -1041,7 +1032,8 @@ REG_OP(DeformableConv2D) * Defaults to 0. Reserved. \n *@par Outputs: - * y: A Tensor. Has the same type and data format as "x". \n + * y: A Tensor. Has the same data format as "x". if the type of "x" is int8, + * the type of y is int32. \n *@attention Constraints: * The image size after padding is greater than the filter size. \n @@ -1051,11 +1043,11 @@ REG_OP(DeformableConv2D) * @li Compatible with the Caffe operator Convolution. */ REG_OP(Conv3D) - .INPUT(x, TensorType({DT_FLOAT16})) - .INPUT(filter, TensorType({DT_FLOAT16})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT32})) + .INPUT(x, TensorType({DT_FLOAT16, DT_INT8})) + .INPUT(filter, TensorType({DT_FLOAT16, DT_INT8})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT32, DT_INT32})) .OPTIONAL_INPUT(offset_w, TensorType({DT_INT8})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32, DT_INT32})) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(pads, ListInt) .ATTR(dilations, ListInt, {1, 1, 1, 1, 1}) @@ -1068,12 +1060,11 @@ REG_OP(Conv3D) /** *@brief Computes the gradients of convolution 3d with respect to the input. *@par Inputs: - * @li input_size: A Tensor of type int32, int64. An integer vector + * @li input_size: A Tensor of type int32. An integer vector * representing the shape of input, where input is a 5-D tensor * [batch, depth, height, width, channels] or * [batch, channels, depth, height, width]. - * @li filter: A Tensor. Must be one of the following types: float16, float32. - * Currently does not support double. + * @li filter: A Tensor. Must be one of the following types: float16. * @li out_backprop: A Tensor. Must have the same type as filter. 
* 5-D with shape [batch, depth, out_height, out_width, out_channels] * or [batch, out_channels, depth, out_height, out_width]. Gradients with @@ -1095,8 +1086,7 @@ REG_OP(Conv3D) * Defaults to "NDHWC". Specify the data format of the input and output data. \n *@par Outputs: - * y: A Tensor. Has the same type as filter,and has same format as - * "input_size". \n + * y: A Tensor. Has same format as "input_size". \n *@par Third-party framework compatibility * Compatible with Tensorflow's conv3d_backprop_input @@ -1207,8 +1197,7 @@ REG_OP(LSTM) /** *@brief Computes the gradients of convolution3D with respect to the filter *@par Inputs: - * @li x: A Tensor. Must be one of the following types: float16, float32, - * double. Currently does not support double. + * @li x: A Tensor. Must be one of the following types: float16. * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] * or [batch, in_channels, in_depth, in_height, in_width]. * @li filter_size: A Tensor of type int32. An integer vector representing the @@ -1236,7 +1225,7 @@ REG_OP(LSTM) * Defaults to "NDHWC". Specify the data format of the input and output data. \n *@par Outputs: - * y: A Tensor that has the same type as "x" and the format is NDHWC, NCDHW + * y: A Tensor that has the type float32 and the format is NDHWC, NCDHW * or DHWCN. \n *@par Third-party framework compatibility @@ -1310,7 +1299,7 @@ REG_OP(Conv3DBackpropFilterD) *@brief Computes the transpose of convolution 3d with respect to the input. *@par Inputs: - * @li input_size: A Tensor of type int32, int64. An integer vector + * @li input_size: A Tensor of type int32. An integer vector * representing the shape of input. * @li x: A Tensor of type float16, currently does not support int8. The format * is NDHWC or NCDHW. @@ -1336,7 +1325,7 @@ REG_OP(Conv3DBackpropFilterD) * @li offset_x: Optional. Input offset_x value. Reserved. \n *@par Outputs: - * y: A Tensor. Has the same type and format as "x". + * y: A Tensor. Has the same format as "x", has the type float16, float32. */ REG_OP(Conv3DTranspose) .INPUT(input_size, TensorType({DT_INT32, DT_INT64})) @@ -1362,7 +1351,7 @@ REG_OP(Conv3DTranspose) * The format is NDHWC or NCDHW. * @li filter: A Tensor of type float16, currently does not support int8. * The format is NDHWC, NCDHW or DHWCN. - * @li bias: Optional. An 1D tensor of the same type as "x". Reserved. + * @li bias: Optional. An 1D tensor of the same type as "x". * @li offset_w: Optional. An 1D tensor for quantized deconvolution. Reserved. \n *@par Attributes: @@ -1383,7 +1372,7 @@ REG_OP(Conv3DTranspose) * @li offset_x: Optional. Input offset_x value. Reserved. \n *@par Outputs: - * y: A Tensor. Has the same type and format as "x". \n + * y: A Tensor. Has the same format as "x", has the type float16, float32. \n *@par Restrictions: * Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DTranspose instead. 
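The constraint tables above relate the fmap, filter, stride and out_backprop fields after the H upper bound was tightened to 4096. As an illustration only (not part of these headers), the following minimal C sketch assumes the standard convolution output-size formula and hypothetical shape values to show how those fields are connected and how the documented ranges can be checked.

#include <stdio.h>

/* Standard convolution output size: given a fmap dimension, kernel, stride,
 * padding and dilation, this is the out_backprop dimension the tables refer to. */
static int conv_out_dim(int in, int kernel, int stride, int pad_head, int pad_tail, int dilation)
{
    int effective_kernel = dilation * (kernel - 1) + 1;
    return (in + pad_head + pad_tail - effective_kernel) / stride + 1;
}

int main(void)
{
    /* Hypothetical example: fmap H = 224, 3x3 filter, stride 2, pad 1/1, dilation 1. */
    int fmap_h = 224, filter_h = 3, stride_h = 2, pad_top = 1, pad_bottom = 1, dilation_h = 1;
    int out_backprop_h = conv_out_dim(fmap_h, filter_h, stride_h, pad_top, pad_bottom, dilation_h);

    printf("out_backprop H = %d\n", out_backprop_h);                      /* 112 */
    printf("fmap H in [1, 4096]: %d\n", fmap_h >= 1 && fmap_h <= 4096);
    printf("filter H in [1, 255]: %d\n", filter_h >= 1 && filter_h <= 255);
    printf("stride H in [1, 63]: %d\n", stride_h >= 1 && stride_h <= 63);
    printf("out_backprop H * strideH in [1, 4096]: %d\n",
           out_backprop_h * stride_h >= 1 && out_backprop_h * stride_h <= 4096);
    return 0;
}

With these example values the forward pass maps fmap H = 224 to out_backprop H = 112, so both the fmap range [1, 4096] and the H*strideH bound in the tables hold.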
@@ -1428,7 +1417,7 @@ REG_OP(Conv3DTransposeD) | Tensor | x | filter | bias | y |\n |-----------|---------|---------|---------|--------|\n | Data Type | float16 | float16 | float16 | float16|\n - | | int8 | int8 | int32 | int32 |\n + | | float16 | float16 | float32 | float32|\n | Format | NCHW | NCHW | ND | NCHW |\n | | NHWC | HWCN | | NHWC |\n *\n @@ -1461,13 +1450,13 @@ REG_OP(Conv3DTransposeD) *\n | Name | Field | Scope |\n |------------------|----------|--------------|\n - | input_size | H | [1, 200000] |\n + | input_size | H | [1, 4096] |\n | | W | [1, 4096] |\n - | x (out_backprop) | H*strideH| [1, 200000] |\n + | x (out_backprop) | H*strideH| [1, 4096] |\n | | W*strideW| [1, 4096] |\n | filter | H | [1, 255] |\n | | W | [1, 255] |\n - | y (fmap) | H | [1, 200000] |\n + | y (fmap) | H | [1, 4096] |\n | | W | [1, 4096] |\n | Stride | H | [1, 63] |\n | | W | [1, 63] |\n @@ -1503,9 +1492,9 @@ REG_OP(Conv2DTranspose) .INPUT(input_size, TensorType({DT_INT32, DT_INT64})) .INPUT(x, TensorType({DT_FLOAT16, DT_INT8})) .INPUT(filter, TensorType({DT_FLOAT16, DT_INT8})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT32})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT})) .OPTIONAL_INPUT(offset_w, TensorType({DT_INT8})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT32})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT})) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(pads, ListInt) .ATTR(dilations, ListInt, {1, 1, 1, 1}) @@ -1522,7 +1511,7 @@ REG_OP(Conv2DTranspose) * @li x: A Tensor of type float16, int8. * @li filter: A Tensor of type float16, int8. Must have the same type as "x". * @li bias: An optional 1D tensor of the same type as "x". - * @li offset_w: An optional 1D tensor for quantized inference. Type is int8. Reserved. + * @li offset_w: An optional 1D tensor for quantized inference. Type is int8. *@par Required Attributes: * @li input_size: A Tensor of type int32 or int64. An integer vector representing the * shape of input. @@ -1550,9 +1539,9 @@ REG_OP(Conv2DTranspose) REG_OP(Conv2DTransposeD) .INPUT(x, TensorType({DT_FLOAT16, DT_INT8})) .INPUT(filter, TensorType({DT_FLOAT16, DT_INT8})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT32})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT})) .OPTIONAL_INPUT(offset_w, TensorType({DT_INT8})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT32})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT})) .REQUIRED_ATTR(input_size, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(pads, ListInt) diff --git a/third_party/fwkacllib/inc/ops/nn_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_norm_ops.h index 65411e2a..8db82ec9 100644 --- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h @@ -193,6 +193,39 @@ REG_OP(SigmoidCrossEntropyWithLogitsV2) .ATTR(reduction, String, "mean") .OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogitsV2) +/** +* @brief Computes the sigmoid focal loss of "pred" and "target". + +* @par Inputs: +* Three inputs, including: +* @li pred: A 2-dimensional Tensor of type float16 or float32, specifying the predicted value. +* @li target: A 1-dimensional Tensor of type int32, specifying the target value. +* @li weight: A 1-dimensional Tensor, specifying the weight value. \n + +* @par Attributes: +* @li gamma: An optional float, specifying the exponent of the modulating factor (1 - pt) +* to balance easy/hard examples. Defaults to 2.0. 
+* @li alpha: An optional float, specifying the weighting factor in range (0, 1) to balance +* the importance of positive/negative examples or less than 0 for ignore. Defaults to 0.25. +* @li reduction: An optional string from "none", "mean", and "sum", specifying the +* reduction type to be applied to the output. Defaults to "mean". \n + +* @par Outputs: +* loss: Sigmoid focal loss between the predicted value and target value. Has the same dimensions as "pred". \n + +* @par Third-party framework compatibility +* Compatible with mmcv operator SigmoidFocalLoss. +*/ +REG_OP(SigmoidFocalLoss) + .INPUT(pred, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(target, TensorType({DT_INT32})) + .OPTIONAL_INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(loss, TensorType({DT_FLOAT16,DT_FLOAT})) + .ATTR(gamma, Float, 2.0) + .ATTR(alpha, Float, 0.25) + .ATTR(reduction, String, "mean") + .OP_END_FACTORY_REG(SigmoidFocalLoss) + /** * @brief Computes the regression box of the RPN. It is a FasterRCNN operator . @@ -1834,5 +1867,79 @@ REG_OP(AxpyWithSoftmaxAndDropOutDoMask) .REQUIRED_ATTR(input_keep_prob, Float) .ATTR(axis, ListInt, {-1}) .OP_END_FACTORY_REG(AxpyWithSoftmaxAndDropOutDoMask) + +/** +* @brief MMCV Function: sigmoid_focal_loss_grad. \n + +* @par Inputs: +* Three inputs and one optional input, including: +* @li pred: the predicted tensor. The supported types are float16 and float32. +* @li target: the target label Tensor. The supported type is int32. +* @li dout: the gradient from the downstream op, which has the same shape as pred. The supported types are float16 and float32. +* @li weight: An optional input Tensor, defaults to None, which helps to calculate the loss by supplying sample weights: +* shape of pred should be (B,D), B means batch size, D means the number of labels. +* shape of target should be (D, ). +* shape of weight should be (D, ) \n + +* @par Attributes: +* @li alpha: An attribute used to reweight the sample. The type is float. \n +* @li gamma: An attribute used to calculate the power of the probability. +* The type is float. \n +* @li reduction: the type of the reduce method. Defaults to 'mean', which means computing the average loss. + 'sum' means computing the sum of the loss, 'none' means no reduction.\n + +* @par Outputs: +* grad: A mutable Tensor. Has the same type and shape as "pred". \n + +* @par Third-party framework compatibility +* Compatible with the MMCV operator SigmoidFocalLossGrad. +*/ +REG_OP(SigmoidFocalLossGrad) + .INPUT(pred, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(target, TensorType({DT_INT32})) + .INPUT(dout, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(alpha, Float, 0.25) + .ATTR(gamma, Float, 2.0) + .ATTR(reduction, String, "mean") + .OP_END_FACTORY_REG(SigmoidFocalLossGrad) + +/** +* @brief MMCV Function: softmax_focal_loss_grad. \n + +* @par Inputs: +* Three inputs and one optional input, including: +* @li pred: the predicted tensor. The supported types are float16 and float32. +* @li target: the target label Tensor. The supported type is int32. +* @li dout: the gradient from the downstream op, which has the same shape as pred. The supported types are float16 and float32. +* @li weight: An optional input Tensor, defaults to None, which helps to calculate the loss by supplying sample weights: +* shape of pred should be (B,D), B means batch size, D means the number of labels. +* shape of target should be (B, D).
+* shape of weight should be (D, ) \n + +* @par Attributes: +* @li alpha: An attribute used to reweight the sample. The type is float. \n +* @li gamma: An attribute used to calculate the power of the probability. +* The type is float. \n +* @li reduction: the type of the reduce method. Defaults to 'mean', which means computing the average loss. + 'sum' means computing the sum of the loss, 'none' means no reduction.\n + +* @par Outputs: +* grad: A mutable Tensor. Has the same type and shape as "pred". \n + +* @par Third-party framework compatibility +* Compatible with the MMCV operator SoftmaxFocalLossGrad. +*/ +REG_OP(SoftmaxFocalLossGrad) + .INPUT(pred, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(target, TensorType({DT_INT32})) + .INPUT(dout, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(alpha, Float, 0.25) + .ATTR(gamma, Float, 2.0) + .ATTR(reduction, String, "mean") + .OP_END_FACTORY_REG(SoftmaxFocalLossGrad) } // namespace ge #endif // OPS_BUILT_IN_OP_PROTO_INC_NN_NORM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h index 16ec4357..dec8688a 100644 --- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h @@ -174,8 +174,8 @@ REG_OP(AvgPoolV2) /** * @brief Performs average pooling on the input. \n * @par Inputs: -* x: A 5-D Tensor of shape [batch, depth, height, width, channels] and type -* float16, float32, double. \n +* @li x: A 5-D Tensor of shape [batch, depth, height, width, channels] and +* type float16. \n * @par Attributes: * @li ksize: List of ints that has length 1, 3 or 5. The size of the window @@ -201,8 +201,8 @@ REG_OP(AvgPoolV2) * Compatible with the TensorFlow operator AvgPool3D. */ REG_OP(AvgPool3D) - .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .REQUIRED_ATTR(ksize, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(pads, ListInt) @@ -216,9 +216,9 @@ REG_OP(AvgPool3D) /** * @brief Performs average pooling on the input. * @par Inputs: -* @li x: A 5-D Tensor of shape [batch, depth, height, width, channels] and type float16, float32, double. -* @li filter: An optional tensor of type float16, float32, double, fractal_z_3d layout. -* @li multiplier: An optional tensor of float16, float32, double. +* @li x: A 5-D Tensor of shape [batch, depth, height, width, channels] and type float16. +* @li filter: An optional tensor of type float16, fractal_z_3d layout. +* @li multiplier: An optional tensor of float16. * @par Attributes: * @li ksize: List of ints that has length 1, 3 or 5. The size of the window for each dimension of the input tensor. @@ -239,10 +239,10 @@ REG_OP(AvgPool3D) * Compatible with the TensorFlow operator AvgPool3D.
*/ REG_OP(AvgPool3DD) - .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) - .OPTIONAL_INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) - .OPTIONAL_INPUT(multiplier, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OPTIONAL_INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OPTIONAL_INPUT(multiplier, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .REQUIRED_ATTR(ksize, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(pads, ListInt) @@ -256,7 +256,7 @@ REG_OP(AvgPool3DD) * @brief Computes AvgPool3DGrad function. \n * @par Inputs: * @li orig_input_shape: An NDHWC tensor of type int32. -* @li grads: An NDHWC tensor of type float16, float32, or double. \n +* @li grads: An NDHWC tensor of type float16. \n * @par Attributes: * @li ksize: List of ints that has length 5. The size of the window for @@ -284,8 +284,8 @@ REG_OP(AvgPool3DD) REG_OP(AvgPool3DGrad) .INPUT(orig_input_shape, TensorType({DT_INT32})) - .INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) - .OUTPUT(output, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) + .INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(output, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .REQUIRED_ATTR(ksize, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(pads, ListInt) @@ -299,7 +299,7 @@ REG_OP(AvgPool3DGrad) * @brief Performs average pooling on the input. * @par Inputs: * @li grads: An NDHWC tensor of type float16. -* @li filter: An optional tensor of type float16, fractal_z_3d layout. +* @li filter: An optional tensor of type float16. * @li multiplier: An optional tensor of float16. * @par Attributes: @@ -867,8 +867,8 @@ REG_OP(MaxPoolGradGradWithArgmax) /** * @brief Computes avgpoograd function. \n * @par Inputs: -* @li orig_input_shape: An NHWC tensor of type int32. -* @li input_grad: An NHWC tensor of type float16, float32, or double. \n +* @li orig_input_shape: A tensor of type int32. +* @li input_grad: A tensor of type float16. \n * @par Attributes: * @li ksize: A required tuple or list, specifying the size of the window for @@ -887,8 +887,8 @@ REG_OP(MaxPoolGradGradWithArgmax) */ REG_OP(AvgPoolGrad) .INPUT(orig_input_shape, TensorType({DT_INT32})) - .INPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) - .OUTPUT(out_grad, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) + .INPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(out_grad, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .REQUIRED_ATTR(ksize, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(padding, String) @@ -898,9 +898,9 @@ REG_OP(AvgPoolGrad) /** * @brief Computes gradients of average pooling function . \n * @par Inputs: -* @input_grad: An NHWC tensor of type float16. -* @mean_matrix: Assist matrix, an NHWC tensor of type float16. -* @kernel_matrix: Assist matrix, an NHWC tensor of type float16. +* @li input_grad: An NHWC tensor of type float16. +* @li mean_matrix: Assist matrix, an NHWC tensor of type float16. +* @li kernel_matrix: Assist matrix, an NHWC tensor of type float16. * @par Attributes: * @li orig_input_shape: A required Original input dimensions. @@ -913,7 +913,7 @@ REG_OP(AvgPoolGrad) * @li data_format: An optional string. Defaults to "NHWC" . \n * @par Outputs: -* @out_grad: A mutable tensor with the same shape and type as "orig_input".
+* @li out_grad: A mutable tensor with the same shape and type as "orig_input". * * @par Restrictions: * Warning: THIS FUNCTION IS DEPRECATED. Please use AvgPoolGrad instead. diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h index 079982db..6fde6d46 100644 --- a/third_party/fwkacllib/inc/ops/reduce_ops.h +++ b/third_party/fwkacllib/inc/ops/reduce_ops.h @@ -277,7 +277,7 @@ REG_OP(BN3DTrainingUpdate) *y: A tensor of type float16 or float32 for the normalized "x" . \n *@attention Constraints: -*For Ascend 310, the result accuracy fails to reach 1/1000 due to the +* For Ascend 310, the result accuracy fails to reach 1/1000 due to the * square root instruction. */ REG_OP(BNInfer) @@ -314,8 +314,8 @@ REG_OP(BNInfer) *@li batch_variance: A tensor of type float32, for the variance of "x" . \n *@attention Constraints: -*This operator is used in conjunction with BNTrainingReduce. -*For Ascend 310, the result accuracy fails to reach 1/1000 due to +*@li This operator is used in conjunction with BNTrainingReduce. +*@li For Ascend 310, the result accuracy fails to reach 1/1000 due to * the square root instruction. */ REG_OP(BNTrainingUpdateV2) diff --git a/third_party/fwkacllib/inc/runtime/config.h b/third_party/fwkacllib/inc/runtime/config.h index 2081ac97..18a50d2e 100644 --- a/third_party/fwkacllib/inc/runtime/config.h +++ b/third_party/fwkacllib/inc/runtime/config.h @@ -83,7 +83,8 @@ typedef enum tagRtPlatformType { PLATFORM_CLOUD_V2_910B2 = 11, PLATFORM_CLOUD_V2_910B3 = 12, PLATFORM_CLOUD_V2_910B4 = 13, - PLATFORM_END = 14, + PLATFORM_MDC_PG2 = 14, + PLATFORM_END = 15, } rtPlatformType_t; typedef enum tagRtCubeFracMKNFp16 { diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h index b1157c2b..d5fb6fc6 100644 --- a/third_party/fwkacllib/inc/runtime/dev.h +++ b/third_party/fwkacllib/inc/runtime/dev.h @@ -55,10 +55,10 @@ typedef enum tagRtFeatureType { } rtFeatureType_t; typedef enum tagRtDeviceFeatureType { - FEATURE_TYPE_SCHE, - FEATURE_TYPE_BLOCKING_OPERATOR, - FEATURE_TYPE_FFTS_MODE, - FEATURE_TYPE_END, + FEATURE_TYPE_SCHE, + FEATURE_TYPE_BLOCKING_OPERATOR, + FEATURE_TYPE_FFTS_MODE, + FEATURE_TYPE_END, } rtDeviceFeatureType_t; typedef enum tagMemcpyInfo { diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h index 1c33e1b3..3610ad3c 100644 --- a/third_party/fwkacllib/inc/runtime/mem.h +++ b/third_party/fwkacllib/inc/runtime/mem.h @@ -392,7 +392,23 @@ RTS_API rtError_t rtMemcpyHostTask(void * const dst, const uint64_t destMax, con RTS_API rtError_t rtMemcpyAsync(void *dst, uint64_t destMax, const void *src, uint64_t cnt, rtMemcpyKind_t kind, rtStream_t stm); -typedef struct rtMemcpyAddrInfo { +/** + * @ingroup dvrt_mem + * @brief asynchronized memcpy + * @param [in] dst destination address pointer + * @param [in] destMax max length of destination address memory + * @param [in] src source address pointer + * @param [in] count the number of bytes to copy + * @param [in] kind memcpy type + * @param [in] stream asynchronized task stream + * @param [in] qosCfg asynchronized task qosCfg + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ +RTS_API rtError_t rtMemcpyAsyncWithCfg(void *dst, uint64_t destMax, const void *src, uint64_t count, + rtMemcpyKind_t kind, rtStream_t stream, uint32_t qosCfg); + +typedef struct { uint32_t resv0; uint32_t resv1; uint32_t resv2; @@ -420,6 +436,23 @@ RTS_API rtError_t rtMemcpyAsyncPtr(void *memcpyAddrInfo,
uint64_t destMax, uint6 RTS_API rtError_t rtReduceAsync(void *dst, uint64_t destMax, const void *src, uint64_t cnt, rtRecudeKind_t kind, rtDataType_t type, rtStream_t stm); +/** + * @ingroup dvrt_mem + * @brief asynchronized reduce memcpy + * @param [in] dst destination address pointer + * @param [in] destMax max length of destination address memory + * @param [in] src source address pointer + * @param [in] cnt the number of bytes to copy + * @param [in] kind memcpy type + * @param [in] type data type + * @param [in] stm asynchronized task stream + * @param [in] qosCfg asynchronized task qosCfg + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ +RTS_API rtError_t rtReduceAsyncWithCfg(void *dst, uint64_t destMax, const void *src, uint64_t cnt, rtRecudeKind_t kind, + rtDataType_t type, rtStream_t stm, uint32_t qosCfg); + /** * @ingroup dvrt_mem * @brief asynchronized reduce memcpy diff --git a/third_party/fwkacllib/inc/runtime/rt_mem_queue.h b/third_party/fwkacllib/inc/runtime/rt_mem_queue.h index 2ed9fd08..0997cfe3 100644 --- a/third_party/fwkacllib/inc/runtime/rt_mem_queue.h +++ b/third_party/fwkacllib/inc/runtime/rt_mem_queue.h @@ -372,6 +372,17 @@ RTS_API rtError_t rtMbufInit(rtMemBuffCfg_t *cfg); */ RTS_API rtError_t rtMbufAlloc(rtMbufPtr_t *memBuf, uint64_t size); +/** +* @ingroup rt_mem_queue +* @brief alloc buff +* @param [out] memBuf: buff addr alloced +* @param [in] size: The amount of memory space requested +* @param [in] flag: Huge page flag(bit0~31: mem type, bit32~bit35: devid, bit36~63: resv) +* @param [in] grpId: group id +* @return RT_ERROR_NONE for ok +*/ +RTS_API rtError_t rtMbufAllocEx(rtMbufPtr_t *memBuf, uint64_t size, uint64_t flag, int32_t grpId); + /** * @ingroup rt_mem_queue * @brief free buff @@ -417,6 +428,15 @@ RTS_API rtError_t rtMbufGetBuffSize(rtMbufPtr_t memBuf, uint64_t *totalSize); */ RTS_API rtError_t rtMbufGetPrivInfo(rtMbufPtr_t memBuf, void **priv, uint64_t *size); +/** +* @ingroup rt_mem_queue +* @brief copy buf ref +* @param [in] memBuf: src buff addr +* @param [out] newMemBuf: dest buff addr +* @return RT_ERROR_NONE for ok +*/ +RTS_API rtError_t rtMbufCopyBufRef(rtMbufPtr_t memBuf, rtMbufPtr_t *newMemBuf); + // mem group typedef struct { uint64_t maxMemSize; // max buf size in grp, in KB.
= 0 means no limit @@ -430,30 +450,43 @@ typedef struct { uint32_t rsv : 28; } rtMemGrpShareAttr_t; -#define RT_MEM_GRP_QUERY_GROUPS_OF_PROCESS 1 // query process all grp +#define RT_MEM_GRP_QUERY_GROUPS_OF_PROCESS 1 // query process all grp +#define RT_MEM_GRP_QUERY_GROUP_ID 2 // query group id from name + +#define RT_MEM_GRP_NAME_LEN 32 // it must be same as driver define BUFF_GRP_NAME_LEN typedef struct { int32_t pid; } rtMemGrpQueryByProc_t; // cmd: GRP_QUERY_GROUPS_OF_PROCESS +typedef struct { + char grpName[RT_MEM_GRP_NAME_LEN]; +} rtMemGrpQueryGroupId_t; // cmd: RT_MEM_GRP_QUERY_GROUP_ID + typedef struct { int32_t cmd; union { rtMemGrpQueryByProc_t grpQueryByProc; // cmd: GRP_QUERY_GROUPS_OF_PROCESS + rtMemGrpQueryGroupId_t grpQueryGroupId; // cmd: RT_MEM_GRP_QUERY_GROUP_ID }; } rtMemGrpQueryInput_t; -#define RT_MEM_GRP_NAME_LEN 32 // it must be same as driver define BUFF_GRP_NAME_LEN - typedef struct { char_t groupName[RT_MEM_GRP_NAME_LEN]; // group name rtMemGrpShareAttr_t attr; // process in group attribute } rtMemGrpOfProc_t; // cmd: GRP_QUERY_GROUPS_OF_PROCESS typedef struct { - rtMemGrpOfProc_t *groupsOfProc; // cmd: GRP_QUERY_GROUPS_OF_PROCESS + int32_t groupId; // group id +} rtMemGrpQueryGroupIdInfo_t; // cmd: RT_MEM_GRP_QUERY_GROUP_ID + +typedef struct { size_t maxNum; // max number of result size_t resultNum; // if the number of results exceeds 'maxNum', only 'maxNum' results are filled in buffer + union { + rtMemGrpOfProc_t *groupsOfProc; // cmd: GRP_QUERY_GROUPS_OF_PROCESS + rtMemGrpQueryGroupIdInfo_t *groupIdInfo; // cmd: RT_MEM_GRP_QUERY_GROUP_ID + }; } rtMemGrpQueryOutput_t; /** diff --git a/third_party/fwkacllib/inc/runtime/rt_stars.h b/third_party/fwkacllib/inc/runtime/rt_stars.h index b778550f..3fc71e8d 100644 --- a/third_party/fwkacllib/inc/runtime/rt_stars.h +++ b/third_party/fwkacllib/inc/runtime/rt_stars.h @@ -99,8 +99,66 @@ RTS_API rtError_t rtCmoTaskLaunch(rtCmoTaskInfo_t *taskInfo, rtStream_t stm, uin * @return RT_ERROR_NONE for ok, others failed */ RTS_API rtError_t rtBarrierTaskLaunch(rtBarrierTaskInfo_t *taskInfo, rtStream_t stm, uint32_t flag); -#if defined(__cplusplus) +/** + * @ingroup rt_stars + * @brief dvpp group handle. + */ +typedef void *rtDvppGrp_t; + +typedef struct tagDvppGrpRptInfo { + uint32_t deviceId; + uint32_t streamId; + uint32_t taskId; + uint8_t sqeType; + uint8_t cqeErrorCode; + uint8_t reserve[2]; + uint32_t accErrorCode; +} rtDvppGrpRptInfo_t; + +typedef void (*rtDvppGrpCallback)(rtDvppGrpRptInfo_t *rptInfo); + +/** + * @ingroup rt_stars + * @brief create dvpp group. + * @param [in] flags group flag, reserved parameter + * @param [out] grp group handle + * @return RT_ERROR_NONE for ok, others failed + */ +RTS_API rtError_t rtDvppGroupCreate(rtDvppGrp_t *grp, uint32_t flags); + +/** + * @ingroup rt_stars + * @brief destroy dvpp group. 
+ * @param [in] grp group handle + * @return RT_ERROR_NONE for ok, others failed + */ +RTS_API rtError_t rtDvppGroupDestory(rtDvppGrp_t grp); + +/** + * @ingroup rt_stars + * @brief create stream with grp handle + * @param [in|out] stm created stream + * @param [in] priority stream priority + * @param [in] flags stream op flags + * @param [in] grp grp handle + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + * @return RT_ERROR_NONE for ok, others failed + */ +RTS_API rtError_t rtStreamCreateByGrp(rtStream_t *stm, int32_t priority, uint32_t flags, rtDvppGrp_t grp); + +/** + * @ingroup rt_stars + * @brief wait report by grp + * @param [in] grp group handle + * @param [in] callBackFunc callback + * @param [in] timeout wait timeout config, ms, -1: wait forever + * @return RT_ERROR_NONE for ok, others failed + */ +RTS_API rtError_t rtDvppWaitGroupReport(rtDvppGrp_t grp, rtDvppGrpCallback callBackFunc, int32_t timeout); + +#if defined(__cplusplus) } #endif -#endif // CCE_RUNTIME_RT_STARS_H \ No newline at end of file +#endif // CCE_RUNTIME_RT_STARS_H diff --git a/third_party/fwkacllib/inc/runtime/stream.h b/third_party/fwkacllib/inc/runtime/stream.h index a6abc8fa..a4151ca0 100644 --- a/third_party/fwkacllib/inc/runtime/stream.h +++ b/third_party/fwkacllib/inc/runtime/stream.h @@ -101,6 +101,16 @@ RTS_API rtError_t rtStreamWaitEventWithTimeout(rtStream_t stm, rtEvent_t evt, ui */ RTS_API rtError_t rtStreamSynchronize(rtStream_t stm); +/** + * @ingroup dvrt_stream + * @brief wait stream to be complete and set timeout + * @param [in] stm stream to wait + * @param [in] timeout timeout value,the unit is milliseconds + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ +RTS_API rtError_t rtStreamSynchronizeWithTimeout(rtStream_t stm, int32_t timeout); + /** * @ingroup dvrt_stream * @brief queries an asynchronous stream for completion status @@ -202,7 +212,7 @@ RTS_API rtError_t rtStreamSwitchN(void *ptr, uint32_t size, void *valuePtr, rtSt * @return RT_ERROR_INVALID_VALUE for error input */ RTS_API rtError_t rtDebugRegisterForStream(rtStream_t stm, uint32_t flag, const void *addr, - uint32_t *streamId, uint32_t *taskId); + uint32_t *streamId, uint32_t *taskId); /* * @ingroup rt_model diff --git a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h index 2353d967..1f09000b 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h +++ b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h @@ -18,6 +18,8 @@ #define PROF_TRAINING_TRACE 0x00000040ULL #define PROF_MSPROFTX 0x00000080ULL #define PROF_RUNTIME_API 0x00000100ULL +#define PROF_TASK_FRAMEWORK 0x00000200ULL +#define PROF_TASK_TSFW 0x00000400ULL // system profilinig switch #define PROF_CPU 0x00010000ULL @@ -52,6 +54,8 @@ constexpr uint64_t PROF_AICPU_MODEL = 0x4000000000000000ULL; #define PROF_TRAINING_TRACE_MASK 0x00000040ULL #define PROF_MSPROFTX_MASK 0x00000080ULL #define PROF_RUNTIME_API_MASK 0x00000100ULL +#define PROF_TASK_FRAMEWORK_MASK 0x00000200ULL +#define PROF_TASK_TSFW_MASK 0x00000400ULL // system profilinig mask #define PROF_CPU_MASK 0x00010000ULL @@ -102,7 +106,7 @@ extern "C" { MSVP_PROF_API uint64_t ProfGetOpExecutionTime(const void *data, uint32_t len, uint32_t index); -typedef int32_t Status; +using Status = int32_t; typedef struct aclprofSubscribeConfig aclprofSubscribeConfig1; /// /// @ingroup AscendCL diff --git a/third_party/fwkacllib/inc/toolchain/prof_common.h 
b/third_party/fwkacllib/inc/toolchain/prof_common.h index 37702c9b..eb284272 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_common.h +++ b/third_party/fwkacllib/inc/toolchain/prof_common.h @@ -159,8 +159,14 @@ enum MsprofGeTaskType { MSPROF_GE_TASK_TYPE_AI_CPU, MSPROF_GE_TASK_TYPE_AIV, MSPROF_GE_TASK_TYPE_WRITE_BACK, + MSPROF_GE_TASK_TYPE_MIX_AIC, + MSPROF_GE_TASK_TYPE_MIX_AIV, + MSPROF_GE_TASK_TYPE_FFTS_PLUS, + MSPROF_GE_TASK_TYPE_DSA, + MSPROF_GE_TASK_TYPE_DVPP, MSPROF_GE_TASK_TYPE_INVALID }; + enum MsprofGeShapeType { MSPROF_GE_SHAPE_TYPE_STATIC = 0, MSPROF_GE_SHAPE_TYPE_DYNAMIC,